From 1a3275a4c0feeb905bb6e6f8a82a3e874a061f71 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Wed, 3 Sep 2025 16:26:57 +0100 Subject: [PATCH 1/7] Figuring out delivery... --- crates/illumos-sys-hdrs/src/kernel.rs | 2 + crates/opte-api/src/lib.rs | 2 +- crates/opte-api/src/mac.rs | 5 + lib/opte/src/ddi/mblk.rs | 60 +++++++ xde/src/dev_map.rs | 57 ++++++- xde/src/xde.rs | 223 ++++++++++++++++---------- 6 files changed, 259 insertions(+), 90 deletions(-) diff --git a/crates/illumos-sys-hdrs/src/kernel.rs b/crates/illumos-sys-hdrs/src/kernel.rs index 9ac0c26b..c0d854d4 100644 --- a/crates/illumos-sys-hdrs/src/kernel.rs +++ b/crates/illumos-sys-hdrs/src/kernel.rs @@ -500,6 +500,8 @@ unsafe extern "C" { pub fn freemsg(mp: *mut mblk_t); pub fn freemsgchain(mp: *mut mblk_t); + pub fn msgpullup(mp: *mut mblk_t, n_bytes: isize) -> *mut mblk_t; + pub fn gethrtime() -> hrtime_t; pub fn getmajor(dev: dev_t) -> major_t; diff --git a/crates/opte-api/src/lib.rs b/crates/opte-api/src/lib.rs index 7176e7a5..558a6e41 100644 --- a/crates/opte-api/src/lib.rs +++ b/crates/opte-api/src/lib.rs @@ -51,7 +51,7 @@ pub use ulp::*; /// /// We rely on CI and the check-api-version.sh script to verify that /// this number is incremented anytime the oxide-api code changes. -pub const API_VERSION: u64 = 37; +pub const API_VERSION: u64 = 38; /// Major version of the OPTE package. pub const MAJOR_VERSION: u64 = 0; diff --git a/crates/opte-api/src/mac.rs b/crates/opte-api/src/mac.rs index 1818a997..728774de 100644 --- a/crates/opte-api/src/mac.rs +++ b/crates/opte-api/src/mac.rs @@ -55,6 +55,11 @@ impl MacAddr { pub const fn from_const(bytes: [u8; 6]) -> Self { Self { inner: bytes } } + + /// Return whether this MAC address is broadcast/multicast. 
+ pub const fn is_broadcast(&self) -> bool { + (self.inner[0] & 0b0000_0001) != 0 + } } impl From for smoltcp::wire::EthernetAddress { diff --git a/lib/opte/src/ddi/mblk.rs b/lib/opte/src/ddi/mblk.rs index e6bce52f..9cd4cada 100644 --- a/lib/opte/src/ddi/mblk.rs +++ b/lib/opte/src/ddi/mblk.rs @@ -16,6 +16,7 @@ use core::cmp::Ordering; use core::marker::PhantomData; use core::mem::ManuallyDrop; use core::mem::MaybeUninit; +use core::num::NonZeroUsize; use core::ops::Deref; use core::ops::DerefMut; use core::ptr; @@ -300,6 +301,45 @@ impl MsgBlk { out } + /// Copy the first `n` bytes of this packet into a new `mblk_t`, + /// increasing the refcount of all remaining segments. + /// + /// On non-illumos platforms this will simple clone the underlying packet + /// with the desired segmentation. + pub fn pullup( + &self, + n: Option, + ) -> Result { + let totlen = self.byte_len(); + + if let Some(n) = n + && n.get() > totlen + { + return Err(PktPullupError::TooLong); + } + + cfg_if! { + if #[cfg(all(not(feature = "std"), not(test)))] { + let out = unsafe { + ddi::msgpullup( + self.0.as_ptr(), + n.map(|v| v.get() as isize).unwrap_or(-1), + ) + }; + + let mp = NonNull::new(out) + .ok_or(PktPullupError::AllocFailed)?; + + Ok(Self(mp)) + } else { + // We aren't (currently?) simulating refcount tracking in our + // userland mblk abstraction. + // Do the segmentation right, but otherwise it's fully cloned. + todo!() + } + } + } + /// Creates a new [`MsgBlk`] using a given set of packet headers. pub fn new_pkt(emit: impl Emit + EmitDoesNotRelyOnBufContents) -> Self { let mut pkt = Self::new(emit.packet_length()); @@ -1034,6 +1074,26 @@ impl core::fmt::Display for PktInfoError { } } +/// Reasons a [`MsgBlk`] could not be pulled up. +#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Hash)] +pub enum PktPullupError { + /// Requested pullup was longer than the underlying packet. + TooLong, + /// The OS was unable to allocate a [`MsgBlk`]. 
+ AllocFailed, +} + +impl core::error::Error for PktPullupError {} + +impl core::fmt::Display for PktPullupError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.write_str(match self { + Self::TooLong => "requested pullup is longer than packet", + Self::AllocFailed => "failed to allocate an mblk_t", + }) + } +} + /// Counts the number of segments in an `mblk_t` from `head`, linked /// via `b_cont`. unsafe fn count_mblk_chain(mut head: Option>) -> usize { diff --git a/xde/src/dev_map.rs b/xde/src/dev_map.rs index 5fbdf5c0..4cc00f50 100644 --- a/xde/src/dev_map.rs +++ b/xde/src/dev_map.rs @@ -7,9 +7,13 @@ use crate::postbox::Postbox; use crate::xde::XdeDev; use alloc::collections::btree_map::BTreeMap; +use alloc::collections::btree_map::Entry; +use alloc::collections::btree_set::BTreeSet; use alloc::string::String; use alloc::sync::Arc; +use opte::api::Ipv6Addr; use opte::api::MacAddr; +use opte::api::OpteError; use opte::api::Vni; use opte::ddi::sync::KRwLock; use opte::ddi::sync::KRwLockReadGuard; @@ -41,6 +45,7 @@ type Dev = Arc; pub struct DevMap { devs: BTreeMap, names: BTreeMap, + mcast_groups: BTreeMap>, } impl Default for DevMap { @@ -51,7 +56,11 @@ impl Default for DevMap { impl DevMap { pub const fn new() -> Self { - Self { devs: BTreeMap::new(), names: BTreeMap::new() } + Self { + devs: BTreeMap::new(), + names: BTreeMap::new(), + mcast_groups: BTreeMap::new(), + } } /// Insert an `XdeDev`. @@ -69,6 +78,52 @@ impl DevMap { self.devs.remove(&key) } + /// Allow a port to receive on a given multicast group. + /// + /// This takes the overlay (outer v6) multicast group address. + pub fn multicast_subscribe( + &mut self, + name: &str, + mcast_ip: Ipv6Addr, + ) -> Result<(), OpteError> { + let port = self + .get_by_name(name) + .ok_or_else(|| OpteError::PortNotFound(name.into()))?; + let key = get_key(port); + + // TODO: probably could store Arcs or Weaks here, but want to be safe for now. 
+ self.mcast_groups.entry(mcast_ip).or_default().insert(key); + + Ok(()) + } + + /// Rescind a port's ability to receive on a given multicast group. + pub fn multicast_unsubscribe( + &mut self, + name: &str, + mcast_ip: Ipv6Addr, + ) -> Result<(), OpteError> { + let port = self + .get_by_name(name) + .ok_or_else(|| OpteError::PortNotFound(name.into()))?; + let key = get_key(port); + + // TODO: Do we need handling for a special VNI from rack-external traffic? + if let Entry::Occupied(set) = self.mcast_groups.entry(mcast_ip) { + set.into_mut().remove(&key); + } + + Ok(()) + } + + /// Find the keys for all ports who want to receive a given multicast packet. + pub fn multicast_listeners( + &self, + mcast_ip: &Ipv6Addr, + ) -> Option> { + self.mcast_groups.get(mcast_ip).map(|v| v.iter()) + } + /// Return a reference to an `XdeDev` using its address. #[inline] #[must_use] diff --git a/xde/src/xde.rs b/xde/src/xde.rs index b753484a..5910e39f 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -160,6 +160,7 @@ use alloc::sync::Arc; use alloc::vec::Vec; use core::ffi::CStr; use core::num::NonZeroU32; +use core::num::NonZeroUsize; use core::ptr; use core::ptr::NonNull; use core::ptr::addr_of; @@ -185,6 +186,7 @@ use opte::api::DumpUftReq; use opte::api::DumpUftResp; use opte::api::ListLayersReq; use opte::api::ListLayersResp; +use opte::api::MacAddr; use opte::api::NoResp; use opte::api::OpteCmd; use opte::api::OpteCmdIoctl; @@ -1772,20 +1774,18 @@ fn guest_loopback_probe( fn guest_loopback( src_dev: &XdeDev, - entry_state: &DevMap, + dest_dev: &XdeDev, + port_key: VniMac, mut pkt: MsgBlk, - vni: Vni, postbox: &mut TxPostbox, ) { use Direction::*; let mblk_addr = pkt.mblk_addr(); - // Loopback now requires a reparse on loopback to account for UFT fastpath. - // When viona serves us larger packets, we needn't worry about allocing - // the encap on. - // We might be able to do better in the interim, but that costs us time. 
- + // Loopback requires a reparse to account for UFT fastpath. + // We might be able to do better, but the logistics in passing around + // the emitspec in lieu of 'full' metadata might be a little troublesome. let parsed_pkt = match Packet::parse_inbound(pkt.iter_mut(), VpcParser {}) { Ok(pkt) => pkt, Err(e) => { @@ -1810,76 +1810,56 @@ fn guest_loopback( let flow = parsed_pkt.flow(); - let ether_dst = parsed_pkt.meta().inner_eth.destination(); - let port_key = VniMac::new(vni, ether_dst); - let maybe_dest_dev = entry_state.get_by_key(port_key); - - match maybe_dest_dev { - Some(dest_dev) => { - guest_loopback_probe(mblk_addr, &flow, src_dev, dest_dev); - - // We have found a matching Port on this host; "loop back" - // the packet into the inbound processing path of the - // destination Port. - match dest_dev.port.process(In, parsed_pkt) { - Ok(ProcessResult::Modified(emit_spec)) => { - let mut pkt = emit_spec.apply(pkt); - if let Err(e) = pkt.fill_parse_info(&ulp_meoi, None) { - opte::engine::err!("failed to set offload info: {}", e); - } + guest_loopback_probe(mblk_addr, &flow, src_dev, dest_dev); - // Having advertised offloads to our guest, looped back - // packets are liable to have zero-checksums. Fill these - // if necessary. - let pkt = if pkt - .offload_flags() - .flags - .intersects(MblkOffloadFlags::HCK_TX_FLAGS) - { - // We have only asked for cksum emulation, so we - // will either have: - // * 0 pkts (checksum could not be emulated, - // packet dropped) - // * 1 pkt. 
- mac_hw_emul(pkt, MacEmul::HWCKSUM_EMUL) - .and_then(|mut v| v.pop_front()) - } else { - Some(pkt) - }; - - if let Some(pkt) = pkt { - postbox.post_local(port_key, pkt); - } - } - - Ok(ProcessResult::Drop { reason }) => { - opte::engine::dbg!("loopback rx drop: {:?}", reason); - } + match dest_dev.port.process(In, parsed_pkt) { + Ok(ProcessResult::Modified(emit_spec)) => { + let mut pkt = emit_spec.apply(pkt); + if let Err(e) = pkt.fill_parse_info(&ulp_meoi, None) { + opte::engine::err!("failed to set offload info: {}", e); + } - Ok(ProcessResult::Hairpin(_hppkt)) => { - // There should be no reason for an loopback - // inbound packet to generate a hairpin response - // from the destination port. - opte::engine::dbg!("unexpected loopback rx hairpin"); - } + // Having advertised offloads to our guest, looped back + // packets are liable to have zero-checksums. Fill these + // if necessary. + let pkt = if pkt + .offload_flags() + .flags + .intersects(MblkOffloadFlags::HCK_TX_FLAGS) + { + // We have only asked for cksum emulation, so we + // will either have: + // * 0 pkts (checksum could not be emulated, + // packet dropped) + // * 1 pkt. + mac_hw_emul(pkt, MacEmul::HWCKSUM_EMUL) + .and_then(|mut v| v.pop_front()) + } else { + Some(pkt) + }; - Err(e) => { - opte::engine::dbg!( - "loopback port process error: {} -> {} {:?}", - src_dev.port.name(), - dest_dev.port.name(), - e - ); - } + if let Some(pkt) = pkt { + postbox.post_local(port_key, pkt); } } - None => { + Ok(ProcessResult::Drop { reason }) => { + opte::engine::dbg!("loopback rx drop: {:?}", reason); + } + + Ok(ProcessResult::Hairpin(_hppkt)) => { + // There should be no reason for an loopback + // inbound packet to generate a hairpin response + // from the destination port. 
+ opte::engine::dbg!("unexpected loopback rx hairpin"); + } + + Err(e) => { opte::engine::dbg!( - "underlay dest is same as src but the Port was not found \ - vni = {}, mac = {}", - vni.as_u32(), - ether_dst + "loopback port process error: {} -> {} {:?}", + src_dev.port.name(), + dest_dev.port.name(), + e ); } } @@ -2039,24 +2019,34 @@ fn xde_mc_tx_one<'a>( // If the outer IPv6 destination is the same as the // source, then we need to loop the packet inbound to the // guest on this same host. - let (ip6_src, ip6_dst) = match emit_spec.outer_ip6_addrs() { - Some(v) => v, - None => { - // XXX add SDT probe - // XXX add stat - opte::engine::dbg!("no outer IPv6 header, dropping"); - return; - } + let Some((ip6_src, ip6_dst)) = emit_spec.outer_ip6_addrs() else { + // XXX add SDT probe + // XXX add stat + opte::engine::dbg!("no outer IPv6 header, dropping"); + return; }; - let vni = match emit_spec.outer_encap_vni() { - Some(vni) => vni, - None => { - // XXX add SDT probe - // XXX add stat - opte::engine::dbg!("no geneve header, dropping"); - return; - } + // EmitSpec applies pushes/pops, but modifications will have occurred + // by this point. Pull destination MAC to allow us to reuse code + // between unicast & multicast loopback. + // + // Ingot will have asserted that Ethernet came first, and that it was + // contiguous. 
+ let Some(ether_dst) = pkt + .get(..size_of::()) + .map(|v| MacAddr::from_const(v.try_into().unwrap())) + else { + // XXX add SDT probe + // XXX add stat + opte::engine::dbg!("couldn't re-read inner MAC, dropping"); + return; + }; + + let Some(vni) = emit_spec.outer_encap_vni() else { + // XXX add SDT probe + // XXX add stat + opte::engine::dbg!("no geneve header, dropping"); + return; }; let Some(tun_meoi) = emit_spec.encap_meoi() else { @@ -2074,7 +2064,21 @@ fn xde_mc_tx_one<'a>( if ip6_src == ip6_dst { let entry_state = entry_state.get_or_insert_with(|| src_dev.port_map.read()); - guest_loopback(src_dev, entry_state, out_pkt, vni, postbox); + + let key = VniMac::new(vni, ether_dst); + if let Some(dest_dev) = entry_state.get_by_key(key) { + // We have found a matching Port on this host; "loop back" + // the packet into the inbound processing path of the + // destination Port. + guest_loopback(src_dev, dest_dev, key, out_pkt, postbox); + } else { + opte::engine::dbg!( + "underlay dest is same as src but the Port was not found \ + vni = {}, mac = {}", + vni.as_u32(), + ether_dst + ); + } return; } @@ -2086,6 +2090,47 @@ fn xde_mc_tx_one<'a>( return; }; + // For a multicast outbound frame, we need to attempt to deliver + // to all relevant local ports *and* over whichever underlay ports are + // required. + if ip6_dst.is_multicast() { + // TODO: fill in the mcast forwarding flags using The Table. + let entry_state = + entry_state.get_or_insert_with(|| src_dev.port_map.read()); + if let Some(others) = entry_state.multicast_listeners(&ip6_dst) + { + let my_key = VniMac::new(vni, src_dev.port.mac_addr()); + for el in others { + if my_key == *el { + continue; + } + + // This is a more lightweight clone in illumos, and + // gives us an owned form of the headers but a ref + // counted clone of the packet body. + // + // If there are any body transforms internally, OPTE + // will fully clone out the contents if required. 
+ let Ok(my_pkt) = out_pkt.pullup(NonZeroUsize::new( + (encap_len as usize) + + (non_eth_payl_bytes as usize) + + Ethernet::MINIMUM_LENGTH, + )) else { + continue; + }; + match entry_state.get_by_key(*el) { + Some(dev) => guest_loopback( + src_dev, dev, *el, my_pkt, postbox, + ), + None => { + // TODO: log, error count, etc. + // Stale state caused this (probably) + } + } + } + } + } + // 'MSS boosting' is performed here -- we set a 9k (minus overheads) // MSS for compatible TCP traffic. This is a kind of 'pseudo-GRO', // sending larger frames internally rather than having the NIC/OS @@ -2454,7 +2499,7 @@ unsafe extern "C" fn xde_rx( head } -/// Processes an individual packet receiver on the underlay device `stream`. +/// Processes an individual packet received on the underlay device `stream`. /// /// This function returns any input `pkt` which is not of interest to XDE (e.g., /// the packet is not Geneve over v6, or no matching OPTE port could be found). @@ -2490,6 +2535,8 @@ fn xde_rx_one( let meta = parsed_pkt.meta(); let old_len = parsed_pkt.len(); + let is_mcast = meta.outer_eth.destination().is_broadcast(); + let ulp_meoi = match meta.ulp_meoi(old_len) { Ok(ulp_meoi) => ulp_meoi, Err(e) => { From 6494ebcfabf200c3ff9ce3aeeabd52c1c24440be Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Wed, 3 Sep 2025 17:34:36 +0100 Subject: [PATCH 2/7] Write a userland implementation of `msgpullup` --- lib/opte/src/ddi/mblk.rs | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/lib/opte/src/ddi/mblk.rs b/lib/opte/src/ddi/mblk.rs index 9cd4cada..6b89aff8 100644 --- a/lib/opte/src/ddi/mblk.rs +++ b/lib/opte/src/ddi/mblk.rs @@ -304,7 +304,7 @@ impl MsgBlk { /// Copy the first `n` bytes of this packet into a new `mblk_t`, /// increasing the refcount of all remaining segments. 
/// - /// On non-illumos platforms this will simple clone the underlying packet + /// On non-kernel platforms this will simple clone the underlying packet /// with the desired segmentation. pub fn pullup( &self, @@ -315,6 +315,8 @@ impl MsgBlk { if let Some(n) = n && n.get() > totlen { + // The DDI function will bail out if this is the case, but + // we'll be none the wiser to *what* the failure mode was. return Err(PktPullupError::TooLong); } @@ -332,10 +334,31 @@ impl MsgBlk { Ok(Self(mp)) } else { - // We aren't (currently?) simulating refcount tracking in our - // userland mblk abstraction. + // We aren't (currently?) simulating refcount tracking at all + // in our userland mblk abstraction. // Do the segmentation right, but otherwise it's fully cloned. - todo!() + let to_ensure = n.map(|v| v.get()).unwrap_or(totlen); + let mut top_mblk = MsgBlk::new(to_ensure); + let mut still_to_write = to_ensure; + + for chunk in self.iter() { + let mut left_in_chunk = chunk.len(); + let to_take = chunk.len().min(still_to_write); + + if still_to_write != 0 { + top_mblk.write_bytes_back(&chunk[..to_take]) + .expect("to_take should be <= remaining capacity"); + } + + still_to_write -= to_take; + left_in_chunk -= to_take; + + if left_in_chunk != 0 { + top_mblk.append(MsgBlk::copy(&chunk[to_take..])); + } + } + + Ok(top_mblk) } } } From 080bfe2168e49b8cf43b881ae135eb005a6bf6bb Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Thu, 4 Sep 2025 11:37:03 +0100 Subject: [PATCH 3/7] Delivery in Rx path (test) --- xde/src/xde.rs | 155 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 154 insertions(+), 1 deletion(-) diff --git a/xde/src/xde.rs b/xde/src/xde.rs index 5910e39f..5a714a1f 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -214,6 +214,7 @@ use opte::engine::geneve::Vni; use opte::engine::geneve::WalkOptions; use opte::engine::headers::IpAddr; use opte::engine::ip::v6::Ipv6Addr; +use opte::engine::ip::v6::Ipv6Ref; use opte::engine::packet::InnerFlowId; use 
opte::engine::packet::Packet; use opte::engine::packet::ParseError; @@ -2503,6 +2504,13 @@ unsafe extern "C" fn xde_rx( /// /// This function returns any input `pkt` which is not of interest to XDE (e.g., /// the packet is not Geneve over v6, or no matching OPTE port could be found). +/// +/// `xde_rx_one_direct` largely replicates this function due to lifetime issues +/// around parsing, so changes here may need to be made there too. We could do this +/// with a single function using an `enum` control parameter (e.g., +/// `DoMcastCheck(&DevMap)`, `DeliverDirect(&XdeDev, VniMac)`) but we'd be +/// really reliant on rustc interpreting these as static choices and inlining +/// accordingly. #[inline] fn xde_rx_one( stream: &DlsStream, @@ -2535,7 +2543,41 @@ fn xde_rx_one( let meta = parsed_pkt.meta(); let old_len = parsed_pkt.len(); - let is_mcast = meta.outer_eth.destination().is_broadcast(); + let ip6_dst = meta.outer_v6.destination(); + if ip6_dst.is_multicast() + && let Some(ports) = devs.multicast_listeners(&ip6_dst) + { + let pullup_len = ( + &meta.outer_eth, + &meta.outer_v6, + &meta.outer_udp, + &meta.outer_encap, + &meta.inner_eth, + &meta.inner_l3, + &meta.inner_ulp, + ) + .packet_length(); + drop(parsed_pkt); + + for el in ports { + // As explained in `xde_mc_tx_one`, this is cheaper than a full + // packet copy and should be safe to process even in the presence + // of body transforms. + let Ok(my_pkt) = pkt.pullup(NonZeroUsize::new(pullup_len)) else { + continue; + }; + match devs.get_by_key(*el) { + Some(dev) => { + xde_rx_one_direct(stream, dev, *el, my_pkt, postbox) + } + None => { + // TODO: log, error count, etc. + // Stale state caused this (probably) + } + } + } + return None; + } let ulp_meoi = match meta.ulp_meoi(old_len) { Ok(ulp_meoi) => ulp_meoi, @@ -2642,6 +2684,117 @@ fn xde_rx_one( None } +/// Processes an individual packet after multicast replication has taken place. +/// This primarily duplicates `xde_rx_one`. 
+/// +/// Lifetimes (arond Packet etc.) will make this difficult to simplify +/// the expression of both this and its original implementation. We could insert +/// the body using macros, but then we really lose a lot (line numbers on crash, +/// subpar rust-analyzer integration)... +#[inline] +fn xde_rx_one_direct( + stream: &DlsStream, + dev: &XdeDev, + port_key: VniMac, + mut pkt: MsgBlk, + postbox: &mut Postbox, +) { + // TODO: it would be great if we could tell Ingot 'here are all the + // layer lengths/types, please believe that they are correct'. And then + // to plumb that through `NetworkParser`. I can't say that I *like* + // doing this reparse here post-replication. + let parser = VpcParser {}; + let parsed_pkt = Packet::parse_inbound(pkt.iter_mut(), parser) + .expect("this is a reparse of a known-valid packet"); + + let meta = parsed_pkt.meta(); + let old_len = parsed_pkt.len(); + + let ulp_meoi = match meta.ulp_meoi(old_len) { + Ok(ulp_meoi) => ulp_meoi, + Err(e) => { + opte::engine::dbg!("{}", e); + return; + } + }; + + let non_payl_bytes = u32::from(ulp_meoi.meoi_l2hlen) + + u32::from(ulp_meoi.meoi_l3hlen) + + u32::from(ulp_meoi.meoi_l4hlen); + + // Large TCP frames include their MSS in-band, as recipients can require + // this to correctly process frames which have been given split into + // larger chunks. + // + // This will be set to a nonzero value when TSO has been asked of the + // source packet. + let is_tcp = matches!(meta.inner_ulp, ValidUlp::Tcp(_)); + let recovered_mss = if is_tcp { + let mut out = None; + for opt in WalkOptions::from_raw(&meta.outer_encap) { + let Ok(opt) = opt else { break }; + if let Some(ValidOxideOption::Mss(el)) = opt.option.known() { + out = NonZeroU32::new(el.mss()); + break; + } + } + out + } else { + None + }; + + // We are in passthrough mode, skip OPTE processing. 
+ if dev.passthrough { + drop(parsed_pkt); + postbox.post(port_key, pkt); + return; + } + + let port = &dev.port; + + let res = port.process(Direction::In, parsed_pkt); + + match res { + Ok(ProcessResult::Modified(emit_spec)) => { + let mut npkt = emit_spec.apply(pkt); + let len = npkt.byte_len(); + let pay_len = len + - usize::try_from(non_payl_bytes) + .expect("usize > 32b on x86_64"); + + // Due to possible pseudo-GRO, we need to inform mac/viona on how + // it can split up this packet, if the guest cannot receive it + // (e.g., no GRO/large frame support). + // HW_LSO will cause viona to treat this packet as though it were + // a locally delivered segment making use of LSO. + if let Some(mss) = recovered_mss + // This packet could be the last segment of a split frame at + // which point it could be smaller than the original MSS. + // Don't re-tag the MSS if so, as guests may be confused and + // MAC emulation will reject the packet if the guest does not + // support GRO. + && pay_len > usize::try_from(mss.get()).expect("usize > 32b on x86_64") + { + npkt.request_offload(MblkOffloadFlags::HW_LSO, mss.get()); + } + + if let Err(e) = npkt.fill_parse_info(&ulp_meoi, None) { + opte::engine::err!("failed to set offload info: {}", e); + } + + postbox.post(port_key, npkt); + } + Ok(ProcessResult::Hairpin(hppkt)) => { + stream.tx_drop_on_no_desc( + hppkt, + TxHint::NoneOrMixed, + MacTxFlags::empty(), + ); + } + _ => {} + } +} + #[unsafe(no_mangle)] fn add_router_entry_hdlr(env: &mut IoctlEnvelope) -> Result { let req: AddRouterEntryReq = env.copy_in_req()?; From 6e07cce4510c4b1266e3a84dde009f8af3a52375 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Thu, 4 Sep 2025 15:09:06 +0100 Subject: [PATCH 4/7] Add a table of reachable mcast entries in a VNI Also pushes on the requisite extensions for us to fill in --- crates/opte-api/src/ip.rs | 13 ++ lib/opte-test-utils/src/lib.rs | 5 +- lib/oxide-vpc/src/api.rs | 2 + lib/oxide-vpc/src/engine/overlay.rs | 270 
+++++++++++++++++++++------- xde/src/xde.rs | 8 +- 5 files changed, 226 insertions(+), 72 deletions(-) diff --git a/crates/opte-api/src/ip.rs b/crates/opte-api/src/ip.rs index 20fffaaa..4f8c8fc7 100644 --- a/crates/opte-api/src/ip.rs +++ b/crates/opte-api/src/ip.rs @@ -307,6 +307,15 @@ pub enum IpAddr { Ip6(Ipv6Addr), } +impl IpAddr { + pub const fn is_multicast(&self) -> bool { + match self { + IpAddr::Ip4(v4) => v4.is_multicast(), + IpAddr::Ip6(v6) => v6.is_multicast(), + } + } +} + impl From for IpAddr { fn from(ipv4: Ipv4Addr) -> Self { IpAddr::Ip4(ipv4) @@ -431,6 +440,10 @@ impl Ipv4Addr { // u32. u32::from_be_bytes(self.bytes()).to_be() } + + pub const fn is_multicast(&self) -> bool { + matches!(self.inner[0], 224..240) + } } impl From for Ipv4Addr { diff --git a/lib/opte-test-utils/src/lib.rs b/lib/opte-test-utils/src/lib.rs index a4f3cb7b..118892e8 100644 --- a/lib/opte-test-utils/src/lib.rs +++ b/lib/opte-test-utils/src/lib.rs @@ -84,6 +84,7 @@ pub use oxide_vpc::engine::gateway; pub use oxide_vpc::engine::geneve::OxideOptionType; pub use oxide_vpc::engine::nat; pub use oxide_vpc::engine::overlay; +pub use oxide_vpc::engine::overlay::PerVniMaps; pub use oxide_vpc::engine::overlay::TUNNEL_ENDPOINT_MAC; pub use oxide_vpc::engine::overlay::Virt2Boundary; pub use oxide_vpc::engine::overlay::Virt2Phys; @@ -253,7 +254,7 @@ fn oxide_net_builder( name: &str, cfg: &oxide_vpc::cfg::VpcCfg, vpc_map: Arc, - v2p: Arc, + vni_state: Arc, v2b: Arc, ) -> PortBuilder { #[allow(clippy::arc_with_non_send_sync)] @@ -272,7 +273,7 @@ fn oxide_net_builder( .expect("failed to setup gateway layer"); router::setup(&pb, cfg, one_limit).expect("failed to add router layer"); nat::setup(&mut pb, cfg, snat_limit).expect("failed to add nat layer"); - overlay::setup(&pb, cfg, v2p, v2b, one_limit) + overlay::setup(&pb, cfg, vni_state, v2b, one_limit) .expect("failed to add overlay layer"); pb } diff --git a/lib/oxide-vpc/src/api.rs b/lib/oxide-vpc/src/api.rs index b1e82e62..cba09c68 100644 
--- a/lib/oxide-vpc/src/api.rs +++ b/lib/oxide-vpc/src/api.rs @@ -529,6 +529,8 @@ pub struct VpcMapResp { pub vni: Vni, pub ip4: Vec<(Ipv4Addr, GuestPhysAddr)>, pub ip6: Vec<(Ipv6Addr, GuestPhysAddr)>, + pub mcast_ip4: Vec<(Ipv4Addr, Ipv6Addr)>, + pub mcast_ip6: Vec<(Ipv6Addr, Ipv6Addr)>, } #[derive(Debug, Deserialize, Serialize)] diff --git a/lib/oxide-vpc/src/engine/overlay.rs b/lib/oxide-vpc/src/engine/overlay.rs index 5149416a..b4df96b1 100644 --- a/lib/oxide-vpc/src/engine/overlay.rs +++ b/lib/oxide-vpc/src/engine/overlay.rs @@ -80,7 +80,7 @@ pub const OVERLAY_LAYER_NAME: &str = "overlay"; pub fn setup( pb: &PortBuilder, cfg: &VpcCfg, - v2p: Arc, + vni_state: Arc, v2b: Arc, ft_limit: core::num::NonZeroU32, ) -> core::result::Result<(), OpteError> { @@ -88,7 +88,7 @@ pub fn setup( let encap = Action::Static(Arc::new(EncapAction::new( cfg.phys_ip, cfg.vni, - v2p, + vni_state, v2b, ))); @@ -182,7 +182,7 @@ pub struct EncapAction { // sending data. phys_ip_src: Ipv6Addr, vni: Vni, - v2p: Arc, + vni_state: Arc, v2b: Arc, } @@ -190,10 +190,10 @@ impl EncapAction { pub fn new( phys_ip_src: Ipv6Addr, vni: Vni, - v2p: Arc, + vni_state: Arc, v2b: Arc, ) -> Self { - Self { phys_ip_src, vni, v2p, v2b } + Self { phys_ip_src, vni, vni_state, v2b } } } @@ -241,35 +241,68 @@ impl StaticAction for EncapAction { } }; - let (is_internal, phys_target) = match target { + let (is_internal, phys_target, is_mcast) = match target { RouterTargetInternal::InternetGateway(_) => { - match self.v2b.get(&flow_id.dst_ip()) { - Some(phys) => { - // Hash the packet onto a route target. This is a very - // rudimentary mechanism. Should level-up to an ECMP - // algorithm with well known statistical properties. - let hash = f_hash as usize; - let target = match phys.iter().nth(hash % phys.len()) { - Some(target) => target, - None => return Ok(AllowOrDeny::Deny), - }; - ( - false, + // TODO: Is landing mcast traffic in here right? 
My intuition says + // so atm, given that the address will be outside of the individual + // VPC subnets, and mcast send will apply outbound NAT (and we expect + // such frames could well leave the rack)! + // This may need a new RouterTargetInternal? And/or thought about the + // interaction w/ routers? + let dst_ip = flow_id.dst_ip(); + if dst_ip.is_multicast() { + match self.vni_state.m2p.get(&dst_ip) { + Some(phys) => ( + true, PhysNet { - ether: MacAddr::from(TUNNEL_ENDPOINT_MAC), - ip: target.ip, - vni: target.vni, + ether: phys.dest_mac(), + ip: phys.0, + vni: self.vni, }, - ) + true, + ), + + // Landing here implies we don't yet have an internal forwarding + // address for this multicast group, or this VNI does not have + // access to it. + None => return Ok(AllowOrDeny::Deny), + } + } else { + match self.v2b.get(&dst_ip) { + Some(phys) => { + // Hash the packet onto a route target. This is a very + // rudimentary mechanism. Should level-up to an ECMP + // algorithm with well known statistical properties. 
+ let hash = f_hash as usize; + let target = + match phys.iter().nth(hash % phys.len()) { + Some(target) => target, + None => return Ok(AllowOrDeny::Deny), + }; + ( + false, + PhysNet { + ether: MacAddr::from(TUNNEL_ENDPOINT_MAC), + ip: target.ip, + vni: target.vni, + }, + false, + ) + } + None => return Ok(AllowOrDeny::Deny), } - None => return Ok(AllowOrDeny::Deny), } } - RouterTargetInternal::Ip(virt_ip) => match self.v2p.get(&virt_ip) { + RouterTargetInternal::Ip(virt_ip) => match self + .vni_state + .v2p + .get(&virt_ip) + { Some(phys) => ( true, PhysNet { ether: phys.ether, ip: phys.ip, vni: self.vni }, + false, ), // The router target has specified a VPC IP we do not @@ -290,7 +323,7 @@ impl StaticAction for EncapAction { }, RouterTargetInternal::VpcSubnet(_) => { - match self.v2p.get(&flow_id.dst_ip()) { + match self.vni_state.v2p.get(&flow_id.dst_ip()) { Some(phys) => ( true, PhysNet { @@ -298,6 +331,7 @@ impl StaticAction for EncapAction { ip: phys.ip, vni: self.vni, }, + false, ), // The guest is attempting to contact a VPC IP we @@ -330,13 +364,25 @@ impl StaticAction for EncapAction { data: Cow::Borrowed(GENEVE_MSS_SIZE_OPT_BODY), }; + static GENEVE_MCAST_OPT_BODY: &[u8] = &[0; size_of::()]; + static GENEVE_MCAST_OPT: ArbitraryGeneveOption = + ArbitraryGeneveOption { + option_class: GENEVE_OPT_CLASS_OXIDE, + option_type: OxideOptionType::Multicast as u8, + data: Cow::Borrowed(GENEVE_MCAST_OPT_BODY), + }; + + let outer_mac = + if is_mcast { phys_target.ether } else { MacAddr::ZERO }; + let tfrm = HdrTransform { name: ENCAP_NAME.to_string(), // We leave the outer src/dst up to the driver. + // In the multicast case we can, however, derive this. 
outer_ether: HeaderAction::Push( Valid::validated(EtherMeta { + dst: outer_mac, src: MacAddr::ZERO, - dst: MacAddr::ZERO, ether_type: EtherType::Ipv6, }) .expect("Ethernet validation is infallible"), @@ -369,30 +415,45 @@ impl StaticAction for EncapAction { EncapPush::from(GenevePush { vni: phys_target.vni, entropy: flow_id.crc32() as u16, - // Allocate space in which we can include the TCP MSS, when - // needed during MSS boosting. It's theoretically doable to - // gate this on seeing an unexpectedly high/low MSS option - // in the TCP handshake, but there are problems in doing so: - // * The MSS for the flow is negotiated, but the UFT entry - // containing this transform does not know the other side. - // * UFT invalidation means we may rerun this transform in - // the middle of a flow. - // So, emit it unconditionally for VPC-internal TCP traffic, - // which could need the original MSS to be carried when LSO - // is in use. - options: if pkt_meta.is_inner_tcp() && is_internal { - Cow::Borrowed(core::slice::from_ref( + options: match ( + pkt_meta.is_inner_tcp() && is_internal, + is_mcast, + ) { + // Allocate space in which we can include the TCP MSS, when + // needed during MSS boosting. It's theoretically doable to + // gate this on seeing an unexpectedly high/low MSS option + // in the TCP handshake, but there are problems in doing so: + // * The MSS for the flow is negotiated, but the UFT entry + // containing this transform does not know the other side. + // * UFT invalidation means we may rerun this transform in + // the middle of a flow. + // So, emit it unconditionally for VPC-internal TCP traffic, + // which could need the original MSS to be carried when LSO + // is in use. 
+ (true, false) => Cow::Borrowed(core::slice::from_ref( &GENEVE_MSS_SIZE_OPT, - )) - } else { - Cow::Borrowed(&[]) + )), + (false, true) => Cow::Borrowed(core::slice::from_ref( + &GENEVE_MCAST_OPT, + )), + (false, false) => Cow::Borrowed(&[]), + // TCP is not exactly multicast compatible. + (true, true) => { + return Ok(AllowOrDeny::Deny); + } }, }), )?), - inner_ether: HeaderAction::Modify(EtherMod { - dst: Some(phys_target.ether), - ..Default::default() - }), + // For multicast packets, the inner destination MAC should already + // correspond to the inner L3 destination address. + inner_ether: if is_mcast { + HeaderAction::Ignore + } else { + HeaderAction::Modify(EtherMod { + dst: Some(phys_target.ether), + ..Default::default() + }) + }, ..Default::default() }; @@ -483,31 +544,22 @@ impl StaticAction for DecapAction { } pub struct VpcMappings { - inner: KMutex>>, + inner: KMutex>>, } impl VpcMappings { /// Add a new mapping from VIP to [`PhysNet`], returning a pointer /// to the [`Virt2Phys`] this mapping belongs to. - pub fn add(&self, vip: IpAddr, phys: PhysNet) -> Arc { + pub fn add(&self, vip: IpAddr, phys: PhysNet) -> Arc { // We convert to GuestPhysAddr because it saves us from // redundant storage of the VNI. let guest_phys = GuestPhysAddr::from(phys); let mut lock = self.inner.lock(); - match lock.get(&phys.vni) { - Some(v2p) => { - v2p.set(vip, guest_phys); - v2p.clone() - } + let state = lock.entry(phys.vni).or_default(); + state.v2p.set(vip, guest_phys); - None => { - let v2p = Arc::new(Virt2Phys::new()); - v2p.set(vip, guest_phys); - lock.insert(phys.vni, v2p.clone()); - v2p - } - } + state.clone() } /// Delete the mapping for the given VIP in the given VNI. @@ -515,7 +567,7 @@ impl VpcMappings { /// Return the existing entry, if there is one. 
pub fn del(&self, vip: &IpAddr, phys: &PhysNet) -> Option { match self.inner.lock().get(&phys.vni) { - Some(v2p) => v2p.remove(vip).map(|guest_phys| PhysNet { + Some(state) => state.v2p.remove(vip).map(|guest_phys| PhysNet { ether: guest_phys.ether, ip: guest_phys.ip, vni: phys.vni, @@ -530,11 +582,13 @@ impl VpcMappings { let mut mappings = Vec::new(); let lock = self.inner.lock(); - for (vni, v2p) in lock.iter() { + for (vni, state) in lock.iter() { mappings.push(VpcMapResp { vni: *vni, - ip4: v2p.dump_ip4(), - ip6: v2p.dump_ip6(), + ip4: state.v2p.dump_ip4(), + ip6: state.v2p.dump_ip6(), + mcast_ip4: state.m2p.dump_ip4(), + mcast_ip6: state.m2p.dump_ip6(), }); } @@ -548,8 +602,8 @@ impl VpcMappings { /// assumption is enforced by the control plane; making sure that /// peered VPCs do not overlap their VIP ranges. pub fn ip_to_vni(&self, vip: &IpAddr) -> Option { - for (vni, v2p) in self.inner.lock().iter() { - if v2p.get(vip).is_some() { + for (vni, state) in self.inner.lock().iter() { + if state.v2p.get(vip).is_some() { return Some(*vni); } } @@ -568,6 +622,10 @@ impl Default for VpcMappings { } } +// XXX: Should these not be RwLocks? This is a really unfortunate degree of +// contention for multiple ports in the slowpath to block one another. +// (Not common by any means, but needless when it does occur!) + /// A mapping from virtual IPs to physical location. pub struct Virt2Phys { // XXX We need to implement some sort of invalidation mechanism @@ -606,6 +664,21 @@ pub struct Virt2Boundary { pt6: KRwLock>>, } +// XXX Isn't this really just a V2P mapping, without a guest MAC? +/// A mapping from inner multicast destination IPs to underlay multicast groups. +pub struct Mcast2Phys { + // XXX In theory this is vulnerable to the same concerns around validation + // as `Virt2Phys`. 
+ ip4: KMutex>, + ip6: KMutex>, +} + +#[derive(Default)] +pub struct PerVniMaps { + pub v2p: Virt2Phys, + pub m2p: Mcast2Phys, +} + pub const TUNNEL_ENDPOINT_MAC: [u8; 6] = [0xA8, 0x40, 0x25, 0x77, 0x77, 0x77]; impl Virt2Boundary { @@ -828,3 +901,68 @@ impl MappingResource for Virt2Phys { } } } + +impl Mcast2Phys { + pub fn new() -> Self { + Self { + ip4: KMutex::new(BTreeMap::new()), + ip6: KMutex::new(BTreeMap::new()), + } + } + + pub fn dump_ip4(&self) -> Vec<(Ipv4Addr, Ipv6Addr)> { + self.ip4.lock().iter().map(|(vip, mcast)| (*vip, mcast.0)).collect() + } + + pub fn dump_ip6(&self) -> Vec<(Ipv6Addr, Ipv6Addr)> { + self.ip6.lock().iter().map(|(vip, mcast)| (*vip, mcast.0)).collect() + } +} + +impl Default for Mcast2Phys { + fn default() -> Self { + Self::new() + } +} + +#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)] +pub struct MulticastUnderlay(Ipv6Addr); + +impl MulticastUnderlay { + pub fn new(addr: Ipv6Addr) -> Option { + if addr.is_multicast() { Some(Self(addr)) } else { None } + } + + fn dest_mac(&self) -> MacAddr { + self.0.unchecked_multicast_mac() + } +} + +impl Resource for Mcast2Phys {} +impl ResourceEntry for MulticastUnderlay {} + +impl MappingResource for Mcast2Phys { + type Key = IpAddr; + type Entry = MulticastUnderlay; + + fn get(&self, vip: &Self::Key) -> Option { + match vip { + IpAddr::Ip4(ip4) => self.ip4.lock().get(ip4).cloned(), + IpAddr::Ip6(ip6) => self.ip6.lock().get(ip6).cloned(), + } + } + + fn remove(&self, vip: &Self::Key) -> Option { + match vip { + IpAddr::Ip4(ip4) => self.ip4.lock().remove(ip4), + IpAddr::Ip6(ip6) => self.ip6.lock().remove(ip6), + } + } + + fn set(&self, vip: Self::Key, mcast: Self::Entry) -> Option { + match vip { + IpAddr::Ip4(ip4) => self.ip4.lock().insert(ip4, mcast), + IpAddr::Ip6(ip6) => self.ip6.lock().insert(ip6, mcast), + } + } +} diff --git a/xde/src/xde.rs b/xde/src/xde.rs index 5a714a1f..9ce7b6de 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -447,7 +447,7 @@ pub struct 
XdeDev { // However, that's not where things are today. pub port: Arc>, vpc_cfg: VpcCfg, - port_v2p: Arc, + port_vni_state: Arc, // Pass the packets through to the underlay devices, skipping // opte-core processing. @@ -964,7 +964,7 @@ fn create_xde(req: &CreateXdeReq) -> Result { state.ectx.clone(), &req.dhcp, )?, - port_v2p, + port_vni_state: port_v2p, vni: cfg.vni, vpc_cfg: cfg, passthrough: req.passthrough, @@ -2379,7 +2379,7 @@ fn new_port( name: String, cfg: &VpcCfg, vpc_map: Arc, - v2p: Arc, + vni_state: Arc, v2b: Arc, ectx: Arc, dhcp_cfg: &DhcpCfg, @@ -2402,7 +2402,7 @@ fn new_port( gateway::setup(&pb, &cfg, vpc_map, FT_LIMIT_ONE, dhcp_cfg)?; router::setup(&pb, &cfg, FT_LIMIT_ONE)?; nat::setup(&mut pb, &cfg, nat_ft_limit)?; - overlay::setup(&pb, &cfg, v2p, v2b, FT_LIMIT_ONE)?; + overlay::setup(&pb, &cfg, vni_state, v2b, FT_LIMIT_ONE)?; // Set the overall unified flow and TCP flow table limits based on the total // configuration above, by taking the maximum of size of the individual From 6746d725c7aa231c63d88af33b53a5c4cd397a51 Mon Sep 17 00:00:00 2001 From: Zeeshan Lakhani Date: Mon, 13 Oct 2025 04:59:03 +0000 Subject: [PATCH 5/7] [mcast] work through OPTE mcast support This implements IPv4 and IPv6 multicast packet forwarding with three replication modes (External, Underlay, All) for rack-wide multicast delivery across VPCs. 
Includes: - M2P (Multicast-to-Physical) mappings with admin-scoped IPv6 underlay - Per-port multicast group subscriptions for local delivery - Multicast forwarding table with configurable replication strategies - Geneve multicast option encoding for delivery mode signaling - RX path loop prevention (packets marked Underlay skip re-relay) - TX/RX path integration with flow table and encapsulation - DTrace probes for multicast delivery observability - API addition: set_mcast_fwd/clear_mcast_fwd for forwarding table management - API addition: mcast_subscribe/mcast_unsubscribe for port group membership - API addition: dump_mcast_fwd for observability - Testing: XDE integration tests covering all replication modes, validation, and edge cases - Testing: oxide-vpc integration tests for Geneve encapsulation and parsing - Enforce DEFAULT_MULTICAST_VNI (77) for all multicast traffic (groups are fleet-side/cross-VPC) and validate admin-scoped underlay addresses (ff04::/16, ff05::/16, ff08::/16). --- .github/buildomat/jobs/test.sh | 12 + .github/buildomat/jobs/xde.sh | 21 + .gitignore | 3 +- Cargo.lock | 1 + bin/opteadm/src/bin/opteadm.rs | 54 ++ crates/opte-api/src/cmd.rs | 66 +- crates/opte-api/src/ip.rs | 60 ++ dtrace/opte-mcast-delivery.d | 97 ++ lib/opte-ioctl/src/lib.rs | 59 ++ lib/opte-test-utils/src/geneve_verify.rs | 183 ++++ lib/opte-test-utils/src/lib.rs | 21 +- lib/opte/README.adoc | 38 +- lib/oxide-vpc/src/api.rs | 221 +++++ lib/oxide-vpc/src/engine/gateway/mod.rs | 53 +- lib/oxide-vpc/src/engine/geneve.rs | 182 +++- lib/oxide-vpc/src/engine/overlay.rs | 229 +++-- lib/oxide-vpc/src/engine/router.rs | 41 + lib/oxide-vpc/src/print.rs | 36 + lib/oxide-vpc/tests/integration_tests.rs | 345 ++++++- xde-tests/Cargo.toml | 1 + xde-tests/src/lib.rs | 542 ++++++++++- xde-tests/tests/loopback.rs | 2 +- xde-tests/tests/multicast_multi_sub.rs | 363 +++++++ xde-tests/tests/multicast_rx.rs | 514 ++++++++++ xde-tests/tests/multicast_validation.rs | 239 +++++ xde/src/dev_map.rs | 
41 +- xde/src/stats.rs | 63 ++ xde/src/xde.rs | 1121 ++++++++++++++++++++-- 28 files changed, 4384 insertions(+), 224 deletions(-) create mode 100644 dtrace/opte-mcast-delivery.d create mode 100644 lib/opte-test-utils/src/geneve_verify.rs create mode 100644 xde-tests/tests/multicast_multi_sub.rs create mode 100644 xde-tests/tests/multicast_rx.rs create mode 100644 xde-tests/tests/multicast_validation.rs diff --git a/.github/buildomat/jobs/test.sh b/.github/buildomat/jobs/test.sh index 236234a0..59a62dae 100755 --- a/.github/buildomat/jobs/test.sh +++ b/.github/buildomat/jobs/test.sh @@ -82,3 +82,15 @@ pfexec add_drv xde banner "test" pfexec chmod +x /input/xde/work/test/loopback pfexec /input/xde/work/test/loopback --nocapture + +# Multicast tests must run with --test-threads=1 because they share +# hardcoded device names (xde_test_sim0/1, xde_test_vnic0/1) that conflict +# when tests run in parallel +pfexec chmod +x /input/xde/work/test/multicast_rx +pfexec /input/xde/work/test/multicast_rx --nocapture --test-threads=1 + +pfexec chmod +x /input/xde/work/test/multicast_multi_sub +pfexec /input/xde/work/test/multicast_multi_sub --nocapture --test-threads=1 + +pfexec chmod +x /input/xde/work/test/multicast_validation +pfexec /input/xde/work/test/multicast_validation --nocapture --test-threads=1 diff --git a/.github/buildomat/jobs/xde.sh b/.github/buildomat/jobs/xde.sh index 3abe2881..82baf11c 100755 --- a/.github/buildomat/jobs/xde.sh +++ b/.github/buildomat/jobs/xde.sh @@ -14,6 +14,9 @@ #: "=/work/release/xde_link.so", #: "=/work/release/xde_link.so.sha256", #: "=/work/test/loopback", +#: "=/work/test/multicast_rx", +#: "=/work/test/multicast_multi_sub", +#: "=/work/test/multicast_validation", #: "=/work/xde.conf", #: ] #: @@ -116,5 +119,23 @@ loopback_test=$( cargo build -q --test loopback --message-format=json |\ jq -r "select(.profile.test == true) | .filenames[]" ) +cargo build --test multicast_rx +multicast_rx_test=$( + cargo build -q --test multicast_rx 
--message-format=json |\ + jq -r "select(.profile.test == true) | .filenames[]" +) +cargo build --test multicast_multi_sub +multicast_multi_sub_test=$( + cargo build -q --test multicast_multi_sub --message-format=json |\ + jq -r "select(.profile.test == true) | .filenames[]" +) +cargo build --test multicast_validation +multicast_validation_test=$( + cargo build -q --test multicast_validation --message-format=json |\ + jq -r "select(.profile.test == true) | .filenames[]" +) mkdir -p /work/test cp $loopback_test /work/test/loopback +cp $multicast_rx_test /work/test/multicast_rx +cp $multicast_multi_sub_test /work/test/multicast_multi_sub +cp $multicast_validation_test /work/test/multicast_validation diff --git a/.gitignore b/.gitignore index f82d74c0..5956d6b9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.html target download -.DS_STORE +scripts +.DS_STORE \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index c3fb0628..743f68c8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2972,6 +2972,7 @@ dependencies = [ "anyhow", "libnet", "opte-ioctl", + "opte-test-utils", "oxide-vpc", "rand", "slog", diff --git a/bin/opteadm/src/bin/opteadm.rs b/bin/opteadm/src/bin/opteadm.rs index 219bf555..706b14a4 100644 --- a/bin/opteadm/src/bin/opteadm.rs +++ b/bin/opteadm/src/bin/opteadm.rs @@ -27,8 +27,10 @@ use oxide_vpc::api::AddFwRuleReq; use oxide_vpc::api::AddRouterEntryReq; use oxide_vpc::api::Address; use oxide_vpc::api::BOUNDARY_SERVICES_VNI; +use oxide_vpc::api::ClearMcastForwardingReq; use oxide_vpc::api::ClearVirt2BoundaryReq; use oxide_vpc::api::ClearVirt2PhysReq; +use oxide_vpc::api::DEFAULT_MULTICAST_VNI; use oxide_vpc::api::DelRouterEntryReq; use oxide_vpc::api::DelRouterEntryResp; use oxide_vpc::api::DhcpCfg; @@ -39,22 +41,26 @@ use oxide_vpc::api::FirewallRule; use oxide_vpc::api::IpCfg; use oxide_vpc::api::Ipv4Cfg; use oxide_vpc::api::Ipv6Cfg; +use oxide_vpc::api::NextHopV6; use oxide_vpc::api::PhysNet; use oxide_vpc::api::PortInfo; use 
oxide_vpc::api::Ports; use oxide_vpc::api::ProtoFilter; use oxide_vpc::api::RemFwRuleReq; use oxide_vpc::api::RemoveCidrResp; +use oxide_vpc::api::Replication; use oxide_vpc::api::RouterClass; use oxide_vpc::api::RouterTarget; use oxide_vpc::api::SNat4Cfg; use oxide_vpc::api::SNat6Cfg; use oxide_vpc::api::SetExternalIpsReq; use oxide_vpc::api::SetFwRulesReq; +use oxide_vpc::api::SetMcastForwardingReq; use oxide_vpc::api::SetVirt2BoundaryReq; use oxide_vpc::api::SetVirt2PhysReq; use oxide_vpc::api::TunnelEndpoint; use oxide_vpc::api::VpcCfg; +use oxide_vpc::print::print_mcast_fwd; use oxide_vpc::print::print_v2b; use oxide_vpc::print::print_v2p; use std::io; @@ -225,6 +231,31 @@ enum Command { /// Clear a virtual-to-boundary mapping ClearV2B { prefix: IpCidr, tunnel_endpoint: Vec }, + /// Set a multicast forwarding entry + SetMcastFwd { + /// The multicast group address (IPv4 or IPv6) + group: IpAddr, + /// Next hop IPv6 address + next_hop_addr: Ipv6Addr, + /// Next hop VNI (defaults to fleet-level DEFAULT_MULTICAST_VNI) + #[arg(default_value_t = Vni::new(DEFAULT_MULTICAST_VNI).unwrap())] + next_hop_vni: Vni, + /// Delivery mode (replication): + /// - external: local guests in same VNI + /// - underlay: infrastructure via underlay multicast + /// - all: both local and underlay + replication: Replication, + }, + + /// Clear a multicast forwarding entry + ClearMcastFwd { + /// The multicast group address (IPv4 or IPv6) + group: IpAddr, + }, + + /// Dump the multicast forwarding table + DumpMcastFwd, + /// Add a new router entry, either IPv4 or IPv6. 
AddRouterEntry { #[command(flatten)] @@ -764,6 +795,29 @@ fn main() -> anyhow::Result<()> { hdl.clear_v2b(&req)?; } + Command::SetMcastFwd { + group, + next_hop_addr, + next_hop_vni, + replication, + } => { + let next_hop = NextHopV6::new(next_hop_addr, next_hop_vni); + let req = SetMcastForwardingReq { + group, + next_hops: vec![(next_hop, replication)], + }; + hdl.set_mcast_fwd(&req)?; + } + + Command::ClearMcastFwd { group } => { + let req = ClearMcastForwardingReq { group }; + hdl.clear_mcast_fwd(&req)?; + } + + Command::DumpMcastFwd => { + print_mcast_fwd(&hdl.dump_mcast_fwd()?)?; + } + Command::AddRouterEntry { route: RouterRule { port, dest, target, class }, } => { diff --git a/crates/opte-api/src/cmd.rs b/crates/opte-api/src/cmd.rs index 5c0f9986..de507062 100644 --- a/crates/opte-api/src/cmd.rs +++ b/crates/opte-api/src/cmd.rs @@ -25,31 +25,38 @@ pub const XDE_IOC_OPTE_CMD: i32 = XDE_IOC as i32 | 0x01; #[derive(Clone, Copy, Debug)] #[repr(C)] pub enum OpteCmd { - ListPorts = 1, // list all ports - AddFwRule = 20, // add firewall rule - RemFwRule = 21, // remove firewall rule - SetFwRules = 22, // set/replace all firewall rules at once - DumpTcpFlows = 30, // dump TCP flows - DumpLayer = 31, // dump the specified Layer - DumpUft = 32, // dump the Unified Flow Table - ListLayers = 33, // list the layers on a given port - ClearUft = 40, // clear the UFT - ClearLft = 41, // clear the given Layer's Flow Table - SetVirt2Phys = 50, // set a v2p mapping - DumpVirt2Phys = 51, // dump the v2p mappings - SetVirt2Boundary = 52, // set a v2b mapping - ClearVirt2Boundary = 53, // clear a v2b mapping - DumpVirt2Boundary = 54, // dump the v2b mappings - ClearVirt2Phys = 55, // clear a v2p mapping - AddRouterEntry = 60, // add a router entry for IP dest - DelRouterEntry = 61, // remove a router entry for IP dest - CreateXde = 70, // create a new xde device - DeleteXde = 71, // delete an xde device - SetXdeUnderlay = 72, // set xde underlay devices - ClearXdeUnderlay = 73, 
// clear xde underlay devices - SetExternalIps = 80, // set xde external IPs for a port - AllowCidr = 90, // allow ip block through gateway tx/rx - RemoveCidr = 91, // deny ip block through gateway tx/rx + ListPorts = 1, // list all ports + AddFwRule = 20, // add firewall rule + RemFwRule = 21, // remove firewall rule + SetFwRules = 22, // set/replace all firewall rules at once + DumpTcpFlows = 30, // dump TCP flows + DumpLayer = 31, // dump the specified Layer + DumpUft = 32, // dump the Unified Flow Table + ListLayers = 33, // list the layers on a given port + ClearUft = 40, // clear the UFT + ClearLft = 41, // clear the given Layer's Flow Table + SetVirt2Phys = 50, // set a v2p mapping + DumpVirt2Phys = 51, // dump the v2p mappings + SetVirt2Boundary = 52, // set a v2b mapping + ClearVirt2Boundary = 53, // clear a v2b mapping + DumpVirt2Boundary = 54, // dump the v2b mappings + ClearVirt2Phys = 55, // clear a v2p mapping + AddRouterEntry = 60, // add a router entry for IP dest + DelRouterEntry = 61, // remove a router entry for IP dest + CreateXde = 70, // create a new xde device + DeleteXde = 71, // delete an xde device + SetXdeUnderlay = 72, // set xde underlay devices + ClearXdeUnderlay = 73, // clear xde underlay devices + SetExternalIps = 80, // set xde external IPs for a port + AllowCidr = 90, // allow ip block through gateway tx/rx + RemoveCidr = 91, // deny ip block through gateway tx/rx + SetMcastForwarding = 100, // set multicast forwarding entries + ClearMcastForwarding = 101, // clear multicast forwarding entries + DumpMcastForwarding = 102, // dump multicast forwarding table + McastSubscribe = 103, // subscribe a port to a multicast group + McastUnsubscribe = 104, // unsubscribe a port from a multicast group + SetMcast2Phys = 105, // set M2P mapping (group -> underlay mcast) + ClearMcast2Phys = 106, // clear M2P mapping } impl TryFrom for OpteCmd { @@ -82,6 +89,13 @@ impl TryFrom for OpteCmd { 80 => Ok(Self::SetExternalIps), 90 => 
Ok(Self::AllowCidr), 91 => Ok(Self::RemoveCidr), + 100 => Ok(Self::SetMcastForwarding), + 101 => Ok(Self::ClearMcastForwarding), + 102 => Ok(Self::DumpMcastForwarding), + 103 => Ok(Self::McastSubscribe), + 104 => Ok(Self::McastUnsubscribe), + 105 => Ok(Self::SetMcast2Phys), + 106 => Ok(Self::ClearMcast2Phys), _ => Err(()), } } @@ -177,6 +191,7 @@ pub enum OpteError { dest: IpCidr, target: String, }, + InvalidUnderlayMulticast(String), LayerNotFound(String), MacExists { port: String, @@ -230,6 +245,7 @@ impl OpteError { Self::DeserCmdReq(_) => ENOMSG, Self::FlowExists(_) => EEXIST, Self::InvalidRouterEntry { .. } => EINVAL, + Self::InvalidUnderlayMulticast(_) => EINVAL, Self::LayerNotFound(_) => ENOENT, Self::MacExists { .. } => EEXIST, Self::MaxCapacity(_) => ENFILE, diff --git a/crates/opte-api/src/ip.rs b/crates/opte-api/src/ip.rs index 4f8c8fc7..3da20d9c 100644 --- a/crates/opte-api/src/ip.rs +++ b/crates/opte-api/src/ip.rs @@ -653,6 +653,24 @@ impl Ipv6Addr { self.inner[0] == 0xFF } + /// Return `true` if this is a multicast IPv6 address with administrative scope + /// (admin-local, site-local, or organization-local) as defined in RFC 4291 and RFC 7346. + /// + /// The three administrative scopes are: + /// - `0x4`: admin-local scope + /// - `0x5`: site-local scope + /// - `0x8`: organization-local scope + pub const fn is_admin_scoped_multicast(&self) -> bool { + if !self.is_multicast() { + return false; + } + + // Extract the scope field from the lower 4 bits of the second byte + // (first byte is 0xFF for all multicast, second byte contains flags and scope) + let scope = self.inner[1] & 0x0F; + matches!(scope, 0x4 | 0x5 | 0x8) + } + /// Return the bytes of the address. pub fn bytes(&self) -> [u8; 16] { self.inner @@ -1002,6 +1020,12 @@ impl Display for Ipv4Cidr { } impl Ipv4Cidr { + /// IPv4 multicast address range, `224.0.0.0/4`. 
+ pub const MCAST: Self = Self { + ip: Ipv4Addr::from_const([224, 0, 0, 0]), + prefix_len: Ipv4PrefixLen(4), + }; + pub fn ip(&self) -> Ipv4Addr { self.parts().0 } @@ -1159,6 +1183,24 @@ impl Ipv6Cidr { prefix_len: Ipv6PrefixLen(64), }; + /// IPv6 admin-local multicast scope prefix, `ff04::/16`. + pub const MCAST_ADMIN_LOCAL: Self = Self { + ip: Ipv6Addr::from_const([0xff04, 0, 0, 0, 0, 0, 0, 0]), + prefix_len: Ipv6PrefixLen(16), + }; + + /// IPv6 site-local multicast scope prefix, `ff05::/16`. + pub const MCAST_SITE_LOCAL: Self = Self { + ip: Ipv6Addr::from_const([0xff05, 0, 0, 0, 0, 0, 0, 0]), + prefix_len: Ipv6PrefixLen(16), + }; + + /// IPv6 organization-local multicast scope prefix, `ff08::/16`. + pub const MCAST_ORG_LOCAL: Self = Self { + ip: Ipv6Addr::from_const([0xff08, 0, 0, 0, 0, 0, 0, 0]), + prefix_len: Ipv6PrefixLen(16), + }; + pub fn new(ip: Ipv6Addr, prefix_len: Ipv6PrefixLen) -> Self { let ip = ip.safe_mask(prefix_len); Ipv6Cidr { ip, prefix_len } @@ -1481,6 +1523,24 @@ mod test { assert_eq!(addr.solicited_node_multicast(), expected); } + #[test] + fn test_ipv6_admin_scoped_multicast() { + // Test the three valid administrative scopes + assert!(to_ipv6("ff04::1").is_admin_scoped_multicast()); // admin-local (0x4) + assert!(to_ipv6("ff05::1").is_admin_scoped_multicast()); // site-local (0x5) + assert!(to_ipv6("ff08::1").is_admin_scoped_multicast()); // organization-local (0x8) + + // Test non-admin scoped multicast addresses + assert!(!to_ipv6("ff01::1").is_admin_scoped_multicast()); // interface-local + assert!(!to_ipv6("ff02::1").is_admin_scoped_multicast()); // link-local + assert!(!to_ipv6("ff0e::1").is_admin_scoped_multicast()); // global + + // Test non-multicast addresses + assert!(!to_ipv6("fd00::1").is_admin_scoped_multicast()); // ULA + assert!(!to_ipv6("fe80::1").is_admin_scoped_multicast()); // link-local unicast + assert!(!to_ipv6("2001:db8::1").is_admin_scoped_multicast()); // global unicast + } + #[test] fn dhcp_fqdn() { let no_host = 
DhcpCfg { hostname: None, ..Default::default() }; diff --git a/dtrace/opte-mcast-delivery.d b/dtrace/opte-mcast-delivery.d new file mode 100644 index 00000000..4924012a --- /dev/null +++ b/dtrace/opte-mcast-delivery.d @@ -0,0 +1,97 @@ +/* + * Track multicast packet delivery. + * + * dtrace -L ./lib -I . -Cqs ./opte-mcast-delivery.d + */ +#include "common.h" + +#define HDR_FMT "%-8s %-6s %-39s %-20s %-10s\n" +#define LINE_FMT "%-8s %-6d %-39s %-20s %-10s\n" + +BEGIN { + printf(HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP", "REPL"); + num = 0; +} + +sdt:xde::mcast-tx { + /* arg0=af, arg1=addr_ptr, arg2=vni, arg3=replication */ + this->af = arg0; + this->group_ptr = arg1; + this->vni = arg2; + this->repl = arg3; + + if (num >= 10) { + printf(HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP", "REPL"); + num = 0; + } + + this->group_str = (this->af == AF_INET) ? + inet_ntoa((ipaddr_t *)this->group_ptr) : + inet_ntoa6((in6_addr_t *)this->group_ptr); + this->repl_str = (this->repl == 0) ? "External" : + (this->repl == 1) ? "Underlay" : + (this->repl == 2) ? "All" : "Unknown"; + printf(LINE_FMT, "TX", this->vni, this->group_str, "-", this->repl_str); + num++; +} + +sdt:xde::mcast-rx { + /* arg0=af, arg1=addr_ptr, arg2=vni, arg3=replication */ + this->af = arg0; + this->group_ptr = arg1; + this->vni = arg2; + this->repl = arg3; + + if (num >= 10) { + printf(HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP", "REPL"); + num = 0; + } + + this->group_str = (this->af == AF_INET) ? + inet_ntoa((ipaddr_t *)this->group_ptr) : + inet_ntoa6((in6_addr_t *)this->group_ptr); + this->repl_str = (this->repl == 0) ? "External" : + (this->repl == 1) ? "Underlay" : + (this->repl == 2) ? 
"All" : "Unknown"; + printf(LINE_FMT, "RX", this->vni, this->group_str, "-", this->repl_str); + num++; +} + +sdt:xde::mcast-local-delivery { + /* arg0=af, arg1=addr_ptr, arg2=vni, arg3=port */ + this->af = arg0; + this->group_ptr = arg1; + this->vni = arg2; + this->port = stringof(arg3); + + if (num >= 10) { + printf(HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP", "REPL"); + num = 0; + } + + this->group_str = (this->af == AF_INET) ? + inet_ntoa((ipaddr_t *)this->group_ptr) : + inet_ntoa6((in6_addr_t *)this->group_ptr); + printf(LINE_FMT, "DELIVER", this->vni, this->group_str, this->port, "-"); + num++; +} + +sdt:xde::mcast-underlay-fwd { + /* arg0=af, arg1=addr_ptr, arg2=vni, arg3=next_hop */ + this->af = arg0; + this->group_ptr = arg1; + this->vni = arg2; + this->next_hop = (in6_addr_t *)arg3; + + if (num >= 10) { + printf(HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP", "REPL"); + num = 0; + } + + this->group_str = (this->af == AF_INET) ? + inet_ntoa((ipaddr_t *)this->group_ptr) : + inet_ntoa6((in6_addr_t *)this->group_ptr); + this->next_hop_str = inet_ntoa6(this->next_hop); + printf(LINE_FMT, "UNDERLAY", this->vni, this->group_str, this->next_hop_str, "-"); + num++; +} diff --git a/lib/opte-ioctl/src/lib.rs b/lib/opte-ioctl/src/lib.rs index c896ce4b..26fd831f 100644 --- a/lib/opte-ioctl/src/lib.rs +++ b/lib/opte-ioctl/src/lib.rs @@ -27,6 +27,8 @@ use opte::api::XDE_IOC_OPTE_CMD; use oxide_vpc::api::AddFwRuleReq; use oxide_vpc::api::AddRouterEntryReq; use oxide_vpc::api::AllowCidrReq; +use oxide_vpc::api::ClearMcast2PhysReq; +use oxide_vpc::api::ClearMcastForwardingReq; use oxide_vpc::api::ClearVirt2BoundaryReq; use oxide_vpc::api::ClearVirt2PhysReq; use oxide_vpc::api::CreateXdeReq; @@ -34,15 +36,20 @@ use oxide_vpc::api::DelRouterEntryReq; use oxide_vpc::api::DelRouterEntryResp; use oxide_vpc::api::DeleteXdeReq; use oxide_vpc::api::DhcpCfg; +use oxide_vpc::api::DumpMcastForwardingResp; use oxide_vpc::api::DumpVirt2BoundaryResp; use 
oxide_vpc::api::DumpVirt2PhysResp; use oxide_vpc::api::IpCidr; use oxide_vpc::api::ListPortsResp; +use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastUnsubscribeReq; use oxide_vpc::api::RemFwRuleReq; use oxide_vpc::api::RemoveCidrReq; use oxide_vpc::api::RemoveCidrResp; use oxide_vpc::api::SetExternalIpsReq; use oxide_vpc::api::SetFwRulesReq; +use oxide_vpc::api::SetMcast2PhysReq; +use oxide_vpc::api::SetMcastForwardingReq; use oxide_vpc::api::SetVirt2BoundaryReq; use oxide_vpc::api::SetVirt2PhysReq; use oxide_vpc::api::VpcCfg; @@ -205,6 +212,16 @@ impl OpteHdl { run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) } + pub fn set_m2p(&self, req: &SetMcast2PhysReq) -> Result { + let cmd = OpteCmd::SetMcast2Phys; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + + pub fn clear_m2p(&self, req: &ClearMcast2PhysReq) -> Result { + let cmd = OpteCmd::ClearMcast2Phys; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + pub fn set_v2b(&self, req: &SetVirt2BoundaryReq) -> Result { let cmd = OpteCmd::SetVirt2Boundary; run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) @@ -224,6 +241,48 @@ impl OpteHdl { run_cmd_ioctl(self.device.as_raw_fd(), cmd, None::<&()>) } + /// Set a multicast forwarding entry. + pub fn set_mcast_fwd( + &self, + req: &SetMcastForwardingReq, + ) -> Result { + let cmd = OpteCmd::SetMcastForwarding; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + + /// Clear a multicast forwarding entry. + pub fn clear_mcast_fwd( + &self, + req: &ClearMcastForwardingReq, + ) -> Result { + let cmd = OpteCmd::ClearMcastForwarding; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + + /// Dump the multicast forwarding table. + pub fn dump_mcast_fwd(&self) -> Result { + let cmd = OpteCmd::DumpMcastForwarding; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, None::<&()>) + } + + /// Subscribe a port to a multicast group. 
+ pub fn mcast_subscribe( + &self, + req: &McastSubscribeReq, + ) -> Result { + let cmd = OpteCmd::McastSubscribe; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + + /// Unsubscribe a port from a multicast group. + pub fn mcast_unsubscribe( + &self, + req: &McastUnsubscribeReq, + ) -> Result { + let cmd = OpteCmd::McastUnsubscribe; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + /// Set xde underlay devices. pub fn set_xde_underlay( &self, diff --git a/lib/opte-test-utils/src/geneve_verify.rs b/lib/opte-test-utils/src/geneve_verify.rs new file mode 100644 index 00000000..9a510548 --- /dev/null +++ b/lib/opte-test-utils/src/geneve_verify.rs @@ -0,0 +1,183 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2025 Oxide Computer Company + +//! Module to parse and verify Geneve headers from snoop hex output. +//! +//! This uses the existing OPTE/ingot Geneve types to parse raw packet bytes +//! and extract key multicast-related fields for test assertions. + +use opte::engine::geneve::Vni; +use opte::engine::ip::v6::Ipv6Ref; +use opte::engine::parse::ValidGeneveOverV6; +use opte::ingot::geneve::GeneveRef; +use opte::ingot::types::HeaderParse; +use oxide_vpc::api::Ipv6Addr; +use oxide_vpc::api::Replication; +use oxide_vpc::engine::geneve::extract_multicast_replication; + +/// Parsed Geneve header information for test verification. +pub struct GeneveInfo { + pub vni: Vni, + pub outer_ipv6_dst: Ipv6Addr, + pub replication: Option, +} + +/// Parse a Geneve/IPv6 packet from raw bytes and extract multicast-related +/// fields. +/// +/// Returns VNI, outer IPv6 destination, and replication mode from Geneve +/// options. 
+pub fn parse_geneve_packet(bytes: &[u8]) -> Result { + let (pkt, _, _) = ValidGeneveOverV6::parse(bytes) + .map_err(|e| format!("Failed to parse Geneve/IPv6 packet: {e:?}"))?; + + let vni = pkt.outer_encap.vni(); + let outer_ipv6_dst = pkt.outer_v6.destination(); + let replication = extract_multicast_replication(&pkt.outer_encap); + + Ok(GeneveInfo { vni, outer_ipv6_dst, replication }) +} + +/// Parse hex string from snoop output into bytes. +/// +/// Snoop output with `-x0` flag is hex digits without separators: +/// "ffffffffffff001122334455..." +pub fn parse_snoop_hex(hex_str: &str) -> Result, String> { + hex_str + .as_bytes() + .chunks(2) + .map(|chunk| { + let hex_byte = std::str::from_utf8(chunk) + .map_err(|e| format!("Invalid UTF-8: {e}"))?; + u8::from_str_radix(hex_byte, 16) + .map_err(|e| format!("Invalid hex: {e}")) + }) + .collect() +} + +/// Extract snoop hex output from command output. +/// +/// We support common `snoop -P -x0` formats: +/// - Lines of contiguous hex digits (with or without spaces). +/// - Hex dumps with an offset prefix like `0:` or `0000:` followed by +/// groups of hex digits (2/4/8/16 chars). +/// +/// To avoid false positives from summary lines (e.g., "UDP port 6081"), the +/// tokenized fallback triggers only for lines that look like offset-prefixed +/// hex dumps. +pub fn extract_snoop_hex(snoop_output: &str) -> Result { + let mut hex_bytes = String::new(); + + for line in snoop_output.lines() { + let trimmed = line.trim(); + if trimmed.is_empty() || trimmed.contains("Using device") { + continue; + } + + // Case 1: entire line is hex digits + whitespace (e.g., "aa bb cc ..." or + // single long line of hex). Remove whitespace and append. + if trimmed.chars().all(|c| c.is_ascii_hexdigit() || c.is_whitespace()) { + for ch in trimmed.chars().filter(|c| c.is_ascii_hexdigit()) { + hex_bytes.push(ch); + } + continue; + } + + // Case 2: offset-prefixed hexdump lines (e.g., "0: 4500 003c ..."). 
+ // Only consider tokenized parsing if the first token looks like an + // offset (decimal or hex) ending with a ':' to avoid pulling numbers + // from summary lines. + let mut tokens = trimmed.split_whitespace(); + let Some(first) = tokens.next() else { continue }; + if !first.ends_with(':') { + continue; // Not a hexdump line + } + let mut off = first.trim_end_matches(':'); + if off.starts_with("0x") || off.starts_with("0X") { + off = &off[2..]; + } + if !off.chars().all(|c| c.is_ascii_hexdigit()) { + continue; // Not a valid offset + } + + for tok in tokens { + let mut t = tok.trim_end_matches(':'); + if t.len() > 2 && (t.starts_with("0x") || t.starts_with("0X")) { + t = &t[2..]; + } + if t.is_empty() { + continue; + } + // Accept groups commonly used in dumps: bytes (2), words (4), dwords (8), + // or qwords (16). Ignore anything else to avoid accidental matches. + let len = t.len(); + if matches!(len, 2 | 4 | 8 | 16) + && t.chars().all(|c| c.is_ascii_hexdigit()) + { + hex_bytes.push_str(t); + } + } + } + + if hex_bytes.is_empty() { + return Err("No hex data found in snoop output".to_string()); + } + + // Ensure even number of nibbles to form complete bytes. 
+ if hex_bytes.len() % 2 == 1 { + hex_bytes.pop(); + } + + Ok(hex_bytes) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn extract_contiguous_hex() { + let input = "deadbeefCAFEBABE"; + let out = extract_snoop_hex(input).unwrap(); + assert_eq!(out, "deadbeefCAFEBABE"); + let bytes = parse_snoop_hex(&out).unwrap(); + assert_eq!(bytes, vec![0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe, 0xba, 0xbe]); + } + + #[test] + fn extract_bytes_with_spaces() { + let input = "45 00 00 3c 1c 46 40 00"; + let out = extract_snoop_hex(input).unwrap(); + assert_eq!(out, "4500003c1c464000"); + } + + #[test] + fn extract_offset_words() { + let input = "0: 4500 003c 1c46 4000"; + let out = extract_snoop_hex(input).unwrap(); + assert_eq!(out, "4500003c1c464000"); + } + + #[test] + fn extract_offset_bytes() { + let input = "0: 45 00 00 3c 1c 46 40 00"; + let out = extract_snoop_hex(input).unwrap(); + assert_eq!(out, "4500003c1c464000"); + } + + #[test] + fn ignore_summary_numbers() { + let input = r#" +Using device xde_test_sim1 (promiscuous) +UDP: fe80::1 > ff04::224.1.2.3, port 6081 +0: 4500 003c 1c46 4000 +"#; + let out = extract_snoop_hex(input).unwrap(); + assert_eq!(out, "4500003c1c464000"); + // Should not accidentally include "6081" + assert!(!out.contains("6081")); + } +} diff --git a/lib/opte-test-utils/src/lib.rs b/lib/opte-test-utils/src/lib.rs index 118892e8..efbf2a0d 100644 --- a/lib/opte-test-utils/src/lib.rs +++ b/lib/opte-test-utils/src/lib.rs @@ -10,6 +10,7 @@ #![allow(dead_code)] pub mod dhcp; +pub mod geneve_verify; pub mod icmp; pub mod pcap; #[macro_use] @@ -269,11 +270,11 @@ fn oxide_net_builder( let dhcp = base_dhcp_config(); firewall::setup(&mut pb, fw_limit).expect("failed to add firewall layer"); - gateway::setup(&pb, cfg, vpc_map, fw_limit, &dhcp) + gateway::setup(&pb, cfg, vpc_map.clone(), fw_limit, &dhcp) .expect("failed to setup gateway layer"); router::setup(&pb, cfg, one_limit).expect("failed to add router layer"); nat::setup(&mut pb, cfg, 
snat_limit).expect("failed to add nat layer"); - overlay::setup(&pb, cfg, vni_state, v2b, one_limit) + overlay::setup(&pb, cfg, vni_state, vpc_map.clone(), v2b, one_limit) .expect("failed to add overlay layer"); pb } @@ -384,10 +385,12 @@ pub fn oxide_net_setup2( let mut updates = vec![ // * Epoch starts at 1, adding router entry bumps it to 2. "set:epoch=2", - // * Allow inbound IPv6 traffic for guest. - // * Allow inbound IPv4 traffic for guest. + // * Allow inbound IPv4 unicast traffic for guest. + // * Allow inbound IPv4 multicast traffic for guest. + // * Allow inbound IPv6 unicast traffic for guest. + // * Allow inbound IPv6 multicast traffic for guest. // * Deny inbound NDP for guest. - "set:gateway.rules.in=3", + "set:gateway.rules.in=5", // IPv4 // ---- // @@ -395,7 +398,8 @@ pub fn oxide_net_setup2( // * ICMP Echo Reply for Gateway // * DHCP Offer // * DHCP Ack - // * Outbound traffic from Guest IP + MAC address + // * Outbound unicast traffic from Guest IP + MAC address + // * Outbound multicast traffic from Guest IP + MAC address // // IPv6 // ---- @@ -406,8 +410,9 @@ pub fn oxide_net_setup2( // * ICMPv6 Echo Reply for Gateway from Guest Link-Local // * ICMPv6 Echo Reply for Gateway from Guest VPC ULA // * DHCPv6 - // * Outbound traffic from Guest IPv6 + MAC Address - "set:gateway.rules.out=12", + // * Outbound unicast traffic from Guest IPv6 + MAC Address + // * Outbound multicast traffic from Guest IPv6 + MAC Address + "set:gateway.rules.out=14", // * Allow all outbound traffic "set:firewall.rules.out=0", // * Outbound IPv4 SNAT diff --git a/lib/opte/README.adoc b/lib/opte/README.adoc index 3bf6fe79..97f19242 100644 --- a/lib/opte/README.adoc +++ b/lib/opte/README.adoc @@ -209,11 +209,47 @@ resources. Pausing, Saving, & Restoring:: A port may be paused, saved, and restored for the purpose of live migration. The pausing of a state allows it to halt all packet processing and quiesce to a steady state. 
-In this state is is then possible to save the port's state which has +In this state it is then possible to save the port's state which has all data needed to restart the port without rebuilding the entire flow state. This is achieved by restoring the port based on some payload of save data. +=== Multicast Model + +OPTE implements multicast consistent with the rack networking +architecture described in [RFD 63](https://rfd.shared.oxide.computer/rfd/0063) +and [RFD 488](https://rfd.shared.oxide.computer/rfd/488). Key points: + +Fleet VNI:: All multicast traffic uses a single fleet‑level Geneve VNI +(`DEFAULT_MULTICAST_VNI`, currently `77`) rather than per‑tenant VNIs. +Mappings from overlay multicast groups to underlay multicast addresses +are stored and validated under this VNI. (See `RFD 488` for the rationale behind +fleet-level VNI.) + +Delivery Modes (Replication):: The Oxide Geneve multicast option carries +the delivery mode as a 2‑bit field in the top two bits of the option +body's first byte: + +* External — local guest delivery within the same VNI: OPTE decapsulates + and delivers to all local subscribers (guests) on the port map. +* Underlay — infrastructure delivery: OPTE sends Geneve‑encapsulated + packets towards the configured underlay multicast address in fleet + VNI 77. The underlay performs any further replication. +* All — both behaviors above. + +Encapsulation Path:: The overlay layer sets `External` in the multicast +option on initial encapsulation. XDE uses its multicast forwarding table +to decide whether to additionally forward to underlay next hops, and, if +so, marks those forwarded copies as `Underlay` or `All` to prevent +re‑relay at downstream receivers. + +Constraints & Validation:: + +* M2P (multicast‑to‑physical) mappings must use `DEFAULT_MULTICAST_VNI`. +* Any next hop that causes underlay forwarding must specify VNI 77. +* Underlay multicast addresses must be IPv6 admin‑scoped (e.g., + `ff04::/16`, `ff05::/16`, `ff08::/16`). 
+ === Layers The main function of the port is to process packets in a flow-based diff --git a/lib/oxide-vpc/src/api.rs b/lib/oxide-vpc/src/api.rs index cba09c68..14f443e8 100644 --- a/lib/oxide-vpc/src/api.rs +++ b/lib/oxide-vpc/src/api.rs @@ -20,6 +20,81 @@ use serde::Deserialize; use serde::Serialize; use uuid::Uuid; +/// Multicast packet replication strategy. +/// +/// Encoding and scope: +/// - The Geneve Oxide multicast option encodes replication in the top 2 bits +/// of the option body’s first byte (u2). The remaining 30 bits are reserved. +/// - External means local customer-facing delivery within the same VNI +/// - Underlay means Geneve-encapsulated forwarding to underlay infrastructure +/// members using the fleet multicast VNI. +/// - All combines both behaviors. +/// +/// Current implementation uses a single fleet VNI (DEFAULT_MULTICAST_VNI = 77) +/// for all multicast traffic rack-wide (RFD 488 "Multicast across VPCs"). +#[derive( + Clone, Copy, Debug, Default, Serialize, Deserialize, Eq, PartialEq, Hash, +)] +#[repr(u8)] +pub enum Replication { + /// Replicate packets to external/customer-facing members (guest instances). + /// + /// Local delivery within the same VNI. Packets are decapsulated at the + /// switch before delivery to guests. + #[default] + External = 0x00, + /// Replicate packets to underlay/infrastructure members. + /// + /// Forwards Geneve-encapsulated packets to underlay destinations for + /// infrastructure delivery (not directly to guest instances). Uses + /// DEFAULT_MULTICAST_VNI (77) for encapsulation. + Underlay = 0x01, + /// Replicate packets to both external and underlay members (bifurcated). + /// + /// Combines both customer-facing (decapsulated to guests) and infrastructure + /// (encapsulated) delivery modes for comprehensive multicast distribution. + All = 0x02, + /// Reserved for future use. This value exists to account for all possible + /// values in the 2-bit Geneve option field. 
+    Reserved = 0x03,
+}
+
+impl Replication {
+    /// Merge two replication strategies, preferring the most permissive.
+    ///
+    /// Merging rules:
+    /// - Any `All` -> `All`
+    /// - `External` + `Underlay` -> `All`
+    /// - Same values -> keep the value
+    /// - Default to `All` for unexpected combinations
+    pub const fn merge(self, other: Self) -> Self {
+        match (self, other) {
+            (Self::All, _) | (_, Self::All) => Self::All,
+            (Self::External, Self::Underlay)
+            | (Self::Underlay, Self::External) => Self::All,
+            (a, b) if a as u8 == b as u8 => a,
+            // Prefer `All` for unexpected combinations
+            _ => Self::All,
+        }
+    }
+}
+
+#[cfg(any(feature = "std", test))]
+impl FromStr for Replication {
+    type Err = String;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s.to_ascii_lowercase().as_str() {
+            "external" => Ok(Self::External),
+            "underlay" => Ok(Self::Underlay),
+            "all" => Ok(Self::All),
+            lower => Err(format!(
+                "unexpected replication type {lower} -- expected 'external', 'underlay', or 'all'"
+            )),
+        }
+    }
+}
+
 /// This is the MAC address that OPTE uses to act as the virtual gateway.
 pub const GW_MAC_ADDR: MacAddr =
     MacAddr::from_const([0xA8, 0x40, 0x25, 0xFF, 0x77, 0x77]);
@@ -27,6 +102,19 @@ pub const GW_MAC_ADDR: MacAddr =
 /// tunnel endpoint.
 pub const BOUNDARY_SERVICES_VNI: u32 = 99u32;
 
+/// Default VNI for rack-wide multicast groups (no VPC association).
+/// Must match Omicron's DEFAULT_MULTICAST_VNI.
+///
+/// This is the only VNI currently supported for multicast traffic.
+/// All multicast groups (M2P mappings and forwarding entries) must use this VNI.
+/// OPTE validates that multicast operations specify this VNI and rejects others.
+///
+/// **Security model:** While M2P (Multicast-to-Physical) mappings are stored
+/// per-VNI in the code, the enforcement of DEFAULT_MULTICAST_VNI means all
+/// multicast traffic shares a single namespace across the rack, with no
+/// VPC-level isolation (as multicast groups are fleet-wide).
+pub const DEFAULT_MULTICAST_VNI: u32 = 77u32;
+
 /// Description of Boundary Services, the endpoint used to route traffic
 /// to external networks.
 //
@@ -303,6 +391,44 @@ pub struct PhysNet {
     pub vni: Vni,
 }
 
+/// Represents an IPv6 next hop for multicast forwarding.
+#[derive(
+    Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq, PartialOrd, Ord,
+)]
+pub struct NextHopV6 {
+    /// The IPv6 address of the next hop
+    pub addr: Ipv6Addr,
+    /// The VNI to use for this next hop
+    pub vni: Vni,
+}
+
+impl NextHopV6 {
+    pub fn new(addr: Ipv6Addr, vni: Vni) -> Self {
+        Self { addr, vni }
+    }
+}
+
+/// A next hop for multicast forwarding (supports both IPv4 and IPv6).
+#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
+pub struct NextHop {
+    /// The IP address of the next hop
+    pub addr: IpAddr,
+    /// The VNI to use for this next hop
+    pub vni: Vni,
+}
+
+impl NextHop {
+    pub fn new(addr: IpAddr, vni: Vni) -> Self {
+        Self { addr, vni }
+    }
+}
+
+impl From<NextHopV6> for NextHop {
+    fn from(v6: NextHopV6) -> Self {
+        Self { addr: v6.addr.into(), vni: v6.vni }
+    }
+}
+
 /// A Geneve tunnel endpoint.
 #[derive(Clone, Copy, Debug, Deserialize, Serialize)]
 pub struct TunnelEndpoint {
@@ -366,12 +492,18 @@ impl From<PhysNet> for GuestPhysAddr {
 /// abstraction, it's simply allowing one subnet to talk to another.
 /// There is no separate VPC router process, the real routing is done
 /// by the underlay.
+///
+/// * Multicast: Packets matching this entry are multicast traffic.
+///   Uses the M2P (Multicast-to-Physical) mapping to determine underlay
+///   destinations. Does not apply SNAT; the outer IPv6 underlay source
+///   is the physical IP.
 #[derive(Clone, Debug, Copy, Deserialize, Serialize)]
 pub enum RouterTarget {
     Drop,
     InternetGateway(Option<Uuid>),
     Ip(IpAddr),
     VpcSubnet(IpCidr),
+    Multicast(IpCidr),
 }
 
 #[cfg(any(feature = "std", test))]
@@ -403,6 +535,15 @@ impl FromStr for RouterTarget {
                 cidr6s.parse().map(|x| Self::VpcSubnet(IpCidr::Ip6(x)))
             }
 
+            Some(("mcast4", cidr4s)) => {
+                let cidr4 = cidr4s.parse()?;
+                Ok(Self::Multicast(IpCidr::Ip4(cidr4)))
+            }
+
+            Some(("mcast6", cidr6s)) => {
+                cidr6s.parse().map(|x| Self::Multicast(IpCidr::Ip6(x)))
+            }
+
             Some(("ig", uuid)) => Ok(Self::InternetGateway(Some(
                 uuid.parse::<Uuid>().map_err(|e| e.to_string())?,
             ))),
@@ -423,6 +564,12 @@ impl Display for RouterTarget {
             Self::Ip(IpAddr::Ip6(ip6)) => write!(f, "ip6={ip6}"),
             Self::VpcSubnet(IpCidr::Ip4(sub4)) => write!(f, "sub4={sub4}"),
             Self::VpcSubnet(IpCidr::Ip6(sub6)) => write!(f, "sub6={sub6}"),
+            Self::Multicast(IpCidr::Ip4(mcast4)) => {
+                write!(f, "mcast4={mcast4}")
+            }
+            Self::Multicast(IpCidr::Ip6(mcast6)) => {
+                write!(f, "mcast6={mcast6}")
+            }
         }
     }
 }
@@ -567,6 +714,28 @@ pub struct ClearVirt2PhysReq {
     pub phys: PhysNet,
 }
 
+/// Set mapping from multicast group to underlay multicast address.
+#[derive(Clone, Debug, Deserialize, Serialize)]
+pub struct SetMcast2PhysReq {
+    /// Overlay multicast group address
+    pub group: IpAddr,
+    /// Underlay IPv6 multicast address
+    pub underlay: Ipv6Addr,
+    /// VNI for this mapping
+    pub vni: Vni,
+}
+
+/// Clear a mapping from multicast group to underlay multicast address.
+#[derive(Clone, Debug, Deserialize, Serialize)]
+pub struct ClearMcast2PhysReq {
+    /// Overlay multicast group address
+    pub group: IpAddr,
+    /// Underlay IPv6 multicast address
+    pub underlay: Ipv6Addr,
+    /// VNI for this mapping
+    pub vni: Vni,
+}
+
+/// Set a mapping from a VPC IP to boundary tunnel endpoint destination.
#[derive(Clone, Debug, Deserialize, Serialize)] pub struct SetVirt2BoundaryReq { @@ -607,8 +776,60 @@ pub enum DelRouterEntryResp { NotFound, } +/// Set multicast forwarding entries for a multicast group. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SetMcastForwardingReq { + /// The multicast group address (overlay) + pub group: IpAddr, + /// The next hops (underlay IPv6 addresses) with replication information + pub next_hops: Vec<(NextHopV6, Replication)>, +} + +/// Clear multicast forwarding entries for a multicast group. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct ClearMcastForwardingReq { + /// The multicast group address + pub group: IpAddr, +} + +/// Response for dumping the multicast forwarding table. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct DumpMcastForwardingResp { + /// The multicast forwarding table entries + pub entries: Vec, +} + +impl CmdOk for DumpMcastForwardingResp {} + +/// A single multicast forwarding table entry. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct McastForwardingEntry { + /// The multicast group address (overlay) + pub group: IpAddr, + /// The next hops (underlay IPv6 addresses) with replication information + pub next_hops: Vec<(NextHopV6, Replication)>, +} + impl opte::api::cmd::CmdOk for DelRouterEntryResp {} +/// Subscribe a port to a multicast group. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct McastSubscribeReq { + /// The port name to subscribe + pub port_name: String, + /// The multicast group address + pub group: IpAddr, +} + +/// Unsubscribe a port from a multicast group. 
+#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct McastUnsubscribeReq { + /// The port name to unsubscribe + pub port_name: String, + /// The multicast group address + pub group: IpAddr, +} + #[derive(Clone, Debug, Deserialize, Serialize)] pub struct SetExternalIpsReq { pub port_name: String, diff --git a/lib/oxide-vpc/src/engine/gateway/mod.rs b/lib/oxide-vpc/src/engine/gateway/mod.rs index eb2c3b44..74ff34bc 100644 --- a/lib/oxide-vpc/src/engine/gateway/mod.rs +++ b/lib/oxide-vpc/src/engine/gateway/mod.rs @@ -56,6 +56,8 @@ use opte::api::Direction; use opte::api::OpteError; use opte::engine::ether::EtherMod; use opte::engine::headers::HeaderAction; +use opte::engine::ip::v4::Ipv4Cidr; +use opte::engine::ip::v6::Ipv6Cidr; use opte::engine::layer::DefaultAction; use opte::engine::layer::Layer; use opte::engine::layer::LayerActions; @@ -173,7 +175,7 @@ fn setup_ipv4( let vpc_meta = Arc::new(VpcMeta::new(vpc_mappings)); - let mut nospoof_out = Rule::new(1000, Action::Meta(vpc_meta)); + let mut nospoof_out = Rule::new(1000, Action::Meta(vpc_meta.clone())); nospoof_out.add_predicate(Predicate::InnerSrcIp4(vec![ Ipv4AddrMatch::Exact(ip_cfg.private_ip), ])); @@ -196,6 +198,27 @@ fn setup_ipv4( ])); layer.add_rule(Direction::In, unicast_in.finalize()); + // Multicast prefixes (224.0.0.0/4) + let ipv4_mcast = vec![Ipv4AddrMatch::Prefix(Ipv4Cidr::MCAST)]; + + // Outbound multicast - allow from guest's MAC to multicast destinations + let mut mcast_out = Rule::new(1001, Action::Meta(vpc_meta.clone())); + mcast_out.add_predicate(Predicate::InnerDstIp4(ipv4_mcast.clone())); + mcast_out.add_predicate(Predicate::InnerEtherSrc(vec![ + EtherAddrMatch::Exact(cfg.guest_mac), + ])); + layer.add_rule(Direction::Out, mcast_out.finalize()); + + // Inbound multicast - allow multicast destinations to guest + let mut mcast_in = Rule::new( + 1001, + Action::Static(Arc::new(RewriteSrcMac { + gateway_mac: cfg.gateway_mac, + })), + ); + 
mcast_in.add_predicate(Predicate::InnerDstIp4(ipv4_mcast)); + layer.add_rule(Direction::In, mcast_in.finalize()); + Ok(()) } @@ -209,7 +232,7 @@ fn setup_ipv6( icmpv6::setup(layer, cfg, ip_cfg)?; dhcpv6::setup(layer, cfg, dhcp_cfg)?; let vpc_meta = Arc::new(VpcMeta::new(vpc_mappings)); - let mut nospoof_out = Rule::new(1000, Action::Meta(vpc_meta)); + let mut nospoof_out = Rule::new(1000, Action::Meta(vpc_meta.clone())); nospoof_out.add_predicate(Predicate::InnerSrcIp6(vec![ Ipv6AddrMatch::Exact(ip_cfg.private_ip), ])); @@ -232,6 +255,32 @@ fn setup_ipv6( ])); layer.add_rule(Direction::In, unicast_in.finalize()); + // Admin-/site-/org-scoped multicast prefixes (for underlay forwarding) + let admin_mcast_prefixes = vec![ + Ipv6AddrMatch::Prefix(Ipv6Cidr::MCAST_ADMIN_LOCAL), + Ipv6AddrMatch::Prefix(Ipv6Cidr::MCAST_SITE_LOCAL), + Ipv6AddrMatch::Prefix(Ipv6Cidr::MCAST_ORG_LOCAL), + ]; + + // Outbound multicast - allow from guest's MAC to multicast destinations + let mut mcast_out = Rule::new(1001, Action::Meta(vpc_meta.clone())); + mcast_out + .add_predicate(Predicate::InnerDstIp6(admin_mcast_prefixes.clone())); + mcast_out.add_predicate(Predicate::InnerEtherSrc(vec![ + EtherAddrMatch::Exact(cfg.guest_mac), + ])); + layer.add_rule(Direction::Out, mcast_out.finalize()); + + // Inbound multicast - allow multicast destinations to guest + let mut mcast_in = Rule::new( + 1001, + Action::Static(Arc::new(RewriteSrcMac { + gateway_mac: cfg.gateway_mac, + })), + ); + mcast_in.add_predicate(Predicate::InnerDstIp6(admin_mcast_prefixes)); + layer.add_rule(Direction::In, mcast_in.finalize()); + Ok(()) } diff --git a/lib/oxide-vpc/src/engine/geneve.rs b/lib/oxide-vpc/src/engine/geneve.rs index f22ed8c6..0cb18be6 100644 --- a/lib/oxide-vpc/src/engine/geneve.rs +++ b/lib/oxide-vpc/src/engine/geneve.rs @@ -5,7 +5,72 @@ // Copyright 2025 Oxide Computer Company //! Geneve option types specific to the Oxide VPC dataplane. - +//! +//! # Oxide Geneve Options +//! +//! 
This module defines Geneve options used in the Oxide rack network to carry +//! VPC-specific metadata during packet encapsulation. All options use the Oxide +//! option class (`GENEVE_OPT_CLASS_OXIDE` = 0x0129). +//! +//! ## Option Types +//! +//! - **External** (0x00): Indicates a packet originated from outside the rack +//! and was encapsulated by the switch NAT ingress path with Geneve wrapping. +//! OPTE decapsulates before delivering to the guest. +//! - **Multicast** (0x01): Carries multicast replication strategy as a 2-bit +//! field for coordinating delivery between OPTE and sidecar switch logic. +//! - **Mss** (0x02): Carries original TCP MSS for MSS clamping/boosting to +//! prevent MTU issues during underlay encapsulation. +//! +//! ## Multicast Option Encoding +//! +//! The multicast option uses a compact 2-bit encoding aligned with sidecar.p4's +//! processing constraints: +//! +//! ```text +//! Option body (4 bytes): +//! ┌──────────┬────────────────────────────┐ +//! │ Bits 7-6 │ Bits 5-0 + remaining bytes │ +//! │ (u2) │ (reserved, must be 0) │ +//! └──────────┴────────────────────────────┘ +//! │ +//! └─> Replication mode: +//! 00 = External (local guest delivery) +//! 01 = Underlay (infrastructure forwarding) +//! 10 = All (both External and Underlay) +//! 11 = Reserved +//! ``` +//! +//! ### Replication Semantics +//! +//! - **External**: Packet should be decapsulated and delivered to local guest +//! instances subscribed to this multicast group. Switch sets `nat_egress_hit` +//! to trigger decapsulation before delivery. +//! - **Underlay**: Packet should remain encapsulated and forwarded to underlay +//! infrastructure destinations. +//! - **All**: Bifurcated delivery to both local guests (decapsulated) and +//! underlay destinations (encapsulated). +//! +//! All multicast packets are encapsulated with fleet VNI 77 (`DEFAULT_MULTICAST_VNI`) +//! regardless of replication mode. The replication mode determines delivery behavior, +//! 
not VNI selection. +//! +//! The 2-bit encoding allows efficient extraction in P4 programs without complex +//! parsing, aligning with the sidecar pipeline's tag-based routing decisions. +//! +//! ## Option Length Encoding +//! +//! Geneve has two length fields to consider (both measured in 4-byte words): +//! - Geneve header `opt_len` (6 bits): total size of the options area +//! (sums each option's 4-byte header + body). +//! - Option header `len` (5 bits): size of that option's body only. +//! +//! For Oxide options used here: +//! - External: geneve opt_len += 1; option len = 0 +//! - Multicast: geneve opt_len += 2; option len = 1 +//! - MSS: geneve opt_len += 2; option len = 1 + +use crate::api::Replication; use ingot::geneve::GeneveFlags; use ingot::geneve::GeneveRef; use ingot::geneve::ValidGeneve; @@ -84,28 +149,24 @@ impl<'a> OptionCast<'a> for ValidOxideOption<'a> { } } +/// Geneve multicast option body carrying replication strategy information. +/// +/// This option encodes the replication scope as a 2-bit field in the top two +/// bits of the first byte of the option body. The remaining 30 bits are +/// reserved for future use. The replication strategy determines whether the +/// packet is delivered to local guest instances (External), underlay +/// infrastructure destinations (Underlay), or both (All). #[derive(Debug, Clone, Ingot, Eq, PartialEq)] #[ingot(impl_default)] pub struct MulticastInfo { + /// Replication scope encoded as a u2 (top 2 bits of the first byte). + /// Values map to `Replication::{External, Underlay, All, Reserved}`. #[ingot(is = "u2")] pub version: Replication, + /// Reserved bits (remaining 30 bits of the body). rsvd: u30be, } -#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Hash)] -#[repr(u8)] -pub enum Replication { - /// Replicate packets to ports set for external multicast traffic. - #[default] - External = 0x00, - /// Replicate packets to ports set for underlay multicast traffic. 
-    Underlay,
-    /// Replicate packets to ports set for underlay and external multicast
-    /// traffic (bifurcated).
-    All,
-    Reserved,
-}
-
 impl NetworkRepr<u2> for Replication {
     fn to_network(self) -> u2 {
         self as u8
@@ -118,7 +179,7 @@ impl NetworkRepr<u2> for Replication {
             1 => Replication::Underlay,
             2 => Replication::All,
             3 => Replication::Reserved,
-            _ => panic!("outside bounds of u2"),
+            _ => unreachable!("u2 value out of range: {val}"),
         }
     }
 }
@@ -157,6 +218,33 @@ pub fn validate_options(
     Ok(())
 }
 
+/// Extract multicast replication info from Geneve options.
+/// Returns None if no multicast option is present, or Some(Replication) if found.
+///
+/// Treats Reserved (value 3) as invalid and returns None, implementing fail-closed
+/// behavior without crashing the parser.
+///
+/// Note: This function silently skips options with parse errors (e.g., TooSmall).
+/// Call `validate_options()` first if you want parse errors surfaced instead of
+/// being silently ignored.
+pub fn extract_multicast_replication<V: ByteSlice>(
+    pkt: &ValidGeneve<V>,
+) -> Option<Replication> {
+    for opt in OxideOptions::from_raw(pkt) {
+        let Ok(opt) = opt else { continue };
+        if let Some(ValidOxideOption::Multicast(mc_info)) = opt.option.known() {
+            let repl = mc_info.version();
+            // Filter out Reserved (u2=3). This value exists in the 2-bit space
+            // but is not used by sidecar P4; treat as invalid.
+ if matches!(repl, Replication::Reserved) { + return None; + } + return Some(repl); + } + } + None +} + #[cfg(test)] pub fn valid_geneve_has_oxide_external( pkt: &ValidGeneve, @@ -177,6 +265,7 @@ pub fn valid_geneve_has_oxide_external( #[cfg(test)] mod test { use super::*; + use alloc::vec::Vec; use ingot::types::HeaderParse; use ingot::udp::ValidUdp; @@ -201,7 +290,6 @@ mod test { 0x65, 0x58, // vni + reserved 0x00, 0x04, 0xD2, 0x00, - // option class 0x01, 0x29, // crt + type @@ -219,6 +307,57 @@ mod test { assert!(valid_geneve_has_oxide_external(&geneve)); } + #[test] + fn parse_multicast_replication_values() { + // Build a minimal UDP+Geneve packet with one Oxide multicast option + // Body's first byte top-2 bits carry Replication. + fn build_buf(rep: Replication) -> Vec { + #[rustfmt::skip] + let mut buf = vec![ + // UDP source + 0x1E, 0x61, + // UDP dest + 0x17, 0xC1, + // UDP length (8 UDP hdr + 8 Geneve hdr + 4 opt hdr + 4 opt body = 24 = 0x18) + 0x00, 0x18, + // UDP csum + 0x00, 0x00, + // Geneve: ver + opt len (2 words = 8 bytes: 4 opt hdr + 4 opt body) + 0x02, + // Geneve flags + 0x00, + // Geneve proto + 0x65, 0x58, + // Geneve vni + reserved + 0x00, 0x00, 0x00, 0x00, + // Geneve option: class 0x0129 (Oxide) + 0x01, 0x29, + // Geneve option: flags+type (non-critical, Multicast = 0x01) + 0x01, + // Geneve option: rsvd + len (1 word = 4 bytes body) + 0x01, + ]; + // Geneve option body: 4-byte body with replication in top 2 bits + buf.push((rep as u8) << 6); + buf.extend_from_slice(&[0x00, 0x00, 0x00]); + buf + } + + for (rep, expect) in [ + (Replication::External, Replication::External), + (Replication::Underlay, Replication::Underlay), + (Replication::All, Replication::All), + ] { + let buf = build_buf(rep); + let (.., rem) = ValidUdp::parse(&buf[..]).unwrap(); + let (geneve, ..) 
= ValidGeneve::parse(rem).unwrap(); + validate_options(&geneve).unwrap(); + + let got = extract_multicast_replication(&geneve).unwrap(); + assert_eq!(got, expect); + } + } + #[test] fn unknown_crit_option_fails() { // Create a packet with one extension header with the critical @@ -242,7 +381,6 @@ mod test { 0x65, 0x58, // vni + reserved 0x00, 0x04, 0xD2, 0x00, - // experimenter option class 0xff, 0xff, // crt + type @@ -281,7 +419,6 @@ mod test { 0x65, 0x58, // vni + reserved 0x00, 0x04, 0xD2, 0x00, - // experimenter option class 0x01, 0x29, // crt + type @@ -314,8 +451,8 @@ mod test { 0x1E, 0x61, // dest 0x17, 0xC1, - // length - 0x00, 0x1c, + // length (8 UDP hdr + 8 Geneve hdr + 20 options = 36 = 0x24) + 0x00, 0x24, // csum 0x00, 0x00, // ver + opt len @@ -326,14 +463,12 @@ mod test { 0x65, 0x58, // vni + reserved 0x00, 0x04, 0xD2, 0x00, - // option class 0x01, 0x29, // crt + type 0x00, // rsvd + len 0x00, - // experimenter option class 0xff, 0xff, // crt + type @@ -342,7 +477,6 @@ mod test { 0x01, // body 0x00, 0x00, 0x00, 0x00, - // experimenter option class 0xff, 0xff, // crt + type diff --git a/lib/oxide-vpc/src/engine/overlay.rs b/lib/oxide-vpc/src/engine/overlay.rs index b4df96b1..111ccdf9 100644 --- a/lib/oxide-vpc/src/engine/overlay.rs +++ b/lib/oxide-vpc/src/engine/overlay.rs @@ -9,10 +9,12 @@ //! This implements the Oxide Network VPC Overlay. 
use super::geneve::OxideOptions; use super::router::RouterTargetInternal; +use crate::api::DEFAULT_MULTICAST_VNI; use crate::api::DumpVirt2BoundaryResp; use crate::api::DumpVirt2PhysResp; use crate::api::GuestPhysAddr; use crate::api::PhysNet; +use crate::api::Replication; use crate::api::TunnelEndpoint; use crate::api::V2bMapResp; use crate::api::VpcMapResp; @@ -81,6 +83,7 @@ pub fn setup( pb: &PortBuilder, cfg: &VpcCfg, vni_state: Arc, + vpc_map: Arc, v2b: Arc, ft_limit: core::num::NonZeroU32, ) -> core::result::Result<(), OpteError> { @@ -89,6 +92,7 @@ pub fn setup( cfg.phys_ip, cfg.vni, vni_state, + vpc_map, v2b, ))); @@ -183,6 +187,7 @@ pub struct EncapAction { phys_ip_src: Ipv6Addr, vni: Vni, vni_state: Arc, + vpc_map: Arc, v2b: Arc, } @@ -191,9 +196,10 @@ impl EncapAction { phys_ip_src: Ipv6Addr, vni: Vni, vni_state: Arc, + vpc_map: Arc, v2b: Arc, ) -> Self { - Self { phys_ip_src, vni, vni_state, v2b } + Self { phys_ip_src, vni, vni_state, vpc_map, v2b } } } @@ -241,55 +247,56 @@ impl StaticAction for EncapAction { } }; + // Map the router target to a physical network location. + // The router layer has already made the routing decision - we just + // execute it here by looking up the appropriate physical mapping. + let dst_ip = flow_id.dst_ip(); let (is_internal, phys_target, is_mcast) = match target { RouterTargetInternal::InternetGateway(_) => { - // TODO: Is landing mcast traffic in here right? My intuition says - // so atm, given that the address will be outside of the individual - // VPC subnets, and mcast send will apply outbound NAT (and we expect - // such frames could well leave the rack)! - // This may need a new RouterTargetInternal? And/or thought about the - // interaction w/ routers? - let dst_ip = flow_id.dst_ip(); - if dst_ip.is_multicast() { - match self.vni_state.m2p.get(&dst_ip) { - Some(phys) => ( - true, + match self.v2b.get(&dst_ip) { + Some(phys) => { + // Hash the packet onto a route target. This is a very + // rudimentary mechanism. 
Should level-up to an ECMP + // algorithm with well known statistical properties. + let hash = f_hash as usize; + let target = match phys.iter().nth(hash % phys.len()) { + Some(target) => target, + None => return Ok(AllowOrDeny::Deny), + }; + ( + false, PhysNet { - ether: phys.dest_mac(), - ip: phys.0, - vni: self.vni, + ether: MacAddr::from(TUNNEL_ENDPOINT_MAC), + ip: target.ip, + vni: target.vni, }, - true, - ), - - // Landing here implies we don't yet have an internal forwarding - // address for this multicast group, or this VNI does not have - // access to it. - None => return Ok(AllowOrDeny::Deny), + false, + ) } - } else { - match self.v2b.get(&dst_ip) { - Some(phys) => { - // Hash the packet onto a route target. This is a very - // rudimentary mechanism. Should level-up to an ECMP - // algorithm with well known statistical properties. - let hash = f_hash as usize; - let target = - match phys.iter().nth(hash % phys.len()) { - Some(target) => target, - None => return Ok(AllowOrDeny::Deny), - }; - ( - false, - PhysNet { - ether: MacAddr::from(TUNNEL_ENDPOINT_MAC), - ip: target.ip, - vni: target.vni, - }, - false, - ) - } - None => return Ok(AllowOrDeny::Deny), + None => return Ok(AllowOrDeny::Deny), + } + } + + // Multicast target - use M2P mapping to get the multicast underlay address. + // The router has determined this packet should be multicast forwarded. + RouterTargetInternal::Multicast(_) => { + // Fleet-level multicast mappings live under DEFAULT_MULTICAST_VNI. + // Look up the underlay multicast IPv6 for this group using the + // global VPC mappings and encapsulate with the fleet multicast VNI. + let mvni = Vni::new(DEFAULT_MULTICAST_VNI).unwrap(); + match self.vpc_map.get_mcast_underlay(mvni, dst_ip) { + Some(underlay) => ( + true, + PhysNet { + ether: underlay.dst_mac(), + ip: underlay.0, + vni: mvni, + }, + true, + ), + None => { + // No mapping configured for this group; deny. 
+ return Ok(AllowOrDeny::Deny); } } } @@ -364,7 +371,18 @@ impl StaticAction for EncapAction { data: Cow::Borrowed(GENEVE_MSS_SIZE_OPT_BODY), }; - static GENEVE_MCAST_OPT_BODY: &[u8] = &[0; size_of::()]; + // For multicast originated from this host, we set External replication. + // The actual replication scope will be determined by the mcast_fwd table. + // The first byte encodes Replication in the top 2 bits: + // External=0x00, Underlay=0x40, All=0x80, Reserved=0xC0 + const REPLICATION_EXTERNAL_BYTE: u8 = + (Replication::External as u8) << 6; + static GENEVE_MCAST_OPT_BODY: &[u8] = &[ + REPLICATION_EXTERNAL_BYTE, // Top 2 bits encode replication strategy + 0x00, + 0x00, + 0x00, // Reserved bytes + ]; static GENEVE_MCAST_OPT: ArbitraryGeneveOption = ArbitraryGeneveOption { option_class: GENEVE_OPT_CLASS_OXIDE, @@ -387,14 +405,20 @@ impl StaticAction for EncapAction { }) .expect("Ethernet validation is infallible"), ), - outer_ip: HeaderAction::Push(Valid::validated(IpPush::from( - Ipv6Push { + outer_ip: HeaderAction::Push({ + let ip_push = IpPush::from(Ipv6Push { src: self.phys_ip_src, dst: phys_target.ip, proto: Protocol::UDP, exts: Cow::Borrowed(&[]), - }, - ))?), + }); + match Valid::validated(ip_push) { + Ok(v) => v, + Err(e) => { + return Err(e.into()); + } + } + }), // XXX Geneve uses the UDP source port as a flow label // value for the purposes of ECMP -- a hash of the // 5-tuple. However, when using Geneve in IPv6 one could @@ -437,7 +461,9 @@ impl StaticAction for EncapAction { &GENEVE_MCAST_OPT, )), (false, false) => Cow::Borrowed(&[]), - // TCP is not exactly multicast compatible. + // We do not support TCP over multicast delivery. + // Multicast replication semantics conflict with TCP's + // connection/ordering guarantees, so deny this case. (true, true) => { return Ok(AllowOrDeny::Deny); } @@ -548,6 +574,11 @@ pub struct VpcMappings { } impl VpcMappings { + /// Generate a new mapping struct. 
+ pub fn new() -> Self { + Self { inner: KMutex::new(BTreeMap::new()) } + } + /// Add a new mapping from VIP to [`PhysNet`], returning a pointer /// to the [`Virt2Phys`] this mapping belongs to. pub fn add(&self, vip: IpAddr, phys: PhysNet) -> Arc { @@ -611,8 +642,62 @@ impl VpcMappings { None } - pub fn new() -> Self { - VpcMappings { inner: KMutex::new(BTreeMap::new()) } + /// Add a multicast forwarding entry from a multicast group IP to a physical + /// underlay IP. + /// + /// Returns an error if: + /// - The VNI is not DEFAULT_MULTICAST_VNI + /// - The underlay address is not a valid IPv6 multicast address + pub fn add_mcast( + &self, + group: IpAddr, + underlay: Ipv6Addr, + vni: Vni, + ) -> Result, OpteError> { + // Validate VNI is DEFAULT_MULTICAST_VNI for fleet-level multicast + if vni.as_u32() != DEFAULT_MULTICAST_VNI { + return Err(OpteError::System { + errno: illumos_sys_hdrs::EINVAL, + msg: format!( + "multicast VNI must be DEFAULT_MULTICAST_VNI ({DEFAULT_MULTICAST_VNI}), got: {}", + vni.as_u32() + ), + }); + } + + let mut lock = self.inner.lock(); + let state = lock.entry(vni).or_default(); + + let mcast_underlay = MulticastUnderlay::new(underlay).ok_or_else(|| { + OpteError::InvalidUnderlayMulticast(format!( + "underlay address must be an administratively-scoped multicast address \ + (scope 0x4/admin-local, 0x5/site-local, or 0x8/organization-local): {underlay}", + )) + })?; + + state.m2p.set(group, mcast_underlay); + Ok(state.clone()) + } + + /// Delete a multicast forwarding entry. + pub fn del_mcast(&self, group: IpAddr, _underlay: Ipv6Addr, vni: Vni) { + let mut lock = self.inner.lock(); + if let Some(state) = lock.get_mut(&vni) { + state.m2p.remove(&group); + } + } + + /// Get the underlay multicast for a given VNI and overlay multicast group. 
+ pub fn get_mcast_underlay( + &self, + vni: Vni, + group: IpAddr, + ) -> Option { + let lock = self.inner.lock(); + lock.get(&vni).and_then(|state| match group { + IpAddr::Ip4(ip4) => state.m2p.ip4.lock().get(&ip4).copied(), + IpAddr::Ip6(ip6) => state.m2p.ip6.lock().get(&ip6).copied(), + }) } } @@ -664,15 +749,23 @@ pub struct Virt2Boundary { pt6: KRwLock>>, } -// XXX Isn't this really just a V2P mapping, without a guest MAC? +// NOTE: This is structurally similar to V2P mapping, but maps to MulticastUnderlay +// which wraps only an IPv6 address. The destination MAC is derived algorithmically +// from the IPv6 multicast address rather than stored explicitly. /// A mapping from inner multicast destination IPs to underlay multicast groups. +/// +/// Validation is enforced through the `MulticastUnderlay` newtype wrapper, which +/// ensures only valid IPv6 multicast addresses can be stored. pub struct Mcast2Phys { - // XXX In theory this is vulnerable to the same concerns around validation - // as `Virt2Phys`. ip4: KMutex>, ip6: KMutex>, } +/// Per-VNI mapping state containing both unicast and multicast address mappings. +/// +/// This struct holds all address-to-physical mappings organized by VNI: +/// - `v2p`: Unicast virtual IPs to physical locations +/// - `m2p`: Multicast group IPs to physical underlay addresses #[derive(Default)] pub struct PerVniMaps { pub v2p: Virt2Phys, @@ -903,6 +996,7 @@ impl MappingResource for Virt2Phys { } impl Mcast2Phys { + /// Create a new empty multicast-to-physical mapping table. pub fn new() -> Self { Self { ip4: KMutex::new(BTreeMap::new()), @@ -910,10 +1004,12 @@ impl Mcast2Phys { } } + /// Dump all IPv4 overlay multicast group to underlay IPv6 multicast mappings. pub fn dump_ip4(&self) -> Vec<(Ipv4Addr, Ipv6Addr)> { self.ip4.lock().iter().map(|(vip, mcast)| (*vip, mcast.0)).collect() } + /// Dump all IPv6 overlay multicast group to underlay IPv6 multicast mappings. 
pub fn dump_ip6(&self) -> Vec<(Ipv6Addr, Ipv6Addr)> { self.ip6.lock().iter().map(|(vip, mcast)| (*vip, mcast.0)).collect() } @@ -925,15 +1021,32 @@ impl Default for Mcast2Phys { } } +/// An overlay multicast group address mapped to the underlay (outer) IPv6 multicast address. +/// +/// This type ensures that the wrapped IPv6 address is a valid multicast address +/// with administrative scope (admin-local, site-local, or organization-local). +/// +/// Administrative scopes per RFC 4291 and RFC 7346: +/// - `0x4`: admin-local scope +/// - `0x5`: site-local scope +/// - `0x8`: organization-local scope #[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)] pub struct MulticastUnderlay(Ipv6Addr); impl MulticastUnderlay { + /// Create a new `MulticastUnderlay` if the address is a valid + /// administratively-scoped multicast IPv6 address (scope 0x4, 0x5, or 0x8). pub fn new(addr: Ipv6Addr) -> Option { - if addr.is_multicast() { Some(Self(addr)) } else { None } + if addr.is_admin_scoped_multicast() { Some(Self(addr)) } else { None } + } + + /// Return the underlying IPv6 multicast address. + pub fn addr(&self) -> Ipv6Addr { + self.0 } - fn dest_mac(&self) -> MacAddr { + /// Return the destination MAC address derived from the IPv6 multicast address. 
+ fn dst_mac(&self) -> MacAddr { self.0.unchecked_multicast_mac() } } diff --git a/lib/oxide-vpc/src/engine/router.rs b/lib/oxide-vpc/src/engine/router.rs index cabe96e5..11263c63 100644 --- a/lib/oxide-vpc/src/engine/router.rs +++ b/lib/oxide-vpc/src/engine/router.rs @@ -65,6 +65,7 @@ pub enum RouterTargetInternal { InternetGateway(Option), Ip(IpAddr), VpcSubnet(IpCidr), + Multicast(IpCidr), } impl RouterTargetInternal { @@ -86,6 +87,7 @@ impl RouterTargetInternal { } RouterTargetInternal::Ip(_) => RouterTargetClass::Ip, RouterTargetInternal::VpcSubnet(_) => RouterTargetClass::VpcSubnet, + RouterTargetInternal::Multicast(_) => RouterTargetClass::Multicast, } } } @@ -117,6 +119,16 @@ impl ActionMetaValue for RouterTargetInternal { Ok(Self::VpcSubnet(IpCidr::Ip6(cidr6))) } + Some(("mcast4", cidr4_s)) => { + let cidr4 = cidr4_s.parse::()?; + Ok(Self::Multicast(IpCidr::Ip4(cidr4))) + } + + Some(("mcast6", cidr6_s)) => { + let cidr6 = cidr6_s.parse::()?; + Ok(Self::Multicast(IpCidr::Ip6(cidr6))) + } + Some(("ig", ig)) => { let ig = ig.parse::().map_err(|e| e.to_string())?; Ok(Self::InternetGateway(Some(ig))) @@ -141,6 +153,12 @@ impl ActionMetaValue for RouterTargetInternal { Self::VpcSubnet(IpCidr::Ip6(cidr6)) => { format!("sub6={cidr6}").into() } + Self::Multicast(IpCidr::Ip4(mcast4)) => { + format!("mcast4={mcast4}").into() + } + Self::Multicast(IpCidr::Ip6(mcast6)) => { + format!("mcast6={mcast6}").into() + } } } } @@ -151,6 +169,7 @@ impl fmt::Display for RouterTargetInternal { Self::InternetGateway(addr) => format!("IG({addr:?})"), Self::Ip(addr) => format!("IP: {addr}"), Self::VpcSubnet(sub) => format!("Subnet: {sub}"), + Self::Multicast(mcast) => format!("Multicast: {mcast}"), }; write!(f, "{s}") } @@ -161,6 +180,7 @@ pub enum RouterTargetClass { InternetGateway, Ip, VpcSubnet, + Multicast, } impl ActionMetaValue for RouterTargetClass { @@ -171,6 +191,7 @@ impl ActionMetaValue for RouterTargetClass { "ig" => Ok(Self::InternetGateway), "ip" => Ok(Self::Ip), 
"subnet" => Ok(Self::VpcSubnet), + "mcast" => Ok(Self::Multicast), _ => Err(format!("bad router target class: {s}")), } } @@ -180,6 +201,7 @@ impl ActionMetaValue for RouterTargetClass { Self::InternetGateway => "ig".into(), Self::Ip => "ip".into(), Self::VpcSubnet => "subnet".into(), + Self::Multicast => "mcast".into(), } } } @@ -190,6 +212,7 @@ impl fmt::Display for RouterTargetClass { Self::InternetGateway => write!(f, "IG"), Self::Ip => write!(f, "IP"), Self::VpcSubnet => write!(f, "Subnet"), + Self::Multicast => write!(f, "Multicast"), } } } @@ -278,6 +301,8 @@ fn valid_router_dest_target_pair(dest: &IpCidr, target: &RouterTarget) -> bool { (_, RouterTarget::Drop) | // Internet gateways are valid for any IP family. (_, RouterTarget::InternetGateway(_)) | + // Multicast targets are valid for any IP family + (_, RouterTarget::Multicast(_)) | // IPv4 destination, IPv4 address (IpCidr::Ip4(_), RouterTarget::Ip(IpAddr::Ip4(_))) | // IPv4 destination, IPv4 subnet @@ -362,6 +387,22 @@ fn make_rule( ))); (predicate, action) } + + RouterTarget::Multicast(mcast) => { + let predicate = match dest { + IpCidr::Ip4(ip4) => { + Predicate::InnerDstIp4(vec![Ipv4AddrMatch::Prefix(ip4)]) + } + + IpCidr::Ip6(ip6) => { + Predicate::InnerDstIp6(vec![Ipv6AddrMatch::Prefix(ip6)]) + } + }; + let action = Action::Meta(Arc::new(RouterAction::new( + RouterTargetInternal::Multicast(mcast), + ))); + (predicate, action) + } }; let priority = compute_rule_priority(&dest, class); diff --git a/lib/oxide-vpc/src/print.rs b/lib/oxide-vpc/src/print.rs index c6a46ef3..f69a8b4c 100644 --- a/lib/oxide-vpc/src/print.rs +++ b/lib/oxide-vpc/src/print.rs @@ -9,6 +9,7 @@ //! This is mostly just a place to hang printing routines so that they //! can be used by both opteadm and integration tests. 
+use crate::api::DumpMcastForwardingResp; use crate::api::DumpVirt2BoundaryResp; use crate::api::DumpVirt2PhysResp; use crate::api::GuestPhysAddr; @@ -135,3 +136,38 @@ fn print_v2p_ip6( std::net::Ipv6Addr::from(phys.ip.bytes()), ) } + +/// Print the header for the [`print_mcast_fwd()`] output. +fn print_mcast_fwd_header(t: &mut impl Write) -> std::io::Result<()> { + writeln!(t, "GROUP IP\tUNDERLAY IP\tVNI\tREPLICATION") +} + +/// Print a [`DumpMcastForwardingResp`]. +pub fn print_mcast_fwd(resp: &DumpMcastForwardingResp) -> std::io::Result<()> { + print_mcast_fwd_into(&mut std::io::stdout(), resp) +} + +/// Print a [`DumpMcastForwardingResp`] into a given writer. +pub fn print_mcast_fwd_into( + writer: &mut impl Write, + resp: &DumpMcastForwardingResp, +) -> std::io::Result<()> { + let mut t = TabWriter::new(writer); + writeln!(t, "Multicast Forwarding Table")?; + write_hrb(&mut t)?; + writeln!(t)?; + print_mcast_fwd_header(&mut t)?; + write_hr(&mut t)?; + + for entry in &resp.entries { + for (next_hop, replication) in &entry.next_hops { + writeln!( + t, + "{}\t{}\t{}\t{replication:?}", + entry.group, next_hop.addr, next_hop.vni + )?; + } + } + writeln!(t)?; + t.flush() +} diff --git a/lib/oxide-vpc/tests/integration_tests.rs b/lib/oxide-vpc/tests/integration_tests.rs index fe3454d6..57a1d541 100644 --- a/lib/oxide-vpc/tests/integration_tests.rs +++ b/lib/oxide-vpc/tests/integration_tests.rs @@ -36,6 +36,7 @@ use opte::engine::ip::v4::Ipv4Addr; use opte::engine::ip::v4::Ipv4Ref; use opte::engine::ip::v4::ValidIpv4; use opte::engine::ip::v6::Ipv6; +use opte::engine::ip::v6::Ipv6Addr; use opte::engine::ip::v6::Ipv6Ref; use opte::engine::ip::v6::ValidIpv6; use opte::engine::packet::InnerFlowId; @@ -43,10 +44,14 @@ use opte::engine::packet::MblkFullParsed; use opte::engine::packet::MismatchError; use opte::engine::packet::Packet; use opte::engine::parse::ValidUlp; +use opte::engine::port::DropReason; use opte::engine::port::ProcessError; +use 
opte::engine::port::ProcessResult; use opte::engine::tcp::TIME_WAIT_EXPIRE_SECS; +use opte::ingot::ethernet::Ethertype; use opte::ingot::geneve::GeneveRef; use opte::ingot::icmp::IcmpV6Ref; +use opte::ingot::ip::IpProtocol; use opte::ingot::tcp::TcpRef; use opte::ingot::types::Emit; use opte::ingot::types::HeaderLen; @@ -59,6 +64,7 @@ use oxide_vpc::api::ExternalIpCfg; use oxide_vpc::api::FirewallRule; use oxide_vpc::api::RouterClass; use oxide_vpc::api::VpcCfg; +use oxide_vpc::engine::geneve; use pcap::*; use smoltcp::phy::ChecksumCapabilities as CsumCapab; use smoltcp::wire::Icmpv4Packet; @@ -4678,7 +4684,7 @@ fn icmp_inner_has_nat_applied() { header: smoltcp::wire::Ipv4Repr { src_addr: remote_addr.into(), dst_addr: g1_cfg.ipv4().private_ip.into(), - next_header: IpProtocol::Udp, + next_header: smoltcp::wire::IpProtocol::Udp, payload_len: 256, hop_limit: 0, }, @@ -4747,7 +4753,7 @@ fn icmpv6_inner_has_nat_applied() { header: smoltcp::wire::Ipv6Repr { src_addr: eph_ip.into(), dst_addr: remote_addr.into(), - next_header: IpProtocol::Udp, + next_header: smoltcp::wire::IpProtocol::Udp, // Unimportant -- header is truncated. payload_len: 256, hop_limit: 255, @@ -4811,3 +4817,338 @@ fn icmpv6_inner_has_nat_applied() { let (v6, ..) 
= ValidIpv6::parse(body).unwrap(); assert_eq!(v6.source(), g1_cfg.ipv6().private_ip); } + +// Test that IPv6 multicast packets get encapsulated with Geneve +#[test] +fn test_ipv6_multicast_encapsulation() { + let g1_cfg = g1_cfg(); + let mut g1 = oxide_net_setup("g1_port", &g1_cfg, None, None); + + // Create an IPv6 multicast packet (ff04::1:3 - admin-local multicast) + let mcast_dst = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x03, + ]); + + // Create a multicast underlay address (must be multicast for forwarding) + let mcast_underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0xff, 0xff, + ]); + + // Add multicast forwarding entry BEFORE starting the port + let mcast_vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI).unwrap(); + g1.vpc_map.add_mcast(mcast_dst.into(), mcast_underlay, mcast_vni).unwrap(); + + g1.port.start(); + set!(g1, "port_state=running"); + + // Add router entry for IPv6 multicast traffic (ff00::/8) via Multicast target + router::add_entry( + &g1.port, + IpCidr::Ip6("ff00::/8".parse().unwrap()), + RouterTarget::Multicast(IpCidr::Ip6("ff00::/8".parse().unwrap())), + RouterClass::System, + ) + .unwrap(); + incr!(g1, ["epoch", "router.rules.out"]); + + // Build a UDP packet to the multicast address + // (TCP + multicast is incompatible and would be denied) + let eth = Ethernet { + destination: MacAddr::from([0x33, 0x33, 0x00, 0x01, 0x00, 0x03]), + source: g1_cfg.guest_mac, + ethertype: Ethertype::IPV6, + }; + let ip = Ipv6 { + source: g1_cfg.ipv6().private_ip, + destination: mcast_dst, + next_header: IpProtocol::UDP, + payload_len: (Udp::MINIMUM_LENGTH) as u16, + hop_limit: 64, + ..Default::default() + }; + let udp = Udp { + source: 12345, + destination: 5353, // mDNS port as an example multicast UDP service + length: Udp::MINIMUM_LENGTH as u16, + ..Default::default() + }; + let mut pkt_m = ulp_pkt(eth, ip, udp, 
&[]); + + let pkt = parse_outbound(&mut pkt_m, GenericUlp {}).unwrap(); + let res = g1.port.process(Out, pkt); + + // Verify packet was encapsulated + let Ok(Modified(spec)) = res else { + panic!("Expected Modified result, got {res:?}"); + }; + let mut pkt_m = spec.apply(pkt_m); + + // Parse the encapsulated packet as inbound (it's now on the wire with Geneve) + let parsed = Packet::parse_inbound(pkt_m.iter_mut(), VpcParser {}).unwrap(); + let meta = parsed.meta(); + + // Verify the outer IPv6 destination is the multicast underlay address + assert_eq!( + meta.outer_v6.destination(), + mcast_underlay, + "Outer IPv6 destination should be multicast underlay address" + ); + + // Verify the outer IPv6 source is the physical IP of the guest + assert_eq!( + meta.outer_v6.source(), + g1_cfg.phys_ip, + "Outer IPv6 source should be the physical IP" + ); + + // Verify the outer Ethernet destination MAC is the IPv6 multicast MAC + // For IPv6 multicast, MAC is 33:33:xx:xx:xx:xx where xx:xx:xx:xx are the last 4 bytes of the IPv6 address + let expected_outer_mac = mcast_underlay.multicast_mac().unwrap(); + assert_eq!( + meta.outer_eth.destination(), + expected_outer_mac, + "Outer Ethernet MAC should be IPv6 multicast MAC" + ); + + // Verify we have Geneve encapsulation with the correct VNI (fleet multicast VNI) + assert_eq!( + meta.outer_encap.vni(), + mcast_vni, + "Geneve VNI should match DEFAULT_MULTICAST_VNI" + ); + + // Verify the Geneve multicast option is present with External replication + let replication = geneve::extract_multicast_replication(&meta.outer_encap) + .expect("Geneve packet should have multicast option"); + assert_eq!( + replication, + oxide_vpc::api::Replication::External, + "Multicast option should have External replication" + ); +} + +// Test that TCP + multicast packets are denied (TCP is incompatible with multicast) +#[test] +fn test_tcp_multicast_denied() { + let g1_cfg = g1_cfg(); + let mut g1 = oxide_net_setup("g1_port", &g1_cfg, None, None); + + // 
Create an IPv6 multicast address + let mcast_dst = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x03, + ]); + + let mcast_underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0xff, 0xff, + ]); + + let mcast_vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI).unwrap(); + g1.vpc_map.add_mcast(mcast_dst.into(), mcast_underlay, mcast_vni).unwrap(); + + g1.port.start(); + set!(g1, "port_state=running"); + + router::add_entry( + &g1.port, + IpCidr::Ip6("ff00::/8".parse().unwrap()), + RouterTarget::Multicast(IpCidr::Ip6("ff00::/8".parse().unwrap())), + RouterClass::System, + ) + .unwrap(); + incr!(g1, ["epoch", "router.rules.out"]); + + // Build a TCP packet to the multicast address (should be denied) + let mut pkt_m = http_syn3( + g1_cfg.guest_mac, + g1_cfg.ipv6().private_ip, + MacAddr::from([0x33, 0x33, 0x00, 0x01, 0x00, 0x03]), + mcast_dst, + 12345, + 80, + ); + + let pkt = parse_outbound(&mut pkt_m, GenericUlp {}).unwrap(); + let res = g1.port.process(Out, pkt); + + // Verify packet was denied (TCP + multicast is incompatible) + match res { + Ok(Hairpin(_)) => panic!("Expected packet to be denied, got Hairpin"), + Ok(Modified(_)) => panic!("Expected packet to be denied, got Modified"), + Ok(ProcessResult::Drop { reason: DropReason::Layer { .. } }) => { + // Expected - TCP + multicast is denied by overlay layer + } + other => panic!("Expected Drop with Layer reason, got: {:?}", other), + } +} + +// Ensure packets with unknown critical Geneve options are rejected during +// option validation (fail-closed on unrecognised critical options). +#[test] +fn test_drop_on_unknown_critical_option() { + // Build Ethernet + IPv6 (with no extensions) + UDP + Geneve header + // carrying a single unknown critical option (class=0xffff, type=0x80, len=0). + // Minimal inner Ethernet + IPv4 + UDP follows to satisfy the parser. 
+ let mut buf: Vec = Vec::new(); + + // Ethernet (14B) + buf.extend_from_slice(&[ + 0x33, 0x33, 0x00, 0x00, 0x00, 0x01, // dst + 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, // src + 0x86, 0xdd, // ethertype IPv6 + ]); + + // IPv6 header (40B) + // ver/tc/fl, payload_len, next_header=UDP(17), hop_limit + // payload_len = UDP length (we'll compute) + let ip6_hdr_pos = buf.len(); + buf.extend_from_slice(&[ + 0x60, 0x00, 0x00, 0x00, // ver+tc+fl + 0x00, 0x00, // payload length (placeholder) + 0x11, // next header UDP + 0x40, // hop limit + // src + 0xfd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, // dst + 0xff, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x03, + ]); + + // UDP header (8B) + let udp_pos = buf.len(); + buf.extend_from_slice(&[ + 0x1e, 0x61, // source port + 0x17, 0xc1, // dest 6081 + 0x00, 0x00, // length (placeholder) + 0x00, 0x00, // checksum + ]); + + // Geneve header (8B): ver+optlen=1 (4B option header), flags=critical opts + buf.extend_from_slice(&[ + 0x01, // ver=0, optlen=1 word (4B option header) + 0x40, // flags: critical options present + 0x65, 0x58, // protocol type 0x6558 + 0x00, 0x00, 0x00, 0x00, // VNI=0, reserved + ]); + // Unknown critical option: class=0xffff, type=0x80 (critical), len=0 + buf.extend_from_slice(&[ + 0xff, 0xff, // class + 0x80, // critical + type + 0x00, // rsvd+len=0 + ]); + // No body (len=0) + + // Minimal inner Ethernet + IPv4 + UDP (to satisfy inner parse) + buf.extend_from_slice(&[ + // inner Ethernet + 0x00, 0x16, 0x3e, 0x00, 0x00, 0x02, 0x00, 0x16, 0x3e, 0x00, 0x00, 0x01, + 0x08, 0x00, // IPv4 + // inner IPv4 (20B) + 0x45, 0x00, 0x00, 0x1c, 0x00, 0x01, 0x00, 0x00, 0x11, 0x00, 0x0a, 0x00, + 0x00, 0x01, 0x0a, 0x00, 0x00, 0x02, // src=10.0.0.1, dst=10.0.0.2 + // inner UDP (8B) + 0x12, 0x34, 0x13, 0x37, 0x00, 0x08, 0x00, 0x00, + ]); + + // Compute UDP length and IPv6 payload length + let udp_len = (buf.len() - udp_pos) as u16; + 
buf[udp_pos + 4] = (udp_len >> 8) as u8; + buf[udp_pos + 5] = (udp_len & 0xff) as u8; + + let ip6_payload_len = (buf.len() - (ip6_hdr_pos + 40)) as u16; + buf[ip6_hdr_pos + 4] = (ip6_payload_len >> 8) as u8; + buf[ip6_hdr_pos + 5] = (ip6_payload_len & 0xff) as u8; + + // Parse Geneve directly from the UDP payload (skip L2/L3) and validate options + let geneve_offset = 14 /*eth*/ + 40 /*ipv6*/ + 8 /*udp*/; + let (geneve, _, _) = + opte::ingot::geneve::ValidGeneve::parse(&buf[geneve_offset..]) + .expect("parse geneve header"); + assert!(matches!( + geneve::validate_options(&geneve), + Err(opte::engine::packet::ParseError::UnrecognisedTunnelOpt { .. }) + )); +} + +// Ensure Geneve parsing works correctly when an IPv6 extension header is present +// before UDP (e.g., Hop-by-Hop). Verifies that option walking is positioned at +// the correct Geneve offset. +#[test] +fn test_v6_ext_hdr_geneve_offset_ok() { + let mut buf: Vec = Vec::new(); + + // Ethernet + buf.extend_from_slice(&[ + 0x33, 0x33, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, + 0x86, 0xdd, + ]); + + // IPv6 header (Next Header = Hop-by-Hop (0)) + let ip6_hdr_pos = buf.len(); + buf.extend_from_slice(&[ + 0x60, 0x00, 0x00, 0x00, 0x00, + 0x00, // payload length (placeholder) + 0x00, // next header: Hop-by-Hop + 0x40, // hop limit + // src + 0xfd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, // dst + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0xff, 0xff, + ]); + + // Hop-by-Hop extension header (8B) -> next header UDP (17), hdr ext len=0 + buf.extend_from_slice(&[0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]); + + // UDP header (8B) + let udp_pos = buf.len(); + buf.extend_from_slice(&[ + 0x1e, 0x61, // source + 0x17, 0xc1, // dest 6081 + 0x00, 0x00, // length (placeholder) + 0x00, 0x00, // checksum + ]); + + // Geneve header (8B): ver+optlen=2 (8B option area), flags=0 + buf.extend_from_slice(&[0x02, 0x00, 
0x65, 0x58, 0x00, 0x00, 0x00, 0x00]); + // Multicast option: class=0x0129, type=0x01, len=1; body=4B with External + buf.extend_from_slice(&[ + 0x01, + 0x29, + 0x01, + 0x01, // class, type, rsvd+len + (oxide_vpc::api::Replication::External as u8) << 6, + 0x00, + 0x00, + 0x00, + ]); + + // Minimal inner Ethernet + IPv4 + UDP + buf.extend_from_slice(&[ + 0x00, 0x16, 0x3e, 0x00, 0x00, 0x02, 0x00, 0x16, 0x3e, 0x00, 0x00, 0x01, + 0x08, 0x00, 0x45, 0x00, 0x00, 0x1c, 0x00, 0x01, 0x00, 0x00, 0x11, 0x00, + 0x0a, 0x00, 0x00, 0x01, 0x0a, 0x00, 0x00, 0x02, 0x12, 0x34, 0x13, 0x37, + 0x00, 0x08, 0x00, 0x00, + ]); + + // Set UDP and IPv6 payload lengths + let udp_len = (buf.len() - udp_pos) as u16; + buf[udp_pos + 4] = (udp_len >> 8) as u8; + buf[udp_pos + 5] = (udp_len & 0xff) as u8; + + let ip6_payload_len = (buf.len() - (ip6_hdr_pos + 40)) as u16; + buf[ip6_hdr_pos + 4] = (ip6_payload_len >> 8) as u8; + buf[ip6_hdr_pos + 5] = (ip6_payload_len & 0xff) as u8; + + // Parse Geneve directly after IPv6 ext header and UDP, then check multicast option + let geneve_offset = 14 /*eth*/ + 40 /*ipv6*/ + 8 /*hop-by-hop*/ + 8 /*udp*/; + let (geneve, _, _) = + opte::ingot::geneve::ValidGeneve::parse(&buf[geneve_offset..]) + .expect("parse geneve header after ext hdr"); + let repl = geneve::extract_multicast_replication(&geneve) + .expect("multicast option present"); + assert_eq!(repl, oxide_vpc::api::Replication::External); +} diff --git a/xde-tests/Cargo.toml b/xde-tests/Cargo.toml index 84e0d5bd..6ca3dc3a 100644 --- a/xde-tests/Cargo.toml +++ b/xde-tests/Cargo.toml @@ -8,6 +8,7 @@ repository.workspace = true [dependencies] opte-ioctl.workspace = true +opte-test-utils.workspace = true oxide-vpc.workspace = true anyhow.workspace = true diff --git a/xde-tests/src/lib.rs b/xde-tests/src/lib.rs index 2fd8a634..d2908fe3 100644 --- a/xde-tests/src/lib.rs +++ b/xde-tests/src/lib.rs @@ -5,10 +5,15 @@ // Copyright 2025 Oxide Computer Company use anyhow::Result; +use anyhow::anyhow; +use 
anyhow::bail; use opte_ioctl::OpteHdl; use oxide_vpc::api::AddFwRuleReq; use oxide_vpc::api::AddRouterEntryReq; use oxide_vpc::api::Address; +use oxide_vpc::api::ClearMcast2PhysReq; +use oxide_vpc::api::ClearMcastForwardingReq; +use oxide_vpc::api::DEFAULT_MULTICAST_VNI; use oxide_vpc::api::DhcpCfg; use oxide_vpc::api::Direction; use oxide_vpc::api::ExternalIpCfg; @@ -21,27 +26,41 @@ use oxide_vpc::api::IpCidr; use oxide_vpc::api::Ipv4Addr; use oxide_vpc::api::Ipv4Cfg; use oxide_vpc::api::Ipv6Addr; +use oxide_vpc::api::Ipv6Cfg; use oxide_vpc::api::MacAddr; +use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastUnsubscribeReq; use oxide_vpc::api::PhysNet; use oxide_vpc::api::Ports; use oxide_vpc::api::RouterClass; use oxide_vpc::api::RouterTarget; use oxide_vpc::api::SNat4Cfg; +use oxide_vpc::api::SNat6Cfg; +use oxide_vpc::api::SetMcast2PhysReq; +use oxide_vpc::api::SetMcastForwardingReq; use oxide_vpc::api::SetVirt2PhysReq; use oxide_vpc::api::Vni; use oxide_vpc::api::VpcCfg; use rand::Rng; +use std::cell::RefCell; use std::collections::HashSet; +use std::process::Child; use std::process::Command; +use std::process::Stdio; use std::sync::Arc; use std::time::Duration; +use std::time::Instant; use zone::Zlogin; pub use ztest::*; -/// The overlay network used in all tests. +/// The IPv4 overlay network used in all tests. pub const OVERLAY_NET: &str = "10.0.0.0/24"; -/// The overlay OPTE gateway used in all tests. +/// The IPv4 overlay OPTE gateway used in all tests. pub const OVERLAY_GW: &str = "10.0.0.254"; +/// The IPv6 overlay network used in all tests. +pub const OVERLAY_NET_V6: &str = "fd00::/64"; +/// The IPv6 overlay OPTE gateway used in all tests. +pub const OVERLAY_GW_V6: &str = "fd00::254"; /// This is a wrapper around the ztest::Zone object that encapsulates common /// logic needed for running the OPTE tests zones used in this test suite. 
@@ -58,15 +77,48 @@ impl OpteZone { Ok(Self { zone }) } - /// Wait for the network to come up, then set up the overlay network. + /// Wait for the network to come up, then set up the IPv4 overlay network. fn setup(&self, devname: &str, addr: String) -> Result<()> { self.zone.wait_for_network()?; + // Configure IPv4 via DHCP self.zone - .zexec(&format!("ipadm create-addr -t -T dhcp {}/test", devname))?; + .zexec(&format!("ipadm create-addr -t -T dhcp {devname}/test"))?; + self.zone.zexec(&format!("route add -iface {OVERLAY_GW} {addr}"))?; + self.zone.zexec(&format!("route add {OVERLAY_NET} {OVERLAY_GW}"))?; + Ok(()) + } + + /// Wait for the network to come up, then set up dual-stack (IPv4 + IPv6) overlay network. + fn setup_dualstack( + &self, + devname: &str, + ipv4_addr: String, + ipv6_addr: String, + ) -> Result<()> { + self.zone.wait_for_network()?; + // Configure IPv4 via DHCP self.zone - .zexec(&format!("route add -iface {} {}", OVERLAY_GW, addr))?; + .zexec(&format!("ipadm create-addr -t -T dhcp {devname}/testv4"))?; self.zone - .zexec(&format!("route add {} {}", OVERLAY_NET, OVERLAY_GW))?; + .zexec(&format!("route add -iface {OVERLAY_GW} {ipv4_addr}"))?; + self.zone.zexec(&format!("route add {OVERLAY_NET} {OVERLAY_GW}"))?; + + // Configure IPv6 with static address + // Use addrconf first to enable IPv6 on the interface, then add static address + self.zone.zexec(&format!( + "ipadm create-addr -t -T addrconf {devname}/addrconf" + ))?; + // Small delay to let addrconf initialize + std::thread::sleep(Duration::from_millis(500)); + self.zone.zexec(&format!( + "ipadm create-addr -t -T static -a {ipv6_addr}/64 {devname}/testv6" + ))?; + self.zone.zexec(&format!( + "route add -inet6 -iface {OVERLAY_GW_V6} {ipv6_addr}" + ))?; + self.zone.zexec(&format!( + "route add -inet6 {OVERLAY_NET_V6} {OVERLAY_GW_V6}" + ))?; Ok(()) } } @@ -77,6 +129,7 @@ impl OpteZone { pub struct OptePort { name: String, cfg: VpcCfg, + mcast_subscriptions: RefCell>, } impl OptePort { @@ -106,12 
+159,67 @@ impl OptePort { }), guest_mac: guest_mac.parse().unwrap(), gateway_mac: "a8:40:25:00:00:01".parse().unwrap(), - vni: Vni::new(1701u32).unwrap(), + vni: Vni::new(DEFAULT_MULTICAST_VNI).unwrap(), + phys_ip: phys_ip.parse().unwrap(), + }; + let adm = OpteHdl::open()?; + adm.create_xde(name, cfg.clone(), DhcpCfg::default(), false)?; + Ok(OptePort { + name: name.into(), + cfg, + mcast_subscriptions: RefCell::new(Vec::new()), + }) + } + + /// Create a new OPTE port with dual-stack (IPv4 + IPv6) support. + pub fn new_dualstack( + name: &str, + private_ip_v4: &str, + private_ip_v6: &str, + guest_mac: &str, + phys_ip: &str, + ) -> Result { + let cfg = VpcCfg { + ip_cfg: IpCfg::DualStack { + ipv4: Ipv4Cfg { + vpc_subnet: OVERLAY_NET.parse().unwrap(), + private_ip: private_ip_v4.parse().unwrap(), + gateway_ip: OVERLAY_GW.parse().unwrap(), + external_ips: ExternalIpCfg { + snat: Some(SNat4Cfg { + external_ip: "1.2.3.4".parse().unwrap(), + ports: 1000..=2000, + }), + ephemeral_ip: None, + floating_ips: vec![], + }, + }, + ipv6: Ipv6Cfg { + vpc_subnet: OVERLAY_NET_V6.parse().unwrap(), + private_ip: private_ip_v6.parse().unwrap(), + gateway_ip: OVERLAY_GW_V6.parse().unwrap(), + external_ips: ExternalIpCfg { + snat: Some(SNat6Cfg { + external_ip: "2001:db8::1".parse().unwrap(), + ports: 4097..=8192, + }), + ephemeral_ip: None, + floating_ips: vec![], + }, + }, + }, + guest_mac: guest_mac.parse().unwrap(), + gateway_mac: "a8:40:25:00:00:01".parse().unwrap(), + vni: Vni::new(DEFAULT_MULTICAST_VNI).unwrap(), phys_ip: phys_ip.parse().unwrap(), }; let adm = OpteHdl::open()?; adm.create_xde(name, cfg.clone(), DhcpCfg::default(), false)?; - Ok(OptePort { name: name.into(), cfg }) + Ok(OptePort { + name: name.into(), + cfg, + mcast_subscriptions: RefCell::new(Vec::new()), + }) } /// Add an overlay routing entry to this port. @@ -150,11 +258,20 @@ impl OptePort { self.cfg.guest_mac.bytes() } - /// Return the guest IP address as a string. 
+ /// Return the guest IPv4 address as a string. pub fn ip(&self) -> String { match &self.cfg.ip_cfg { IpCfg::Ipv4(cfg) => cfg.private_ip.to_string(), - _ => panic!("expected ipv4 guest"), + IpCfg::DualStack { ipv4, .. } => ipv4.private_ip.to_string(), + _ => panic!("expected ipv4 or dualstack guest"), + } + } + + /// Return the guest IPv6 address as a string (for dual-stack ports). + pub fn ipv6(&self) -> Option { + match &self.cfg.ip_cfg { + IpCfg::DualStack { ipv6, .. } => Some(ipv6.private_ip.to_string()), + _ => None, } } @@ -162,6 +279,53 @@ impl OptePort { pub fn underlay_ip(&self) -> std::net::Ipv6Addr { self.cfg.phys_ip.into() } + + /// Return the port name. + pub fn name(&self) -> &str { + &self.name + } + + /// Subscribe this port to a multicast group. + /// Automatically tracks the subscription for cleanup on drop. + pub fn subscribe_multicast(&self, group: IpAddr) -> Result<()> { + let adm = OpteHdl::open()?; + adm.mcast_subscribe(&McastSubscribeReq { + port_name: self.name.clone(), + group, + })?; + self.mcast_subscriptions.borrow_mut().push(group); + Ok(()) + } + + /// Unsubscribe this port from a multicast group. + pub fn unsubscribe_multicast(&self, group: IpAddr) -> Result<()> { + let adm = OpteHdl::open()?; + adm.mcast_unsubscribe(&McastUnsubscribeReq { + port_name: self.name.clone(), + group, + })?; + self.mcast_subscriptions.borrow_mut().retain(|g| *g != group); + Ok(()) + } + + /// Add a multicast router entry for this port. + pub fn add_multicast_router_entry(&self, cidr: IpCidr) -> Result<()> { + let adm = OpteHdl::open()?; + adm.add_router_entry(&AddRouterEntryReq { + port_name: self.name.clone(), + dest: cidr, + target: RouterTarget::Multicast(cidr), + class: RouterClass::System, + })?; + Ok(()) + } + + /// Allow multicast CIDR through the overlay firewall for the given direction. 
+ pub fn allow_cidr(&self, cidr: IpCidr, direction: Direction) -> Result<()> { + let adm = OpteHdl::open()?; + adm.allow_cidr(&self.name, cidr, direction)?; + Ok(()) + } } impl Drop for OptePort { @@ -174,8 +338,23 @@ impl Drop for OptePort { return; } }; + + // Clean up multicast subscriptions + let subscriptions = self.mcast_subscriptions.borrow().clone(); + for group in subscriptions { + if let Err(e) = adm.mcast_unsubscribe(&McastUnsubscribeReq { + port_name: self.name.clone(), + group, + }) { + let name = &self.name; + eprintln!( + "failed to unsubscribe {name} from multicast group {group}: {e}" + ); + } + } + if let Err(e) = adm.delete_xde(&self.name) { - eprintln!("failed to delete xde on drop: {}", e); + eprintln!("failed to delete xde on drop: {e}"); } } } @@ -202,26 +381,143 @@ impl Xde { phys: PhysNet { ether: ether.parse().unwrap(), ip: ip.parse().unwrap(), - vni: Vni::new(1701u32).unwrap(), + vni: Vni::new(DEFAULT_MULTICAST_VNI).unwrap(), }, })?; Ok(()) } } impl Drop for Xde { - /// When this object is dropped, remove the xde kernel module from the - /// underlying system. fn drop(&mut self) { - // The module can no longer be successfully removed until the underlay - // has been cleared. This may not have been done, so this is fallible. + // Clear underlay to release references to simnet/vnic devices, + // allowing their cleanup to proceed. Driver remains loaded. if let Ok(adm) = OpteHdl::open() { let _ = adm.clear_xde_underlay(); } + } +} + +/// Helper to run `snoop` and ensure it doesn't outlive the test. +/// +/// This avoids leaked `snoop` processes pinning DLPI devices (causing EBUSY) +/// when tests time out. +pub struct SnoopGuard { + child: Option, +} + +impl SnoopGuard { + /// Start a `snoop` capture on `dev_name` with the provided BPF-like `filter`. + /// Captures a single packet (`-c 1`) and dumps hex output (`-x0`). + /// Uses `-r` to disable name resolution for deterministic numeric output. 
+ pub fn start(dev_name: &str, filter: &str) -> anyhow::Result { + let child = Command::new("pfexec") + .args(&[ + "snoop", "-r", "-d", dev_name, "-c", "1", "-P", "-x0", filter, + ]) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn()?; + Ok(Self { child: Some(child) }) + } + + /// Wait for completion with a timeout. Returns stdout if successful. + pub fn wait_with_timeout( + &mut self, + timeout: Duration, + ) -> anyhow::Result { + let deadline = Instant::now() + timeout; + + loop { + let child = self.child.as_mut().expect("child already taken"); + match child.try_wait()? { + Some(_status) => { + // Child exited; collect output. + let child = self.child.take().expect("child already taken"); + return Ok(child.wait_with_output()?); + } + None => { + if Instant::now() >= deadline { + // Timed out; kill snoop so it doesn't hold interfaces open. + let _ = child.kill(); + let _ = child.wait(); + bail!("snoop capture timed out"); + } + std::thread::sleep(Duration::from_millis(50)); + } + } + } + } +} + +impl Drop for SnoopGuard { + fn drop(&mut self) { + if let Some(child) = &mut self.child { + if let Ok(None) = child.try_wait() { + let _ = child.kill(); + let _ = child.wait(); + } + } + } +} + +/// Global multicast group state that cleans up M2P mappings and forwarding +/// entries on drop. Port-specific subscriptions are handled automatically by +/// OptePort::drop(). +/// +/// Use this to set up multicast groups in tests. Port subscriptions should use +/// `port.subscribe_multicast(group)` which tracks cleanup automatically. +pub struct MulticastGroup { + pub group: IpAddr, + pub underlay: Ipv6Addr, + pub vni: Vni, +} + +impl MulticastGroup { + pub fn new(group: IpAddr, underlay: Ipv6Addr, vni: Vni) -> Result { + let hdl = OpteHdl::open()?; + hdl.set_m2p(&SetMcast2PhysReq { group, underlay, vni })?; + Ok(Self { group, underlay, vni }) + } + + /// Set multicast forwarding entries for this group. 
+ pub fn set_forwarding( + &self, + next_hops: Vec<( + oxide_vpc::api::NextHopV6, + oxide_vpc::api::Replication, + )>, + ) -> Result<()> { + let hdl = OpteHdl::open()?; + hdl.set_mcast_fwd(&SetMcastForwardingReq { + group: self.group, + next_hops, + })?; + Ok(()) + } +} + +impl Drop for MulticastGroup { + fn drop(&mut self) { + let Ok(hdl) = OpteHdl::open() else { + eprintln!("failed to open xde device for multicast cleanup"); + return; + }; + + // Clear forwarding entry + let group = self.group; + if let Err(e) = + hdl.clear_mcast_fwd(&ClearMcastForwardingReq { group: self.group }) + { + eprintln!("failed to clear multicast forwarding for {group}: {e}"); + } - let mut cmd = Command::new("pfexec"); - cmd.args(["rem_drv", "xde"]); - if let Err(e) = cmd.output() { - eprintln!("failed to remove xde driver: {}", e); + // Clear M2P mapping + if let Err(e) = hdl.clear_m2p(&ClearMcast2PhysReq { + group: self.group, + underlay: self.underlay, + vni: self.vni, + }) { + eprintln!("failed to clear M2P mapping for {group}: {e}"); } } } @@ -244,6 +540,9 @@ impl TestNode { /// A topology of local zones interconnected with simlinks over /// an OPTE dataplane. // Note: these fields have a *very* sensitive drop order. +// Rust drops fields in declaration order. Zones must drop FIRST (to release +// references to network devices), then network infrastructure can clean up. +// Drop order: nodes -> null_ports -> v6_routes -> xde -> lls -> vnics -> simnet -> zfs pub struct Topology { pub nodes: Vec, pub null_ports: Vec, @@ -288,6 +587,14 @@ pub struct Topology { /// sanity checker to make sure basic opte/xde functionality is working - and /// that we're not hitting things like debug asserts in the OS. pub fn two_node_topology(brand: &str) -> Result { + two_node_topology_named(brand, "a", "b") +} + +pub fn two_node_topology_named( + brand: &str, + zone_a_name: &str, + zone_b_name: &str, +) -> Result { // Create the "underlay loopback". 
With simnet device pairs, any packet that // goes in one is forwarded to the other. In the topology depicted above, // this means that anything vopte0 sends, will be encapsulated onto the @@ -349,29 +656,198 @@ pub fn two_node_topology(brand: &str) -> Result { let zfs = Arc::new(Zfs::new("opte2node")?); // Create a pair of zones to simulate our VM instances. - println!("start zone a"); - let a = OpteZone::new("a", &zfs, &[&opte0.name], brand)?; - println!("start zone b"); - let b = OpteZone::new("b", &zfs, &[&opte1.name], brand)?; + println!("start zone {zone_a_name}"); + let a = OpteZone::new(zone_a_name, &zfs, &[&opte0.name], brand)?; + println!("start zone {zone_b_name}"); + let b = OpteZone::new(zone_b_name, &zfs, &[&opte1.name], brand)?; - println!("setup zone a"); + println!("setup zone {zone_a_name}"); a.setup(&opte0.name, opte0.ip())?; - println!("setup zone b"); + println!("setup zone {zone_b_name}"); b.setup(&opte1.name, opte1.ip())?; Ok(Topology { + nodes: vec![ + TestNode { zone: a, port: opte0 }, + TestNode { zone: b, port: opte1 }, + ], + null_ports: vec![], + v6_routes: vec![r0, r1], xde, lls: vec![ll0, ll1], vnics: vec![vn0, vn1], simnet: Some(sim), + zfs, + }) +} + +pub fn two_node_topology_dualstack(brand: &str) -> Result { + two_node_topology_dualstack_named(brand, "a", "b") +} + +pub fn two_node_topology_dualstack_named( + brand: &str, + zone_a_name: &str, + zone_b_name: &str, +) -> Result { + let sim = SimnetLink::new("xde_test_sim0", "xde_test_sim1")?; + let vn0 = Vnic::new("xde_test_vnic0", &sim.end_a)?; + let vn1 = Vnic::new("xde_test_vnic1", &sim.end_b)?; + let ll0 = LinkLocal::new(&vn0.name, "ll")?; + let ll1 = LinkLocal::new(&vn1.name, "ll")?; + + Xde::set_xde_underlay(&vn0.name, &vn1.name)?; + let xde = Xde {}; + + // Set up v2p mappings (same as IPv4-only version) + Xde::set_v2p("10.0.0.1", "a8:40:25:ff:00:01", "fd44::1")?; + Xde::set_v2p("10.0.0.2", "a8:40:25:ff:00:02", "fd77::1")?; + + // Create dual-stack OPTE ports + let opte0 = 
OptePort::new_dualstack( + "opte0", + "10.0.0.1", + "fd00::1", + "a8:40:25:ff:00:01", + "fd44::1", + )?; + opte0.add_router_entry("10.0.0.2")?; + opte0.fw_allow_all()?; + + println!("adding underlay route 0"); + let r0 = + RouteV6::new(opte0.underlay_ip(), 64, ll0.ip, Some(vn1.name.clone()))?; + + let opte1 = OptePort::new_dualstack( + "opte1", + "10.0.0.2", + "fd00::2", + "a8:40:25:ff:00:02", + "fd77::1", + )?; + opte1.add_router_entry("10.0.0.1")?; + opte1.fw_allow_all()?; + + println!("adding underlay route 1"); + let r1 = + RouteV6::new(opte1.underlay_ip(), 64, ll1.ip, Some(vn0.name.clone()))?; + + let zfs = Arc::new(Zfs::new("opte2node")?); + + println!("start zone {zone_a_name}"); + let a = OpteZone::new(zone_a_name, &zfs, &[&opte0.name], brand)?; + println!("start zone {zone_b_name}"); + let b = OpteZone::new(zone_b_name, &zfs, &[&opte1.name], brand)?; + + println!("setup zone {zone_a_name}"); + a.setup_dualstack(&opte0.name, opte0.ip(), "fd00::1".to_string())?; + + println!("setup zone {zone_b_name}"); + b.setup_dualstack(&opte1.name, opte1.ip(), "fd00::2".to_string())?; + + Ok(Topology { nodes: vec![ TestNode { zone: a, port: opte0 }, TestNode { zone: b, port: opte1 }, ], + null_ports: vec![], v6_routes: vec![r0, r1], + xde, + lls: vec![ll0, ll1], + vnics: vec![vn0, vn1], + simnet: Some(sim), zfs, + }) +} + +pub fn three_node_topology(brand: &str) -> Result { + three_node_topology_named(brand, "a", "b", "c") +} + +pub fn three_node_topology_named( + brand: &str, + zone_a_name: &str, + zone_b_name: &str, + zone_c_name: &str, +) -> Result { + // Create three-node topology for testing multicast fanout + let sim = SimnetLink::new("xde_test_sim0", "xde_test_sim1")?; + let vn0 = Vnic::new("xde_test_vnic0", &sim.end_a)?; + let vn1 = Vnic::new("xde_test_vnic1", &sim.end_b)?; + let ll0 = LinkLocal::new(&vn0.name, "ll")?; + let ll1 = LinkLocal::new(&vn1.name, "ll")?; + + Xde::set_xde_underlay(&vn0.name, &vn1.name)?; + let xde = Xde {}; + + // Set up V2P mappings for 
three nodes + Xde::set_v2p("10.0.0.1", "a8:40:25:ff:00:01", "fd44::1")?; + Xde::set_v2p("10.0.0.2", "a8:40:25:ff:00:02", "fd77::1")?; + Xde::set_v2p("10.0.0.3", "a8:40:25:ff:00:03", "fd88::1")?; + + // Create three OPTE ports + let opte0 = + OptePort::new("opte0", "10.0.0.1", "a8:40:25:ff:00:01", "fd44::1")?; + opte0.add_router_entry("10.0.0.2")?; + opte0.add_router_entry("10.0.0.3")?; + opte0.fw_allow_all()?; + + let opte1 = + OptePort::new("opte1", "10.0.0.2", "a8:40:25:ff:00:02", "fd77::1")?; + opte1.add_router_entry("10.0.0.1")?; + opte1.add_router_entry("10.0.0.3")?; + opte1.fw_allow_all()?; + + let opte2 = + OptePort::new("opte2", "10.0.0.3", "a8:40:25:ff:00:03", "fd88::1")?; + opte2.add_router_entry("10.0.0.1")?; + opte2.add_router_entry("10.0.0.2")?; + opte2.fw_allow_all()?; + + println!("adding underlay route 0"); + let r0 = + RouteV6::new(opte0.underlay_ip(), 64, ll0.ip, Some(vn1.name.clone()))?; + + println!("adding underlay route 1"); + let r1 = + RouteV6::new(opte1.underlay_ip(), 64, ll1.ip, Some(vn0.name.clone()))?; + + println!("adding underlay route 2"); + let r2 = + RouteV6::new(opte2.underlay_ip(), 64, ll1.ip, Some(vn0.name.clone()))?; + + let zfs = Arc::new(Zfs::new("opte3node")?); + + println!("start zone {zone_a_name}"); + let a = OpteZone::new(zone_a_name, &zfs, &[&opte0.name], brand)?; + println!("start zone {zone_b_name}"); + let b = OpteZone::new(zone_b_name, &zfs, &[&opte1.name], brand)?; + println!("start zone {zone_c_name}"); + let c = OpteZone::new(zone_c_name, &zfs, &[&opte2.name], brand)?; + + println!("setup zone {zone_a_name}"); + a.setup(&opte0.name, opte0.ip())?; + + println!("setup zone {zone_b_name}"); + b.setup(&opte1.name, opte1.ip())?; + + println!("setup zone {zone_c_name}"); + c.setup(&opte2.name, opte2.ip())?; + + Ok(Topology { + nodes: vec![ + TestNode { zone: a, port: opte0 }, + TestNode { zone: b, port: opte1 }, + TestNode { zone: c, port: opte2 }, + ], null_ports: vec![], + v6_routes: vec![r0, r1, r2], + xde, + lls: 
vec![ll0, ll1], + vnics: vec![vn0, vn1], + simnet: Some(sim), + zfs, }) } @@ -410,16 +886,16 @@ pub fn get_linklocal_addr(link_name: &str) -> Result { let text = std::str::from_utf8(&out.stdout)?; if !out.status.success() || text.lines().count() == 1 { - anyhow::bail!("could not find address {target_addr}"); + bail!("could not find address {target_addr}"); } let mut maybe_addr = text .lines() .nth(1) - .ok_or(anyhow::anyhow!("expected to find entry line for IP"))? + .ok_or(anyhow!("expected to find entry line for IP"))? .split_whitespace() .last() - .ok_or(anyhow::anyhow!("expected to find column for IP"))?; + .ok_or(anyhow!("expected to find column for IP"))?; // remove iface qualifier on link-local addr. if maybe_addr.contains('%') { @@ -443,7 +919,7 @@ pub fn single_node_over_real_nic( let max_macs = (1 << 20) - peers.len() - 1; if null_port_count > max_macs as u32 { - anyhow::bail!( + bail!( "Cannot allocate {null_port_count} ports: \ Oxide MAC space admits {max_macs} accounting for peers" ); @@ -522,13 +998,13 @@ pub fn single_node_over_real_nic( a.setup(&opte.name, opte.ip())?; Ok(Topology { + nodes: vec![TestNode { zone: a, port: opte }], + null_ports, + v6_routes, xde, lls: vec![], vnics: vec![], simnet: None, - nodes: vec![TestNode { zone: a, port: opte }], - null_ports, - v6_routes, zfs, }) } diff --git a/xde-tests/tests/loopback.rs b/xde-tests/tests/loopback.rs index c64990a8..4ceb8b52 100644 --- a/xde-tests/tests/loopback.rs +++ b/xde-tests/tests/loopback.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
-// Copyright 2024 Oxide Computer Company +// Copyright 2025 Oxide Computer Company use anyhow::Result; diff --git a/xde-tests/tests/multicast_multi_sub.rs b/xde-tests/tests/multicast_multi_sub.rs new file mode 100644 index 00000000..64f586d9 --- /dev/null +++ b/xde-tests/tests/multicast_multi_sub.rs @@ -0,0 +1,363 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2025 Oxide Computer Company + +//! XDE multicast multiple subscriber tests. + +use anyhow::Context; +use anyhow::Result; +use opte_ioctl::OpteHdl; +use opte_test_utils::geneve_verify; +use oxide_vpc::api::DEFAULT_MULTICAST_VNI; +use oxide_vpc::api::IpCidr; +use oxide_vpc::api::Ipv4Addr; +use oxide_vpc::api::Ipv6Addr; +use oxide_vpc::api::NextHopV6; +use oxide_vpc::api::Replication; +use oxide_vpc::api::Vni; +use std::time::Duration; +use xde_tests::MulticastGroup; +use xde_tests::SnoopGuard; + +#[test] +fn test_multicast_multiple_local_subscribers() -> Result<()> { + // Create 3-node topology to test local fanout + let topol = xde_tests::three_node_topology_named( + "omicron1", "mlsa", "mlsb", "mlsc", + )?; + + // IPv4 multicast group: 224.1.2.3 + let mcast_group = Ipv4Addr::from([224, 1, 2, 3]); + const MCAST_PORT: u16 = 9999; + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + + // M2P mapping - use admin-scoped IPv6 multicast per Omicron constraints + let mcast_underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 3, + ]); + + // Set up multicast state with automatic cleanup on drop + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay, vni)?; + + // Node B's underlay address for forwarding + let node_b_underlay = Ipv6Addr::from([ + 0xfd, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, + ]); + + // Set up multicast 
forwarding with External replication + // This will deliver to all local subscribers in the same VNI + mcast.set_forwarding(vec![( + NextHopV6::new(node_b_underlay, vni), + Replication::External, + )])?; + + // Allow IPv4 multicast traffic via Multicast target and subscribe to the group + let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + node.port.subscribe_multicast(mcast_group.into())?; + } + + // Start snoops on nodes B and C using SnoopGuard + let dev_name_b = topol.nodes[1].port.name().to_string(); + let dev_name_c = topol.nodes[2].port.name().to_string(); + let filter = format!("udp and ip dst {mcast_group} and port {MCAST_PORT}"); + + let mut snoop_b = SnoopGuard::start(&dev_name_b, &filter)?; + let mut snoop_c = SnoopGuard::start(&dev_name_c, &filter)?; + + // Also snoop underlay to verify NO underlay forwarding with External mode + let underlay_dev = "xde_test_sim1"; + let mut snoop_underlay = + SnoopGuard::start(underlay_dev, "ip6 and udp port 6081")?; + + // Send multicast packet from node A + let payload = "fanout test"; + let send_cmd = + format!("echo '{payload}' | nc -u -w1 {mcast_group} {MCAST_PORT}"); + topol.nodes[0] + .zone + .zone + .zexec(&send_cmd) + .context("Failed to send multicast UDP packet")?; + + // Wait for both snoops to capture packets + let snoop_output_b = snoop_b + .wait_with_timeout(Duration::from_secs(5)) + .context("Timeout waiting for snoop on node B")?; + let snoop_output_c = snoop_c + .wait_with_timeout(Duration::from_secs(5)) + .context("Timeout waiting for snoop on node C")?; + + // Verify both nodes received the packet + let stdout_b = String::from_utf8_lossy(&snoop_output_b.stdout); + assert!( + snoop_output_b.status.success() && stdout_b.contains("UDP"), + "Expected to capture multicast UDP packet on node B, snoop output:\n{stdout_b}" + ); + + let stdout_c = String::from_utf8_lossy(&snoop_output_c.stdout); + assert!( + 
snoop_output_c.status.success() && stdout_c.contains("UDP"), + "Expected to capture multicast UDP packet on node C, snoop output:\n{stdout_c}" + ); + + // Verify NO underlay forwarding (External mode = local-only) + if let Ok(output) = snoop_underlay.wait_with_timeout(Duration::from_secs(2)) + { + let stdout = String::from_utf8_lossy(&output.stdout); + panic!( + "External mode should NOT forward to underlay, but captured:\n{stdout}" + ); + } + + Ok(()) +} + +#[test] +fn test_multicast_underlay_replication() -> Result<()> { + // Create 2-node topology to test Underlay replication mode + let topol = xde_tests::two_node_topology_named("omicron1", "ura", "urb")?; + + // IPv4 multicast group + let mcast_group = Ipv4Addr::from([224, 1, 2, 4]); + const MCAST_PORT: u16 = 9999; + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + + // M2P mapping - use admin-scoped IPv6 multicast per Omicron constraints + let mcast_underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 4, + ]); + + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay, vni)?; + + // Debug: dump V2P/M2P mappings to verify M2P is set correctly + let hdl = OpteHdl::open()?; + let v2p_dump = hdl.dump_v2p()?; + println!("\n=== V2P/M2P Mappings ==="); + for vpc_map in &v2p_dump.mappings { + println!(" VNI {}: ", vpc_map.vni.as_u32()); + println!(" Unicast IPv4 mappings: {:?}", vpc_map.ip4); + println!(" Multicast IPv4 mappings: {:?}", vpc_map.mcast_ip4); + println!(" Multicast IPv6 mappings: {:?}", vpc_map.mcast_ip6); + } + println!("=== End V2P/M2P Mappings ===\n"); + + // Node B's underlay address + let node_b_underlay = Ipv6Addr::from([ + 0xfd, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, + ]); + + // Set up multicast forwarding with Underlay replication ONLY + // This should forward to underlay but NOT deliver to local ports + mcast.set_forwarding(vec![( + NextHopV6::new(node_b_underlay, vni), + 
Replication::Underlay, + )])?; + + // Allow IPv4 multicast traffic via Multicast target + // + // Note: We deliberately do NOT subscribe any nodes to verify that Underlay mode + // forwards to underlay regardless of local subscription state (zero subscribers) + let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + } + + // Add IPv6 multicast route for admin-scoped multicast (ff04::/16) + // This tells the kernel to route multicast packets through the underlay interface + let route_add_result = std::process::Command::new("pfexec") + .args(&[ + "route", + "add", + "-inet6", + "ff04::/16", + "-interface", + "xde_test_vnic0", + ]) + .output() + .context("Failed to add IPv6 multicast route")?; + if !route_add_result.status.success() { + println!( + "Warning: Failed to add IPv6 multicast route: {}", + String::from_utf8_lossy(&route_add_result.stderr) + ); + } + + // Start snoop on the UNDERLAY simnet device (not the OPTE port) + // to verify the packet is forwarded to the underlay + let underlay_dev = "xde_test_sim1"; // Underlay device + let mut snoop_underlay = + SnoopGuard::start(underlay_dev, "ip6 and udp port 6081")?; // Geneve port + + // Debug: dump forwarding table to verify configuration + let mfwd = hdl.dump_mcast_fwd()?; + println!("\n=== Multicast forwarding table (Underlay test) ==="); + for entry in &mfwd.entries { + println!( + " Group: {:?}, Next hops: {:?}", + entry.group, entry.next_hops + ); + } + + // Also snoop node B's OPTE port to verify NO local delivery with Underlay mode + let dev_name_b = topol.nodes[1].port.name().to_string(); + let filter_local = + format!("udp and ip dst {mcast_group} and port {MCAST_PORT}"); + let mut snoop_local = SnoopGuard::start(&dev_name_b, &filter_local)?; + + // Clear UFT right before sending to ensure fresh flow computation + hdl.clear_uft(topol.nodes[0].port.name())?; + + // Send multicast packet from node A + let payload = 
"underlay test"; + let send_cmd = + format!("echo '{payload}' | nc -u -w1 {mcast_group} {MCAST_PORT}"); + topol.nodes[0] + .zone + .zone + .zexec(&send_cmd) + .context("Failed to send multicast UDP packet")?; + + // Wait for snoop to capture the underlay packet + let snoop_output_underlay = snoop_underlay + .wait_with_timeout(Duration::from_secs(5)) + .context("Timeout waiting for snoop on underlay")?; + + // Verify packet was forwarded to underlay + let stdout_underlay = + String::from_utf8_lossy(&snoop_output_underlay.stdout); + + assert!( + snoop_output_underlay.status.success() + && stdout_underlay.contains("UDP"), + "Expected to capture Geneve packet on underlay, snoop output:\n{stdout_underlay}" + ); + + // Verify Geneve header fields (VNI, outer IPv6 dst, replication mode) + let hex_str = geneve_verify::extract_snoop_hex(&stdout_underlay) + .expect("Failed to extract hex from snoop output"); + + let packet_bytes = geneve_verify::parse_snoop_hex(&hex_str) + .expect("Failed to parse hex string"); + + let geneve_info = geneve_verify::parse_geneve_packet(&packet_bytes) + .expect("Failed to parse Geneve packet"); + + assert_eq!( + geneve_info.vni, vni, + "Geneve VNI should be DEFAULT_MULTICAST_VNI ({})", + DEFAULT_MULTICAST_VNI + ); + assert_eq!( + geneve_info.outer_ipv6_dst, mcast_underlay, + "Outer IPv6 dst should be multicast underlay address" + ); + assert_eq!( + geneve_info.replication, + Some(Replication::Underlay), + "Geneve replication mode should be Underlay" + ); + + // Verify NO local delivery (Underlay mode = remote-only) + if let Ok(output) = snoop_local.wait_with_timeout(Duration::from_secs(2)) { + let stdout = String::from_utf8_lossy(&output.stdout); + panic!( + "Underlay mode should NOT deliver locally, but captured:\n{stdout}" + ); + } + + Ok(()) +} + +#[test] +fn test_multicast_all_replication() -> Result<()> { + // Create 3-node topology to test All replication mode (bifurcated delivery) + let topol = + 
xde_tests::three_node_topology_named("omicron1", "ara", "arb", "arc")?; + + // IPv4 multicast group + let mcast_group = Ipv4Addr::from([224, 1, 2, 5]); + const MCAST_PORT: u16 = 9999; + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + + // M2P mapping - use admin-scoped IPv6 multicast per Omicron constraints + let mcast_underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 5, + ]); + + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay, vni)?; + + // Node B's underlay address for underlay forwarding + let node_b_underlay = Ipv6Addr::from([ + 0xfd, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, + ]); + + // Set up multicast forwarding with All replication + // This should deliver BOTH to local subscribers AND forward to underlay + mcast.set_forwarding(vec![( + NextHopV6::new(node_b_underlay, vni), + Replication::All, + )])?; + + // Allow IPv4 multicast traffic via Multicast target and subscribe to the group + let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + node.port.subscribe_multicast(mcast_group.into())?; + } + + // Start snoop on node B (local delivery) and underlay (underlay forwarding) + let dev_name_b = topol.nodes[1].port.name().to_string(); + let filter_local = + format!("udp and ip dst {mcast_group} and port {MCAST_PORT}"); + let mut snoop_local = SnoopGuard::start(&dev_name_b, &filter_local)?; + + let underlay_dev = "xde_test_sim1"; + let mut snoop_underlay = + SnoopGuard::start(underlay_dev, "ip6 and udp port 6081")?; + + // Send multicast packet from node A + let payload = "all replication test"; + let send_cmd = + format!("echo '{payload}' | nc -u -w1 {mcast_group} {MCAST_PORT}"); + topol.nodes[0] + .zone + .zone + .zexec(&send_cmd) + .context("Failed to send multicast UDP packet")?; + + // Wait for both snoops to capture packets + let 
snoop_output_local = snoop_local + .wait_with_timeout(Duration::from_secs(5)) + .context("Timeout waiting for local delivery snoop")?; + let snoop_output_underlay = snoop_underlay + .wait_with_timeout(Duration::from_secs(5)) + .context("Timeout waiting for underlay snoop")?; + + // Verify local delivery happened + let stdout_local = String::from_utf8_lossy(&snoop_output_local.stdout); + assert!( + snoop_output_local.status.success() && stdout_local.contains("UDP"), + "Expected local delivery to node B, snoop output:\n{stdout_local}" + ); + + // Verify underlay forwarding happened + let stdout_underlay = + String::from_utf8_lossy(&snoop_output_underlay.stdout); + assert!( + snoop_output_underlay.status.success() + && stdout_underlay.contains("UDP"), + "Expected underlay forwarding, snoop output:\n{stdout_underlay}" + ); + + Ok(()) +} diff --git a/xde-tests/tests/multicast_rx.rs b/xde-tests/tests/multicast_rx.rs new file mode 100644 index 00000000..f29d1697 --- /dev/null +++ b/xde-tests/tests/multicast_rx.rs @@ -0,0 +1,514 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2025 Oxide Computer Company + +//! XDE multicast RX-path tests. 
+ +use anyhow::Context; +use anyhow::Result; +use opte_ioctl::OpteHdl; +use oxide_vpc::api::Direction; +use oxide_vpc::api::IpCidr; +use oxide_vpc::api::Ipv4Addr; +use oxide_vpc::api::Ipv6Addr; +use oxide_vpc::api::NextHopV6; +use oxide_vpc::api::Replication; +use oxide_vpc::api::Vni; +use std::time::Duration; +use xde_tests::MulticastGroup; +use xde_tests::SnoopGuard; + +#[test] +fn test_xde_multicast_rx_ipv4() -> Result<()> { + // Create 2-node topology (IPv4 overlay: 10.0.0.0/24) + let topol = xde_tests::two_node_topology_named("omicron1", "rx4a", "rx4b")?; + + // IPv4 multicast group: 224.0.0.251 + let mcast_group = Ipv4Addr::from([224, 0, 0, 251]); + const MCAST_PORT: u16 = 9999; + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + // M2P mapping: overlay layer needs IPv6 multicast underlay address + // Use admin-scoped IPv6 multicast per Omicron's map_external_to_underlay_ip() + // Maps IPv4 multicast to ff04::/16 (admin-local scope) + IPv4 address + let mcast_underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 0, 0, 251, + ]); + + // Node B's underlay address - this is where we'll forward multicast packets + // From two_node_topology: node B (10.0.0.2) has underlay fd77::1 + let node_b_underlay = Ipv6Addr::from([ + 0xfd, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, + ]); + + // Set up multicast group with automatic cleanup on drop + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay, vni)?; + + // Set up multicast forwarding with External replication for unicast delivery. + // Maps overlay IPv4 multicast group -> underlay IPv6 unicast address of node B + mcast.set_forwarding(vec![( + NextHopV6::new(node_b_underlay, vni), + Replication::External, + )])?; + + // Allow IPv4 multicast traffic (224.0.0.0/4) via Multicast target. 
+ let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); + + // Allow outbound multicast traffic through the gateway layer + topol.nodes[0].port.allow_cidr(mcast_cidr, Direction::Out)?; + topol.nodes[1].port.allow_cidr(mcast_cidr, Direction::Out)?; + + // Add router entries for multicast + topol.nodes[0].port.add_multicast_router_entry(mcast_cidr)?; + topol.nodes[1].port.add_multicast_router_entry(mcast_cidr)?; + + // Subscribe both ports to the multicast group + topol.nodes[0].port.subscribe_multicast(mcast_group.into())?; + topol.nodes[1].port.subscribe_multicast(mcast_group.into())?; + + // Debug: dump multicast forwarding table + println!("\n=== Multicast forwarding table ==="); + let hdl = OpteHdl::open()?; + let mfwd = hdl.dump_mcast_fwd()?; + for entry in &mfwd.entries { + println!( + " Group: {:?}, Next hops: {:?}", + entry.group, entry.next_hops + ); + } + // Assert forwarding table contains expected next-hop + replication + let entry = mfwd + .entries + .iter() + .find(|e| e.group == mcast_group.into()) + .expect("missing multicast forwarding entry for group"); + assert!( + entry.next_hops.iter().any(|(nh, rep)| { + *rep == Replication::External + && nh.addr == node_b_underlay + && nh.vni == vni + }), + "expected External replication to {node_b_underlay:?} in forwarding table; got: {:?}", + entry.next_hops + ); + + // Start snoop using SnoopGuard to ensure cleanup + let dev_name_b = topol.nodes[1].port.name().to_string(); + let filter = format!("udp and ip dst {mcast_group} and port {MCAST_PORT}"); + let mut snoop = SnoopGuard::start(&dev_name_b, &filter)?; + + // Send UDP packet to the multicast address from zone A using netcat + // nc -u: IPv4 UDP mode + // -w1: timeout after 1 second + let payload = "multicast test"; + let send_cmd = + format!("echo '{payload}' | nc -u -w1 {mcast_group} {MCAST_PORT}"); + topol.nodes[0] + .zone + .zone + .zexec(&send_cmd) + .context("Failed to send multicast UDP packet")?; + + // Wait for snoop to capture the 
packet (or timeout) + let snoop_output = snoop + .wait_with_timeout(Duration::from_secs(5)) + .context("Timeout waiting for snoop to capture multicast packet")?; + + // Check that snoop successfully captured a packet and validate basics + let stdout = String::from_utf8_lossy(&snoop_output.stdout); + assert!( + snoop_output.status.success() && !stdout.is_empty(), + "Expected to capture multicast packet on {dev_name_b}, snoop output:\n{stdout}" + ); + // Protocol summary present + assert!( + stdout.contains("UDP"), + "expected UDP summary in snoop output:\n{stdout}" + ); + // Verify destination address appears in snoop output + // SnoopGuard uses -r flag, so we always get numeric addresses + assert!( + stdout.contains("224.0.0.251"), + "expected destination 224.0.0.251 in snoop output:\n{stdout}" + ); + // Payload present - check for substring in ASCII representation + // The full payload may wrap across lines, so just check for a distinctive part + assert!( + stdout.contains("ast test"), + "expected payload substring 'ast test' in ASCII portion of snoop output:\n{stdout}" + ); + // L2 dest: with current XDE/gateway pipeline, multicast RX to guests + // is delivered with broadcast dest MAC. snoop shows 16-bit grouped hex. 
+ assert!( + stdout.to_ascii_lowercase().contains("ffff ffff ffff"), + "expected L2 broadcast MAC 'ffff ffff ffff' in snoop output; got:\n{stdout}" + ); + + // Unsubscribe receiver and verify no further local delivery + topol.nodes[1].port.unsubscribe_multicast(mcast_group.into())?; + + let mut snoop2 = SnoopGuard::start(&dev_name_b, &filter)?; + let send_cmd2 = + format!("echo '{payload}' | nc -u -w1 {mcast_group} {MCAST_PORT}"); + topol.nodes[0] + .zone + .zone + .zexec(&send_cmd2) + .context("Failed to send multicast UDP packet (post-unsubscribe)")?; + let res = snoop2.wait_with_timeout(Duration::from_millis(800)); + match res { + Ok(out) => { + let stdout = String::from_utf8_lossy(&out.stdout); + panic!( + "expected no local delivery after unsubscribe; snoop output:\n{stdout}" + ); + } + Err(_) => {} + } + Ok(()) +} + +#[test] +fn test_xde_multicast_rx_ipv6() -> Result<()> { + // Create 2-node topology with dual-stack (IPv4 + IPv6) + let topol = xde_tests::two_node_topology_dualstack_named( + "omicron1", "rx6a", "rx6b", + )?; + + // IPv6 multicast group: ff05::1:3 (site-local, all-dhcp-agents) + let mcast_group = Ipv6Addr::from([ + 0xff, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x03, + ]); + const MCAST_PORT: u16 = 9999; + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + // M2P mapping: Map IPv6 multicast to admin-scoped underlay (ff04::/16) + // Per Omicron's map_external_to_underlay_ip(), convert ff05 -> ff04 + let mcast_underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x03, + ]); + + // Node B's underlay address + let node_b_underlay = Ipv6Addr::from([ + 0xfd, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, + ]); + + // Set up multicast group with automatic cleanup on drop + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay, vni)?; + + // Set up multicast forwarding 
with External replication for local delivery + mcast.set_forwarding(vec![( + NextHopV6::new(node_b_underlay, vni), + Replication::External, + )])?; + + // Allow IPv6 multicast traffic (ff05::/16 site-local) via Multicast target + let mcast_cidr = IpCidr::Ip6("ff05::/16".parse().unwrap()); + + // Add router entries for multicast + topol.nodes[0].port.add_multicast_router_entry(mcast_cidr)?; + topol.nodes[1].port.add_multicast_router_entry(mcast_cidr)?; + + // Subscribe both ports to the multicast group + topol.nodes[0].port.subscribe_multicast(mcast_group.into())?; + topol.nodes[1].port.subscribe_multicast(mcast_group.into())?; + + // Get the device names for snoop + let dev_name_b = topol.nodes[1].port.name().to_string(); + + // Start snoop using SnoopGuard to ensure cleanup + let filter = format!("udp and ip6 dst {mcast_group} and port {MCAST_PORT}"); + let mut snoop = SnoopGuard::start(&dev_name_b, &filter)?; + + // Send UDP packet to the multicast address from zone A using netcat + // nc -6 -u: IPv6 UDP mode + // -w1: timeout after 1 second + let payload = "multicast test v6"; + let sender_v6 = topol.nodes[0] + .port + .ipv6() + .expect("dualstack port must have IPv6 address"); + // illumos netcat selects IPv6 based on the destination; avoid `-6` for compatibility. 
+ let send_cmd = format!( + "echo '{payload}' | nc -u -s {sender_v6} -w1 {mcast_group} {MCAST_PORT}" + ); + topol.nodes[0] + .zone + .zone + .zexec(&send_cmd) + .context("Failed to send IPv6 multicast UDP packet")?; + + // Wait for snoop to capture the packet (or timeout) + let snoop_output = + snoop.wait_with_timeout(Duration::from_secs(5)).context( + "Timeout waiting for snoop to capture IPv6 multicast packet", + )?; + + // Check that snoop successfully captured a packet + let stdout = String::from_utf8_lossy(&snoop_output.stdout); + assert!( + snoop_output.status.success() && !stdout.is_empty(), + "Expected to capture IPv6 multicast packet on {dev_name_b}, snoop output:\n{stdout}" + ); + + Ok(()) +} + +#[test] +fn test_reject_link_local_underlay_ff02() -> Result<()> { + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + let link_local_underlay = Ipv6Addr::from([ + 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 99, + ]); + let result = hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { + group: mcast_group.into(), + underlay: link_local_underlay, + vni, + }); + assert!( + result.is_err(), + "Expected link-local underlay (ff02::) to be rejected" + ); + + Ok(()) +} + +#[test] +fn test_reject_global_underlay_ff0e() -> Result<()> { + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + let global_underlay = Ipv6Addr::from([ + 0xff, 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 99, + ]); + let result = hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { + group: mcast_group.into(), + underlay: global_underlay, + vni, + }); + assert!( + result.is_err(), + "Expected global underlay (ff0e::) to be rejected" + ); + + Ok(()) +} + +#[test] +fn test_accept_admin_local_underlay_ff04() -> Result<()> { + let hdl = 
OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + let admin_local = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 99, + ]); + let result = hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { + group: mcast_group.into(), + underlay: admin_local, + vni, + }); + assert!( + result.is_ok(), + "Expected admin-local underlay (ff04::) to be accepted" + ); + + Ok(()) +} + +#[test] +fn test_accept_site_local_underlay_ff05() -> Result<()> { + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + let site_local = Ipv6Addr::from([ + 0xff, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 99, + ]); + let result = hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { + group: mcast_group.into(), + underlay: site_local, + vni, + }); + assert!( + result.is_ok(), + "Expected site-local underlay (ff05::) to be accepted" + ); + + Ok(()) +} + +#[test] +fn test_accept_org_local_underlay_ff08() -> Result<()> { + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + let org_local = Ipv6Addr::from([ + 0xff, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 99, + ]); + let result = hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { + group: mcast_group.into(), + underlay: org_local, + vni, + }); + assert!( + result.is_ok(), + "Expected org-local underlay (ff08::) to be accepted" + ); + + Ok(()) +} + +#[test] +fn test_reject_wrong_vni() -> Result<()> { + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 100]); + let underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 100, + ]); + + let wrong_vni = Vni::new(1701u32)?; + let result = 
hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { + group: mcast_group.into(), + underlay, + vni: wrong_vni, + }); + assert!( + result.is_err(), + "Expected VNI 1701 to be rejected (must use DEFAULT_MULTICAST_VNI), got: {:?}", + result + ); + + Ok(()) +} + +#[test] +fn test_accept_default_multicast_vni() -> Result<()> { + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 100]); + let underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 100, + ]); + + let correct_vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + let result = hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { + group: mcast_group.into(), + underlay, + vni: correct_vni, + }); + assert!( + result.is_ok(), + "Expected DEFAULT_MULTICAST_VNI (77) to be accepted" + ); + + Ok(()) +} + +#[test] +fn test_multicast_rx_no_relay_loop() -> Result<()> { + // Test RX loop-prevention: packets arriving from underlay with + // Replication::Underlay should NOT be re-relayed back to underlay. + // This prevents infinite relay loops. 
+ + let topol = xde_tests::two_node_topology_named("omicron1", "lpa", "lpb")?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 200]); + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + let mcast_underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 200, + ]); + + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay, vni)?; + + let node_b_underlay = Ipv6Addr::from([ + 0xfd, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, + ]); + + // Set up forwarding with Underlay replication + mcast.set_forwarding(vec![( + NextHopV6::new(node_b_underlay, vni), + Replication::Underlay, + )])?; + + let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + node.port.subscribe_multicast(mcast_group.into())?; + } + + // Snoop the underlay to verify NO re-relay happens + let underlay_dev = "xde_test_sim1"; + let mut snoop_underlay = + SnoopGuard::start(underlay_dev, "ip6 and udp port 6081")?; + + // Simulate receiving a multicast packet FROM the underlay + // with Replication::Underlay already set (indicating it came from another host). + // Build a Geneve packet with the Underlay replication bit set. + let hdl = OpteHdl::open()?; + + // We need to inject a packet on the underlay that looks like it came from + // another host. Unfortunately, we can't easily inject raw packets in the test + // environment without significant plumbing. Instead, we verify the logic + // indirectly by checking that the dtrace probe shows the right behavior. + + // For now, document the expected behavior and add a TODO for full integration + // test once we have packet injection capability. 
+ println!("\n=== RX Loop Prevention Test ==="); + println!("Expected behavior: Packets arriving from underlay with"); + println!("Replication::Underlay should NOT be re-relayed."); + println!("\nThis requires packet injection capability to fully test."); + println!( + "Current implementation checks incoming delivery mode in Geneve options" + ); + println!("and only relays if delivery_mode is Underlay or All."); + + // Verify the multicast forwarding table is set up correctly + let mfwd = hdl.dump_mcast_fwd()?; + println!("\n=== Multicast forwarding table ==="); + for entry in &mfwd.entries { + println!( + " Group: {:?}, Next hops: {:?}", + entry.group, entry.next_hops + ); + } + + // Since we can't inject packets easily, verify NO spurious underlay traffic + // by waiting to ensure nothing appears on underlay without us sending anything + let snoop_result = snoop_underlay.wait_with_timeout(Duration::from_secs(2)); + + match snoop_result { + Ok(output) => { + let stdout = String::from_utf8_lossy(&output.stdout); + assert!( + stdout.is_empty(), + "No multicast traffic should appear on underlay without a sender:\n{stdout}" + ); + } + Err(_) => { + // Timeout is expected - no packets should appear + } + } + + Ok(()) +} diff --git a/xde-tests/tests/multicast_validation.rs b/xde-tests/tests/multicast_validation.rs new file mode 100644 index 00000000..5d472281 --- /dev/null +++ b/xde-tests/tests/multicast_validation.rs @@ -0,0 +1,239 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2025 Oxide Computer Company + +//! Validation tests covering multicast operations. 
+ +use anyhow::Context; +use anyhow::Result; +use opte_ioctl::OpteHdl; +use oxide_vpc::api::ClearMcast2PhysReq; +use oxide_vpc::api::IpCidr; +use oxide_vpc::api::Ipv4Addr; +use oxide_vpc::api::Ipv6Addr; +use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastUnsubscribeReq; +use oxide_vpc::api::NextHopV6; +use oxide_vpc::api::Replication; +use oxide_vpc::api::Vni; +use std::time::Duration; +use xde_tests::MulticastGroup; +use xde_tests::SnoopGuard; + +#[test] +fn test_subscribe_nonexistent_port() -> Result<()> { + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 100]); + + // Try to subscribe non-existent port + let result = hdl.mcast_subscribe(&McastSubscribeReq { + port_name: "this_port_does_not_exist_anywhere".to_string(), + group: mcast_group.into(), + }); + + // Should return error, not panic or succeed + assert!( + result.is_err(), + "Expected error when subscribing non-existent port, got Ok" + ); + + Ok(()) +} + +#[test] +fn test_subscribe_unicast_ip_as_group() -> Result<()> { + let topol = xde_tests::two_node_topology_named("omicron1", "unia", "unib")?; + let hdl = OpteHdl::open()?; + + // Try to subscribe to unicast IP (not multicast) - should be rejected + let unicast_ip = Ipv4Addr::from([10, 0, 0, 1]); + let result = hdl.mcast_subscribe(&McastSubscribeReq { + port_name: topol.nodes[0].port.name().to_string(), + group: unicast_ip.into(), + }); + + // Should reject non-multicast addresses + match result { + Ok(_) => { + panic!("Expected error when subscribing to unicast IP, got Ok") + } + Err(e) => { + assert!( + format!("{e:?}").contains("not a multicast address"), + "Expected 'not a multicast address' error, got: {e:?}", + ); + } + } + + Ok(()) +} + +#[test] +fn test_double_subscribe() -> Result<()> { + let topol = xde_tests::two_node_topology_named("omicron1", "dsa", "dsb")?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 101]); + const MCAST_PORT: u16 = 9999; // Avoid mDNS port 5353 + let vni = 
Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + let underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 101, + ]); + + let mcast = MulticastGroup::new(mcast_group.into(), underlay, vni)?; + + let node_b_underlay = Ipv6Addr::from([ + 0xfd, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, + ]); + + mcast.set_forwarding(vec![( + NextHopV6::new(node_b_underlay, vni), + Replication::External, + )])?; + + let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + } + + // Subscribe once + topol.nodes[1].port.subscribe_multicast(mcast_group.into())?; + + // Subscribe again (should be idempotent) + let result = topol.nodes[1].port.subscribe_multicast(mcast_group.into()); + + // Should succeed (idempotent operation) + assert!( + result.is_ok(), + "Double subscribe should be idempotent, got error: {:?}", + result + ); + + // Verify delivery works and packet is NOT duplicated + let filter = format!("udp and ip dst {mcast_group} and port {MCAST_PORT}"); + let mut snoop = SnoopGuard::start(topol.nodes[1].port.name(), &filter)?; + + topol.nodes[0].zone.zone.zexec(&format!( + "echo 'test' | nc -u -w1 {mcast_group} {MCAST_PORT}" + ))?; + + let output = snoop + .wait_with_timeout(Duration::from_secs(5)) + .context("Timeout waiting for multicast delivery")?; + + let stdout = String::from_utf8_lossy(&output.stdout); + + // Verify packet received + assert!( + output.status.success() && stdout.contains("UDP"), + "Should receive multicast after double subscribe:\n{stdout}" + ); + + // Count occurrences - should be 1, not 2 (no duplication) + let count = stdout.matches("UDP").count(); + assert!( + count == 1, + "Packet should be delivered once, not duplicated. 
Found {count} deliveries" + ); + + Ok(()) +} + +#[test] +fn test_unsubscribe_never_subscribed() -> Result<()> { + let topol = xde_tests::two_node_topology_named("omicron1", "usa", "usb")?; + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 102]); + + // Try to unsubscribe without ever subscribing + let result = hdl.mcast_unsubscribe(&McastUnsubscribeReq { + port_name: topol.nodes[0].port.name().to_string(), + group: mcast_group.into(), + }); + + // Expected: Ok (no-op). Unsubscribe is idempotent for existing ports. + assert!( + result.is_ok(), + "Unsubscribe should be a no-op (Ok), got: {result:?}" + ); + + Ok(()) +} + +#[test] +fn test_subscribe_then_clear_m2p() -> Result<()> { + let topol = xde_tests::two_node_topology_named("omicron1", "sca", "scb")?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 103]); + const MCAST_PORT: u16 = 9999; // Avoid mDNS port 5353 + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + let underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 103, + ]); + + let mcast = MulticastGroup::new(mcast_group.into(), underlay, vni)?; + + let node_b_underlay = Ipv6Addr::from([ + 0xfd, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, + ]); + + mcast.set_forwarding(vec![( + NextHopV6::new(node_b_underlay, vni), + Replication::External, + )])?; + + let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + } + + topol.nodes[1].port.subscribe_multicast(mcast_group.into())?; + + // Clear M2P while subscription active + let hdl = OpteHdl::open()?; + hdl.clear_m2p(&ClearMcast2PhysReq { + group: mcast_group.into(), + underlay, + vni, + })?; + + // Start snoops to verify NO delivery occurs after M2P clear + let dev_name_b = topol.nodes[1].port.name().to_string(); + let filter_local = + format!("udp and ip dst {mcast_group} and 
port {MCAST_PORT}"); + let mut snoop_local = SnoopGuard::start(&dev_name_b, &filter_local)?; + + let underlay_dev = "xde_test_sim1"; + let mut snoop_underlay = + SnoopGuard::start(underlay_dev, "ip6 and udp port 6081")?; + + // Send packet - command should execute successfully regardless of delivery + let result = topol.nodes[0] + .zone + .zone + .zexec(&format!("echo 'test' | nc -u -w1 {mcast_group} {MCAST_PORT}")); + + // Expected: Ok (command executed). Delivery should NOT occur. + assert!(result.is_ok(), "Send after M2P clear should succeed: {result:?}"); + + // Verify no local delivery + if let Ok(out) = snoop_local.wait_with_timeout(Duration::from_secs(2)) { + let stdout = String::from_utf8_lossy(&out.stdout); + panic!("No local delivery expected; got:\n{stdout}"); + } + + // Verify no underlay forwarding (encap denied without M2P) + if let Ok(out) = snoop_underlay.wait_with_timeout(Duration::from_secs(2)) { + let stdout = String::from_utf8_lossy(&out.stdout); + panic!( + "No underlay forwarding expected after M2P clear; got:\n{stdout}" + ); + } + + Ok(()) +} diff --git a/xde/src/dev_map.rs b/xde/src/dev_map.rs index 4cc00f50..43e95e9e 100644 --- a/xde/src/dev_map.rs +++ b/xde/src/dev_map.rs @@ -11,12 +11,13 @@ use alloc::collections::btree_map::Entry; use alloc::collections::btree_set::BTreeSet; use alloc::string::String; use alloc::sync::Arc; -use opte::api::Ipv6Addr; +use opte::api::IpAddr; use opte::api::MacAddr; use opte::api::OpteError; use opte::api::Vni; use opte::ddi::sync::KRwLock; use opte::ddi::sync::KRwLockReadGuard; +use opte::ddi::sync::KRwLockWriteGuard; /// A map/set lookup key for ports indexed on `(Vni, MacAddr)`. 
/// @@ -31,6 +32,11 @@ impl VniMac { pub fn new(vni: Vni, mac: MacAddr) -> Self { VniMac(vni.as_u32(), mac_to_u64(mac)) } + + #[inline] + pub fn vni(&self) -> Vni { + Vni::new(self.0).expect("VniMac contains valid VNI") + } } type Dev = Arc; @@ -45,7 +51,7 @@ type Dev = Arc; pub struct DevMap { devs: BTreeMap, names: BTreeMap, - mcast_groups: BTreeMap>, + mcast_groups: BTreeMap>, } impl Default for DevMap { @@ -75,17 +81,32 @@ impl DevMap { /// Remove an `XdeDev` using its name. pub fn remove(&mut self, name: &str) -> Option { let key = get_key(&self.names.remove(name)?); + + // Clean up all multicast group subscriptions for this port + self.mcast_groups.retain(|_group, subscribers| { + subscribers.remove(&key); + !subscribers.is_empty() + }); + self.devs.remove(&key) } /// Allow a port to receive on a given multicast group. /// /// This takes the overlay (outer v6) multicast group address. - pub fn multicast_subscribe( + pub fn mcast_subscribe( &mut self, name: &str, - mcast_ip: Ipv6Addr, + mcast_ip: IpAddr, ) -> Result<(), OpteError> { + // Validate that the IP is actually a multicast address + if !mcast_ip.is_multicast() { + return Err(OpteError::BadState(format!( + "IP address {} is not a multicast address", + mcast_ip + ))); + } + let port = self .get_by_name(name) .ok_or_else(|| OpteError::PortNotFound(name.into()))?; @@ -98,10 +119,10 @@ impl DevMap { } /// Rescind a port's ability to receive on a given multicast group. - pub fn multicast_unsubscribe( + pub fn mcast_unsubscribe( &mut self, name: &str, - mcast_ip: Ipv6Addr, + mcast_ip: IpAddr, ) -> Result<(), OpteError> { let port = self .get_by_name(name) @@ -117,9 +138,9 @@ impl DevMap { } /// Find the keys for all ports who want to receive a given multicast packet. 
- pub fn multicast_listeners( + pub fn mcast_listeners( &self, - mcast_ip: &Ipv6Addr, + mcast_ip: &IpAddr, ) -> Option> { self.mcast_groups.get(mcast_ip).map(|v| v.iter()) } @@ -190,4 +211,8 @@ impl ReadOnlyDevMap { pub fn read(&self) -> KRwLockReadGuard<'_, DevMap> { self.0.read() } + + pub fn write(&self) -> KRwLockWriteGuard<'_, DevMap> { + self.0.write() + } } diff --git a/xde/src/stats.rs b/xde/src/stats.rs index 53a57076..02518ab8 100644 --- a/xde/src/stats.rs +++ b/xde/src/stats.rs @@ -55,9 +55,72 @@ pub struct XdeStats { out_drop_misc: KStatU64, // NOTE: tun_opt is not relevant to outbound packets -- no encapsulation // is in use. + /// The number of multicast packets delivered to external/customer + /// members (decapsulated packets to local guest instances). + mcast_tx_external: KStatU64, + /// The number of multicast packets forwarded to underlay/infrastructure + /// members (encapsulated Geneve packets to infrastructure destinations). + mcast_tx_underlay: KStatU64, + /// The number of times a stale multicast listener was encountered + /// during external delivery. + mcast_tx_stale_external: KStatU64, + + /// The number of multicast packets received and delivered to external/customer + /// members (decapsulated packets to local guest instances). + mcast_rx_external: KStatU64, + /// The number of multicast packets received and forwarded to underlay/infrastructure + /// members (re-encapsulated Geneve packets to infrastructure destinations). + mcast_rx_underlay: KStatU64, + /// The number of times a stale multicast listener was encountered + /// during Rx external delivery. + mcast_rx_stale_external: KStatU64, + /// The number of multicast packets received with no forwarding entry. + mcast_rx_no_fwd_entry: KStatU64, + /// The number of times a pullup operation failed during multicast TX + /// (packet replication), causing a packet to be dropped. 
+ mcast_tx_pullup_fail: KStatU64, + /// The number of times a pullup operation failed during multicast RX + /// (packet delivery/relay), causing a packet to be dropped. + mcast_rx_pullup_fail: KStatU64, } impl XdeStats { + pub fn mcast_tx_external(&self) -> &KStatU64 { + &self.mcast_tx_external + } + + pub fn mcast_tx_underlay(&self) -> &KStatU64 { + &self.mcast_tx_underlay + } + + pub fn mcast_tx_stale_external(&self) -> &KStatU64 { + &self.mcast_tx_stale_external + } + + pub fn mcast_rx_external(&self) -> &KStatU64 { + &self.mcast_rx_external + } + + pub fn mcast_rx_underlay(&self) -> &KStatU64 { + &self.mcast_rx_underlay + } + + pub fn mcast_rx_stale_external(&self) -> &KStatU64 { + &self.mcast_rx_stale_external + } + + pub fn mcast_rx_no_fwd_entry(&self) -> &KStatU64 { + &self.mcast_rx_no_fwd_entry + } + + pub fn mcast_tx_pullup_fail(&self) -> &KStatU64 { + &self.mcast_tx_pullup_fail + } + + pub fn mcast_rx_pullup_fail(&self) -> &KStatU64 { + &self.mcast_rx_pullup_fail + } + pub fn parse_error(&self, dir: Direction, err: &ParseError) { use Direction::*; (match (dir, err) { diff --git a/xde/src/xde.rs b/xde/src/xde.rs index 9ce7b6de..7267e581 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -153,6 +153,7 @@ use crate::sys::ncpus; use crate::warn; use alloc::borrow::ToOwned; use alloc::boxed::Box; +use alloc::collections::BTreeMap; use alloc::ffi::CString; use alloc::string::String; use alloc::string::ToString; @@ -168,6 +169,7 @@ use core::ptr::addr_of_mut; use core::time::Duration; use illumos_sys_hdrs::mac::MacEtherOffloadFlags; use illumos_sys_hdrs::mac::MblkOffloadFlags; +use illumos_sys_hdrs::mac::mac_ether_offload_info_t; use illumos_sys_hdrs::*; use ingot::geneve::Geneve; use ingot::geneve::GeneveOpt; @@ -224,21 +226,32 @@ use opte::engine::port::PortBuilder; use opte::engine::port::ProcessResult; use oxide_vpc::api::AddFwRuleReq; use oxide_vpc::api::AddRouterEntryReq; +use oxide_vpc::api::ClearMcast2PhysReq; +use 
oxide_vpc::api::ClearMcastForwardingReq; use oxide_vpc::api::ClearVirt2BoundaryReq; use oxide_vpc::api::ClearVirt2PhysReq; use oxide_vpc::api::CreateXdeReq; +use oxide_vpc::api::DEFAULT_MULTICAST_VNI; use oxide_vpc::api::DelRouterEntryReq; use oxide_vpc::api::DelRouterEntryResp; use oxide_vpc::api::DeleteXdeReq; use oxide_vpc::api::DhcpCfg; +use oxide_vpc::api::DumpMcastForwardingResp; use oxide_vpc::api::DumpVirt2BoundaryResp; use oxide_vpc::api::DumpVirt2PhysResp; use oxide_vpc::api::ListPortsResp; +use oxide_vpc::api::McastForwardingEntry; +use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastUnsubscribeReq; +use oxide_vpc::api::NextHopV6; use oxide_vpc::api::PhysNet; use oxide_vpc::api::PortInfo; use oxide_vpc::api::RemFwRuleReq; use oxide_vpc::api::RemoveCidrResp; +use oxide_vpc::api::Replication; use oxide_vpc::api::SetFwRulesReq; +use oxide_vpc::api::SetMcast2PhysReq; +use oxide_vpc::api::SetMcastForwardingReq; use oxide_vpc::api::SetVirt2BoundaryReq; use oxide_vpc::api::SetVirt2PhysReq; use oxide_vpc::cfg::IpCfg; @@ -288,6 +301,30 @@ unsafe extern "C" { dst_port: uintptr_t, ); pub safe fn __dtrace_probe_hdlr__resp(resp_str: uintptr_t); + pub safe fn __dtrace_probe_mcast__tx( + af: uintptr_t, // AF_INET or AF_INET6 + inner_dst: uintptr_t, // *const Ipv4Addr or *const Ipv6Addr + vni: uintptr_t, + replication: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__rx( + af: uintptr_t, + inner_dst: uintptr_t, + vni: uintptr_t, + replication: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__local__delivery( + af: uintptr_t, + inner_dst: uintptr_t, + vni: uintptr_t, + port: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__underlay__fwd( + af: uintptr_t, + inner_dst: uintptr_t, + vni: uintptr_t, + next_hop: *const oxide_vpc::api::Ipv6Addr, + ); } fn bad_packet_parse_probe( @@ -380,6 +417,10 @@ struct XdeState { struct XdeMgmt { devs: Arc>, underlay: Option, + /// XDE-wide multicast forwarding table mapping multicast group addresses + /// to their 
physical next hops with replication information. + /// Maps: IpAddr (overlay multicast group) -> BTreeMap + mcast_fwd: Arc>>>, } #[derive(Clone)] @@ -403,6 +444,7 @@ fn get_xde_state() -> &'static XdeState { impl XdeState { fn new() -> Self { + #[allow(clippy::arc_with_non_send_sync)] let ectx = Arc::new(ExecCtx { log: Box::new(opte::KernelLog {}) }); let dev_map = Arc::new(KRwLock::new(DevMap::default())); let devs = ReadOnlyDevMap::new(dev_map.clone()); @@ -411,6 +453,7 @@ impl XdeState { management_lock: TokenLock::new(XdeMgmt { devs: dev_map, underlay: None, + mcast_fwd: Arc::new(KRwLock::new(BTreeMap::new())), }), devs, ectx, @@ -871,6 +914,41 @@ unsafe extern "C" fn xde_ioc_opte_cmd(karg: *mut c_void, mode: c_int) -> c_int { let resp = remove_cidr_hdlr(&mut env); hdlr_resp(&mut env, resp) } + + OpteCmd::SetMcastForwarding => { + let resp = set_mcast_forwarding_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } + + OpteCmd::ClearMcastForwarding => { + let resp = clear_mcast_forwarding_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } + + OpteCmd::DumpMcastForwarding => { + let resp = dump_mcast_forwarding_hdlr(); + hdlr_resp(&mut env, resp) + } + + OpteCmd::McastSubscribe => { + let resp = mcast_subscribe_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } + + OpteCmd::McastUnsubscribe => { + let resp = mcast_unsubscribe_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } + + OpteCmd::SetMcast2Phys => { + let resp = set_m2p_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } + + OpteCmd::ClearMcast2Phys => { + let resp = clear_m2p_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } } } @@ -1239,6 +1317,9 @@ fn clear_xde_underlay() -> Result { }); } + // Clear multicast forwarding table to release any references + token.mcast_fwd.write().clear(); + if let Some(underlay) = token.underlay.take() { // If the underlay references have leaked/spread beyond `XdeDev`s and not // been cleaned up, we committed have a fatal programming error. 
@@ -1866,6 +1947,703 @@ fn guest_loopback( } } +/// Locate the Oxide Multicast Geneve option and return the offset to its body. +/// +/// Walks through Geneve options starting at `geneve_offset + 8` to find the +/// Oxide Multicast option (class=0x0129, type=0x01). Returns the offset to the +/// option body (after the 4-byte option header) if found. +/// +/// # Safety +/// This function validates option headers as it walks to avoid reading beyond +/// packet boundaries. Returns None if the option is not found or if validation fails. +/// +/// # Geneve Option Format +/// Each option consists of: +/// - 2 bytes: Option class +/// - 1 byte: Flags (bit 7=critical) + Type (bits 0-6) +/// - 1 byte: Reserved (3 bits) + Length in 4-byte words (5 bits) +/// - N bytes: Option data (N = length field * 4) +fn find_mcast_option_offset( + pkt: &MsgBlk, + geneve_offset: usize, +) -> Option { + const GENEVE_HDR_LEN: usize = 8; + const OPT_HDR_LEN: usize = 4; + const OXIDE_OPT_CLASS: u16 = 0x0129; + const MULTICAST_OPT_TYPE: u8 = 0x01; + + // Read Geneve header to get option length + let geneve_hdr = pkt.get(geneve_offset..geneve_offset + GENEVE_HDR_LEN)?; + let opt_len_words = (geneve_hdr[0] & 0x3F) as usize; // Bottom 6 bits of first byte + + if opt_len_words == 0 { + return None; // No options present + } + + let opts_start = geneve_offset + GENEVE_HDR_LEN; + let opts_end = opts_start + (opt_len_words * 4); + + // Belt-and-braces: ensure options area doesn't exceed packet length + if opts_end > pkt.len() { + return None; + } + + let mut offset = opts_start; + + while offset + OPT_HDR_LEN <= opts_end { + let opt_hdr = pkt.get(offset..offset + OPT_HDR_LEN)?; + + let class = u16::from_be_bytes([opt_hdr[0], opt_hdr[1]]); + let opt_type = opt_hdr[2] & 0x7F; // Mask out critical bit + let opt_data_words = (opt_hdr[3] & 0x1F) as usize; // Bottom 5 bits + let opt_data_len = opt_data_words * 4; + + if class == OXIDE_OPT_CLASS && opt_type == MULTICAST_OPT_TYPE { + // Found it! 
Return offset to option body + return Some(offset + OPT_HDR_LEN); + } + + // Move to next option + offset += OPT_HDR_LEN + opt_data_len; + } + + None +} + +/// Update the Oxide Multicast Geneve option's replication field. +/// +/// Locates the multicast option and rewrites the replication strategy in the +/// first byte of the option body (top 2 bits encode the replication mode). +/// +/// Returns `true` if the option was found and updated, `false` otherwise. +/// +/// # Replication Encoding +/// The replication field uses the top 2 bits of the first byte: +/// - `External` (0): 0x00 +/// - `Underlay` (1): 0x40 +/// - `All` (2): 0x80 +/// - `Reserved` (3): 0xC0 +#[inline] +fn update_mcast_replication( + pkt: &mut MsgBlk, + geneve_offset: usize, + replication: Replication, +) -> bool { + let Some(mcast_body_off) = find_mcast_option_offset(pkt, geneve_offset) + else { + return false; + }; + + let Some(rep_byte) = pkt.get_mut(mcast_body_off..mcast_body_off + 1) else { + return false; + }; + + // Encode replication in top 2 bits, preserve bottom 6 bits + let repl_bits = (replication as u8) << 6; + rep_byte[0] = (rep_byte[0] & 0x3F) | repl_bits; + true +} + +/// Compute the combined replication strategy from a set of next hops. +/// +/// Starts from the first hop's replication and folds the rest using +/// `Replication::merge()` to avoid biasing toward `External`. +/// Returns `None` if `next_hops` is empty. 
+#[inline] +fn compute_replication_strategy( + next_hops: &BTreeMap, +) -> Option { + let mut acc: Option = None; + for repl in next_hops.values().copied() { + acc = Some(match acc { + None => repl, + Some(cur) => cur.merge(repl), + }); + } + acc +} + +struct MulticastTxContext<'a> { + inner_dst: oxide_vpc::api::IpAddr, // Inner/overlay destination IP + vni: Vni, + out_pkt: &'a MsgBlk, + encap_len: u32, + inner_eth_len: usize, + non_eth_payl_bytes: u32, + tun_meoi: &'a mac_ether_offload_info_t, + l4_hash: u32, +} + +struct MulticastRxContext<'a> { + inner_dst: oxide_vpc::api::IpAddr, // Inner/overlay destination IP + vni: Vni, + pkt: &'a MsgBlk, + pullup_len: usize, + geneve_offset: usize, + incoming_delivery_mode: Option, +} + +/// Handle multicast packet forwarding for both external/customer and +/// underlay/infrastructure delivery based on the XDE-wide multicast +/// forwarding table. +/// +/// - External: Customer-facing members, local guest instances (decapsulated) +/// - Underlay: Infrastructure members, underlay destinations (encapsulated Geneve) +fn handle_mcast_tx<'a>( + ctx: MulticastTxContext, + src_dev: &'a XdeDev, + postbox: &mut TxPostbox, + entry_state: &mut Option>>, +) { + // DTrace probe: capture TX entry + let (af, inner_addr_ptr) = match &ctx.inner_dst { + oxide_vpc::api::IpAddr::Ip4(v4) => { + (2usize, AsRef::<[u8]>::as_ref(v4).as_ptr() as uintptr_t) + } + oxide_vpc::api::IpAddr::Ip6(v6) => { + (26usize, AsRef::<[u8]>::as_ref(v6).as_ptr() as uintptr_t) + } + }; + + // Determine replication strategy from XDE-wide multicast forwarding table + let xde = get_xde_state(); + let mgmt = xde.management_lock.lock(); + let mcast_fwd = mgmt.mcast_fwd.read(); + + // Compute combined replication strategy from all next hops to govern local delivery. 
+ let delivery_mode = mcast_fwd + .get(&ctx.inner_dst) + .and_then(compute_replication_strategy) + .unwrap_or(Replication::External); + + // Drop locks before potentially expensive operations + drop(mcast_fwd); + drop(mgmt); + + // DTrace probe: multicast TX entry with delivery mode + let (af, addr_ptr) = match &ctx.inner_dst { + oxide_vpc::api::IpAddr::Ip4(v4) => { + (2usize, AsRef::<[u8]>::as_ref(v4).as_ptr() as uintptr_t) + } + oxide_vpc::api::IpAddr::Ip6(v6) => { + (26usize, AsRef::<[u8]>::as_ref(v6).as_ptr() as uintptr_t) + } + }; + __dtrace_probe_mcast__tx( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + delivery_mode as uintptr_t, + ); + + // External/customer delivery if delivery mode is External or All + // Delivers decapsulated packets to customer-facing members in the same VNI + let do_external = matches!( + delivery_mode, + oxide_vpc::api::Replication::External + | oxide_vpc::api::Replication::All + ); + + if do_external { + let entry_state = + entry_state.get_or_insert_with(|| src_dev.port_map.read()); + if let Some(others) = entry_state.mcast_listeners(&ctx.inner_dst) { + let my_key = VniMac::new(ctx.vni, src_dev.port.mac_addr()); + for el in others { + // Filter by VNI - only deliver to listeners in the same VNI + if el.vni() != ctx.vni { + continue; + } + if my_key == *el { + continue; + } + + // This is a more lightweight clone in illumos, and + // gives us an owned form of the headers but a ref + // counted clone of the packet body. + // + // If there are any body transforms internally, OPTE + // will fully clone out the contents if required. 
+ let pullup_len = (ctx.encap_len as usize) + + (ctx.non_eth_payl_bytes as usize) + + ctx.inner_eth_len; + let Ok(my_pkt) = + ctx.out_pkt.pullup(NonZeroUsize::new(pullup_len)) + else { + opte::engine::dbg!( + "mcast TX external pullup failed: requested {} bytes", + pullup_len + ); + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_pullup_fail().incr(1); + continue; + }; + match entry_state.get_by_key(*el) { + Some(dev) => { + // DTrace probe: local delivery + let (af, addr_ptr) = match &ctx.inner_dst { + oxide_vpc::api::IpAddr::Ip4(v4) => ( + 2usize, + AsRef::<[u8]>::as_ref(v4).as_ptr() as uintptr_t, + ), + oxide_vpc::api::IpAddr::Ip6(v6) => ( + 26usize, + AsRef::<[u8]>::as_ref(v6).as_ptr() as uintptr_t, + ), + }; + __dtrace_probe_mcast__local__delivery( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + dev.port.name_cstr().as_ptr() as uintptr_t, + ); + guest_loopback(src_dev, dev, *el, my_pkt, postbox); + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_external().incr(1); + } + None => { + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_stale_external().incr(1); + } + } + } + } + } + + // Underlay/infrastructure forwarding only if the merged delivery mode + // calls for it. External-only means local delivery only, no underlay fanout. 
+ let do_underlay = matches!( + delivery_mode, + oxide_vpc::api::Replication::Underlay + | oxide_vpc::api::Replication::All + ); + + if do_underlay { + // Re-acquire locks for underlay forwarding + let xde = get_xde_state(); + let mgmt = xde.management_lock.lock(); + let mcast_fwd = mgmt.mcast_fwd.read(); + + if let Some(next_hops) = mcast_fwd.get(&ctx.inner_dst) { + // We found forwarding entries, replicate to each next hop + for (next_hop, replication) in next_hops.iter() { + // Clone packet with headers using pullup + let pullup_len = (ctx.encap_len as usize) + + (ctx.non_eth_payl_bytes as usize) + + ctx.inner_eth_len; + let Ok(mut fwd_pkt) = + ctx.out_pkt.pullup(NonZeroUsize::new(pullup_len)) + else { + opte::engine::dbg!( + "mcast TX underlay pullup failed: requested {} bytes", + pullup_len + ); + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_pullup_fail().incr(1); + continue; // Skip this destination on allocation failure + }; + + // Modify VNI in Geneve header to next_hop.vni + // Geneve header follows outer Ethernet + IPv6 + UDP + let geneve_offset = usize::from(ctx.tun_meoi.meoi_l2hlen) + + usize::from(ctx.tun_meoi.meoi_l3hlen) + + usize::from(ctx.tun_meoi.meoi_l4hlen); + + // Determine the actual outer IPv6 destination and whether to modify it + // - External: Override with unicast next_hop.addr for delivery to specific host + // - Underlay/All: Keep the multicast underlay address from OPTE (already set via M2P) + let ipv6_offset = usize::from(ctx.tun_meoi.meoi_l2hlen); + let actual_outer_dst = match replication { + oxide_vpc::api::Replication::External => { + // External replication: override with unicast destination + let ipv6_dst_offset = ipv6_offset + 24; + if let Some(dst_bytes) = fwd_pkt + .get_mut(ipv6_dst_offset..ipv6_dst_offset + 16) + { + dst_bytes.copy_from_slice(AsRef::<[u8]>::as_ref( + &next_hop.addr, + )); + } + next_hop.addr // Use unicast address for routing + } + oxide_vpc::api::Replication::Underlay + | 
oxide_vpc::api::Replication::All => { + // Underlay/All replication: The packet already has the correct + // multicast underlay address from OPTE's M2P mapping. + // Do NOT override it - just get it for route lookup + let xde = get_xde_state(); + match xde + .vpc_map + .get_mcast_underlay(ctx.vni, ctx.inner_dst) + { + Some(mcast_ul) => mcast_ul.addr(), // Use multicast address for routing + None => { + // No M2P mapping - skip this destination + continue; + } + } + } + _ => { + // Reserved or unknown replication type - skip + continue; + } + }; + + // VNI is at offset 4 in Geneve header (3 bytes) + if let Some(vni_bytes) = + fwd_pkt.get_mut(geneve_offset + 4..geneve_offset + 7) + { + let vni_be = next_hop.vni.as_u32().to_be_bytes(); + vni_bytes.copy_from_slice(&vni_be[1..4]); // VNI is 24 bits + } + // Update Geneve multicast option to reflect underlay replication to prevent re-relay loops. + update_mcast_replication( + &mut fwd_pkt, + geneve_offset, + *replication, + ); + + // Route lookup for next hop to get outer MAC addresses + // Use the actual_outer_dst we determined above + let route_key = RouteKey { + dst: actual_outer_dst, + l4_hash: Some(ctx.l4_hash), + }; + let Route { src: mac_src, dst: mac_dst, underlay_idx } = + src_dev.routes.next_hop(route_key, src_dev); + + // Fill in outer MAC addresses + let final_pkt = unsafe { + let mblk = fwd_pkt.unwrap_mblk().as_ptr(); + let rptr = (*mblk).b_rptr; + ptr::copy(mac_dst.as_ptr(), rptr, 6); + ptr::copy(mac_src.as_ptr(), rptr.add(6), 6); + + // Note: The outer IPv6 destination was already set correctly in fwd_pkt + // based on the replication type, and we used the correct address for + // route lookup, so no need to modify it here. 
+ + MsgBlk::wrap_mblk(mblk).unwrap() + }; + + // DTrace probe: underlay forwarding + let (af, addr_ptr) = match &ctx.inner_dst { + oxide_vpc::api::IpAddr::Ip4(v4) => { + (2usize, v4 as *const _ as uintptr_t) + } + oxide_vpc::api::IpAddr::Ip6(v6) => { + (26usize, v6 as *const _ as uintptr_t) + } + }; + __dtrace_probe_mcast__underlay__fwd( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + &next_hop.addr, + ); + + // Send to underlay + postbox.post_underlay( + underlay_idx, + TxHint::from_crc32(ctx.l4_hash), + final_pkt, + ); + + // Increment underlay forwarding stat + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_underlay().incr(1); + } + + // Release locks + drop(mcast_fwd); + drop(mgmt); + } + } +} + +/// Handle multicast packet reception from the underlay. +/// +/// This function processes incoming multicast packets and: +/// - Delivers to external/customer members in the same VNI (local listeners) +/// - Optionally forwards to underlay/infrastructure members (if acting as relay) +/// +/// Unlike Tx path which originates from a port, Rx path receives from underlay +/// and needs to determine all appropriate destinations. 
+fn handle_mcast_rx( + ctx: MulticastRxContext, + stream: &DlsStream, + devs: &DevMap, + postbox: &mut Postbox, +) { + // DTrace probe: multicast RX entry + let (af, addr_ptr) = match &ctx.inner_dst { + oxide_vpc::api::IpAddr::Ip4(v4) => { + (2usize, v4 as *const _ as uintptr_t) + } + oxide_vpc::api::IpAddr::Ip6(v6) => { + (26usize, v6 as *const _ as uintptr_t) + } + }; + __dtrace_probe_mcast__rx( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + ctx.incoming_delivery_mode.map(|r| r as uintptr_t).unwrap_or(0), + ); + + // Determine replication strategy from XDE-wide multicast forwarding table + let xde = get_xde_state(); + let mgmt = xde.management_lock.lock(); + let mcast_fwd = mgmt.mcast_fwd.read(); + + // Compute combined replication strategy from all next hops + let has_fwd_entry = mcast_fwd.get(&ctx.inner_dst).is_some(); + let delivery_mode = mcast_fwd + .get(&ctx.inner_dst) + .and_then(compute_replication_strategy) + .unwrap_or(Replication::External); + + // Drop locks before potentially expensive operations + drop(mcast_fwd); + drop(mgmt); + + // If no forwarding entry exists, check for local listeners only + if !has_fwd_entry { + if let Some(ports) = devs.mcast_listeners(&ctx.inner_dst) { + // Deliver to local listeners in the same VNI only + for el in ports { + // Filter by VNI - only deliver to listeners in the incoming packet's VNI + if el.vni() != ctx.vni { + continue; + } + + let Ok(my_pkt) = + ctx.pkt.pullup(NonZeroUsize::new(ctx.pullup_len)) + else { + opte::engine::dbg!( + "mcast RX external pullup failed: requested {} bytes", + ctx.pullup_len + ); + let xde = get_xde_state(); + xde.stats.vals.mcast_rx_pullup_fail().incr(1); + continue; + }; + match devs.get_by_key(*el) { + Some(dev) => { + // DTrace probe: RX local delivery + let (af, addr_ptr) = match &ctx.inner_dst { + oxide_vpc::api::IpAddr::Ip4(v4) => ( + 2usize, + AsRef::<[u8]>::as_ref(v4).as_ptr() as uintptr_t, + ), + oxide_vpc::api::IpAddr::Ip6(v6) => ( + 26usize, + 
AsRef::<[u8]>::as_ref(v6).as_ptr() as uintptr_t, + ), + }; + __dtrace_probe_mcast__local__delivery( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + dev.port.name_cstr().as_ptr() as uintptr_t, + ); + xde_rx_one_direct(stream, dev, *el, my_pkt, postbox); + let xde = get_xde_state(); + xde.stats.vals.mcast_rx_external().incr(1); + } + None => { + let xde = get_xde_state(); + xde.stats.vals.mcast_rx_stale_external().incr(1); + } + } + } + } else { + // No forwarding entry and no local listeners + let xde = get_xde_state(); + xde.stats.vals.mcast_rx_no_fwd_entry().incr(1); + } + return; + } + + // External/customer delivery if delivery mode is External or All. + // + // Loop Prevention: If the incoming packet has Underlay or All replication set, + // it means this packet has already been relayed by another host and we should + // NOT deliver it locally. This prevents: + // - Duplicate delivery to local listeners + // - Infinite forwarding loops in the underlay network + let do_external = matches!( + delivery_mode, + oxide_vpc::api::Replication::External + | oxide_vpc::api::Replication::All + ) && !matches!( + ctx.incoming_delivery_mode, + Some(oxide_vpc::api::Replication::Underlay) + | Some(oxide_vpc::api::Replication::All) + ); + + if do_external && let Some(ports) = devs.mcast_listeners(&ctx.inner_dst) { + // Deliver to local listeners in the same VNI only + for el in ports { + // Filter by VNI - only deliver to listeners in the incoming packet's VNI + if el.vni() != ctx.vni { + continue; + } + + let Ok(my_pkt) = ctx.pkt.pullup(NonZeroUsize::new(ctx.pullup_len)) + else { + opte::engine::dbg!( + "mcast RX external pullup failed: requested {} bytes", + ctx.pullup_len + ); + let xde = get_xde_state(); + xde.stats.vals.mcast_rx_pullup_fail().incr(1); + continue; + }; + match devs.get_by_key(*el) { + Some(dev) => { + // DTrace probe: RX local delivery (with forwarding entry) + let (af, addr_ptr) = match &ctx.inner_dst { + oxide_vpc::api::IpAddr::Ip4(v4) => { + 
(2usize, v4 as *const _ as uintptr_t) + } + oxide_vpc::api::IpAddr::Ip6(v6) => { + (26usize, v6 as *const _ as uintptr_t) + } + }; + __dtrace_probe_mcast__local__delivery( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + dev.port.name_cstr().as_ptr() as uintptr_t, + ); + xde_rx_one_direct(stream, dev, *el, my_pkt, postbox); + let xde = get_xde_state(); + xde.stats.vals.mcast_rx_external().incr(1); + } + None => { + let xde = get_xde_state(); + xde.stats.vals.mcast_rx_stale_external().incr(1); + } + } + } + } + + // Underlay/infrastructure forwarding if delivery mode is Underlay or All + // For Rx path, this would mean we're acting as a multicast relay/router + // + // Loop prevention: Don't relay if incoming packet already has Underlay or All + // replication set in its Geneve option, as this indicates it has already been + // relayed by another host. + let should_relay = matches!( + delivery_mode, + oxide_vpc::api::Replication::Underlay + | oxide_vpc::api::Replication::All + ) && !matches!( + ctx.incoming_delivery_mode, + Some(oxide_vpc::api::Replication::Underlay) + | Some(oxide_vpc::api::Replication::All) + ); + + if should_relay { + // Re-acquire locks for underlay forwarding + let xde = get_xde_state(); + let mgmt = xde.management_lock.lock(); + let mcast_fwd = mgmt.mcast_fwd.read(); + + if let Some(next_hops) = mcast_fwd.get(&ctx.inner_dst) { + // Get routing info from any local device (all share same underlay) + let routing_dev = devs.iter().next(); + + for (next_hop, repl) in next_hops.iter() { + // Only forward to underlay destinations + if !matches!( + repl, + oxide_vpc::api::Replication::Underlay + | oxide_vpc::api::Replication::All + ) { + continue; + } + + // Clone the packet for this destination + let Ok(mut fwd_pkt) = + ctx.pkt.pullup(NonZeroUsize::new(ctx.pullup_len)) + else { + opte::engine::dbg!( + "mcast RX underlay relay pullup failed: requested {} bytes", + ctx.pullup_len + ); + let xde = get_xde_state(); + 
xde.stats.vals.mcast_rx_pullup_fail().incr(1); + continue; + }; + + // NOTE: For multicast underlay relaying, we do NOT modify the outer + // IPv6 destination. It's already set to the multicast underlay address + // (e.g., ff04::...224.1.2.4) by OPTE's encapsulation layer. + // The next_hop.addr is only used for routing/MAC lookup, which returns + // MAC addresses without modifying the packet. + + // Modify VNI in Geneve header to next_hop.vni + // Use the Geneve offset calculated from parsed headers to handle VLANs and IPv6 extensions + let geneve_offset = ctx.geneve_offset; + + // VNI is at offset 4 in Geneve header (3 bytes) + if let Some(vni_bytes) = + fwd_pkt.get_mut(geneve_offset + 4..geneve_offset + 7) + { + let vni_be = next_hop.vni.as_u32().to_be_bytes(); + vni_bytes.copy_from_slice(&vni_be[1..4]); // VNI is 24 bits + } + // Mark multicast replication as Underlay/All to avoid re-relay by downstream receivers. + update_mcast_replication(&mut fwd_pkt, geneve_offset, *repl); + + // Compute hash once for both routing and flow distribution + let l4_hash = { + use core::hash::Hash; + let mut hasher = crc32fast::Hasher::new(); + next_hop.addr.hash(&mut hasher); + hasher.finalize() + }; + + // Get routing information if we have a device + let (mac_src, mac_dst) = if let Some(dev) = routing_dev { + let route_key = + RouteKey { dst: next_hop.addr, l4_hash: Some(l4_hash) }; + let Route { src, dst, .. 
} = + dev.routes.next_hop(route_key, dev); + (src, dst) + } else { + // No devices available for routing - use zero MACs + use opte::engine::ether::EtherAddr; + (EtherAddr::zero(), EtherAddr::zero()) + }; + + // Fill in outer MAC addresses + let final_pkt = unsafe { + let mblk = fwd_pkt.unwrap_mblk().as_ptr(); + let rptr = (*mblk).b_rptr; + ptr::copy(mac_dst.as_ptr(), rptr, 6); + ptr::copy(mac_src.as_ptr(), rptr.add(6), 6); + MsgBlk::wrap_mblk(mblk).unwrap() + }; + + // Send to underlay via stream (same underlay we received from) + stream.tx_drop_on_no_desc( + final_pkt, + TxHint::from_crc32(l4_hash), + MacTxFlags::empty(), + ); + + xde.stats.vals.mcast_rx_underlay().incr(1); + } + } + + drop(mcast_fwd); + drop(mgmt); + } +} + #[unsafe(no_mangle)] unsafe extern "C" fn xde_mc_tx( arg: *mut c_void, @@ -1980,6 +2758,21 @@ fn xde_mc_tx_one<'a>( let old_len = parsed_pkt.len(); let meta = parsed_pkt.meta(); + + // Extract inner destination IP for potential multicast processing + use opte::engine::ip::ValidL3; + use opte::engine::ip::v4::Ipv4Ref; + use opte::engine::ip::v6::Ipv6Ref; + let inner_dst_ip = match &meta.inner_l3 { + Some(ValidL3::Ipv4(v4)) => { + Some(oxide_vpc::api::IpAddr::from(v4.destination())) + } + Some(ValidL3::Ipv6(v6)) => { + Some(oxide_vpc::api::IpAddr::from(v6.destination())) + } + None => None, + }; + let Ok(non_eth_payl_bytes) = u32::try_from((&meta.inner_l3, &meta.inner_ulp).packet_length()) else { @@ -1987,6 +2780,8 @@ fn xde_mc_tx_one<'a>( return; }; + let inner_eth_len = meta.inner_eth.packet_length(); + let ulp_meoi = match meta.ulp_meoi(old_len) { Ok(ulp_meoi) => ulp_meoi, Err(e) => { @@ -2008,6 +2803,9 @@ fn xde_mc_tx_one<'a>( return; } + // Multicast packets go through normal port.process() which will use M2P + // for encapsulation. After that, we intercept them for unicast replication. 
+ let port = &src_dev.port; // The port processing code will fire a probe that describes what @@ -2091,45 +2889,50 @@ fn xde_mc_tx_one<'a>( return; }; - // For a multicast outbound frame, we need to attempt to deliver - // to all relevant local ports *and* over whichever underlay ports are - // required. - if ip6_dst.is_multicast() { - // TODO: fill in the mcast forwarding flags using The Table. - let entry_state = - entry_state.get_or_insert_with(|| src_dev.port_map.read()); - if let Some(others) = entry_state.multicast_listeners(&ip6_dst) - { - let my_key = VniMac::new(vni, src_dev.port.mac_addr()); - for el in others { - if my_key == *el { - continue; - } - - // This is a more lightweight clone in illumos, and - // gives us an owned form of the headers but a ref - // counted clone of the packet body. - // - // If there are any body transforms internally, OPTE - // will fully clone out the contents if required. - let Ok(my_pkt) = out_pkt.pullup(NonZeroUsize::new( - (encap_len as usize) - + (non_eth_payl_bytes as usize) - + Ethernet::MINIMUM_LENGTH, - )) else { - continue; - }; - match entry_state.get_by_key(*el) { - Some(dev) => guest_loopback( - src_dev, dev, *el, my_pkt, postbox, - ), - None => { - // TODO: log, error count, etc. - // Stale state caused this (probably) - } - } + // For a multicast outbound frame, deliver to external/customer members + // (local guest instances) and/or underlay/infrastructure members + // based on the replication configuration. 
+ // Check if this is a multicast packet by examining the outer IPv6 destination + // For multicast, OPTE should have set it to an ff0x:: address + let is_mcast_packet = ip6_dst.is_multicast(); + + if is_mcast_packet { + // This is a multicast packet - determine the inner destination + // from the packet contents or use a fallback + let inner_dst = inner_dst_ip.unwrap_or_else(|| { + // Fallback: derive from outer IPv6 multicast address + // For IPv4 multicast mapped to IPv6, the last 4 bytes contain the IPv4 address + if ip6_dst.bytes()[0] == 0xff && ip6_dst.bytes()[1] == 0x04 + { + // Admin-scoped IPv6 multicast, likely mapped from IPv4 + let bytes = ip6_dst.bytes(); + oxide_vpc::api::IpAddr::Ip4( + oxide_vpc::api::Ipv4Addr::from([ + bytes[12], bytes[13], bytes[14], bytes[15], + ]), + ) + } else { + // Use the IPv6 multicast address directly + oxide_vpc::api::IpAddr::Ip6(ip6_dst) } - } + }); + + handle_mcast_tx( + MulticastTxContext { + inner_dst, + vni, + out_pkt: &out_pkt, + encap_len, + inner_eth_len, + non_eth_payl_bytes, + tun_meoi: &tun_meoi, + l4_hash, + }, + src_dev, + postbox, + entry_state, + ); + return; } // 'MSS boosting' is performed here -- we set a 9k (minus overheads) @@ -2399,10 +3202,10 @@ fn new_port( // XXX some layers have no need for LFT, perhaps have two types // of Layer: one with, one without? 
- gateway::setup(&pb, &cfg, vpc_map, FT_LIMIT_ONE, dhcp_cfg)?; + gateway::setup(&pb, &cfg, vpc_map.clone(), FT_LIMIT_ONE, dhcp_cfg)?; router::setup(&pb, &cfg, FT_LIMIT_ONE)?; nat::setup(&mut pb, &cfg, nat_ft_limit)?; - overlay::setup(&pb, &cfg, vni_state, v2b, FT_LIMIT_ONE)?; + overlay::setup(&pb, &cfg, vni_state, vpc_map, v2b, FT_LIMIT_ONE)?; // Set the overall unified flow and TCP flow table limits based on the total // configuration above, by taking the maximum of size of the individual @@ -2414,7 +3217,9 @@ fn new_port( let limit = NonZeroU32::new(FW_FT_LIMIT.get().max(nat_ft_limit.get())).unwrap(); let net = VpcNetwork { cfg }; - Ok(Arc::new(pb.create(net, limit, limit)?)) + #[allow(clippy::arc_with_non_send_sync)] + let port = Arc::new(pb.create(net, limit, limit)?); + Ok(port) } #[unsafe(no_mangle)] @@ -2544,9 +3349,7 @@ fn xde_rx_one( let old_len = parsed_pkt.len(); let ip6_dst = meta.outer_v6.destination(); - if ip6_dst.is_multicast() - && let Some(ports) = devs.multicast_listeners(&ip6_dst) - { + if ip6_dst.is_multicast() { let pullup_len = ( &meta.outer_eth, &meta.outer_v6, @@ -2557,25 +3360,49 @@ fn xde_rx_one( &meta.inner_ulp, ) .packet_length(); + debug_assert!( + pullup_len > 0, + "pullup_len should be non-zero for valid multicast packet" + ); + let vni = meta.outer_encap.vni(); + + // Extract inner destination IP for multicast processing + use opte::engine::ip::ValidL3; + use opte::engine::ip::v4::Ipv4Ref; + use opte::engine::ip::v6::Ipv6Ref; + let inner_dst = match &meta.inner_l3 { + ValidL3::Ipv4(v4) => oxide_vpc::api::IpAddr::from(v4.destination()), + ValidL3::Ipv6(v6) => oxide_vpc::api::IpAddr::from(v6.destination()), + }; + + // Extract multicast delivery mode from Geneve options + let incoming_delivery_mode = + oxide_vpc::engine::geneve::extract_multicast_replication( + &meta.outer_encap, + ); + + // Calculate Geneve offset from parsed outer header lengths (robust to VLANs and IPv6 extensions) + let geneve_offset = 
meta.outer_eth.packet_length() + + meta.outer_v6.packet_length() + + meta.outer_udp.packet_length(); + + // Drop the parsed packet before calling handle_mcast_rx drop(parsed_pkt); - for el in ports { - // As explained in `xde_mc_tx_one`, this is cheaper than a full - // packet copy and should be safe to process even in the presence - // of body transforms. - let Ok(my_pkt) = pkt.pullup(NonZeroUsize::new(pullup_len)) else { - continue; - }; - match devs.get_by_key(*el) { - Some(dev) => { - xde_rx_one_direct(stream, dev, *el, my_pkt, postbox) - } - None => { - // TODO: log, error count, etc. - // Stale state caused this (probably) - } - } - } + // Handle multicast packets using the XDE-wide forwarding table + handle_mcast_rx( + MulticastRxContext { + inner_dst, + vni, + pkt: &pkt, + pullup_len, + geneve_offset, + incoming_delivery_mode, + }, + stream, + devs, + postbox, + ); return None; } @@ -2882,6 +3709,63 @@ fn dump_v2p_hdlr() -> Result { Ok(state.vpc_map.dump()) } +#[unsafe(no_mangle)] +fn set_m2p_hdlr(env: &mut IoctlEnvelope) -> Result { + let req: SetMcast2PhysReq = env.copy_in_req()?; + + // Validate VNI is DEFAULT_MULTICAST_VNI for fleet-level multicast + if req.vni.as_u32() != DEFAULT_MULTICAST_VNI { + return Err(OpteError::System { + // Propagate an actionable errno so userspace sees an error + errno: EINVAL, + msg: format!( + "multicast VNI must be DEFAULT_MULTICAST_VNI ({}), got: {}", + DEFAULT_MULTICAST_VNI, + req.vni.as_u32() + ), + }); + } + + // Validate underlay multicast address is admin-scoped IPv6 (ff04, ff05, or ff08) + // Per Omicron constraints: underlay must be admin-scoped for rack-internal routing + let first_byte = req.underlay.bytes()[0]; + let second_byte = req.underlay.bytes()[1]; + // Check if it's multicast (ff00::/8) and admin-scoped (ff04, ff05, ff08) + if first_byte != 0xff + || (second_byte != 0x04 && second_byte != 0x05 && second_byte != 0x08) + { + return Err(OpteError::InvalidUnderlayMulticast(format!( + "underlay multicast 
address must be admin-scoped IPv6 (ff04::/16, ff05::/16, or ff08::/16), got: {}", + req.underlay + ))); + } + + let state = get_xde_state(); + state.vpc_map.add_mcast(req.group, req.underlay, req.vni)?; + Ok(NoResp::default()) +} + +#[unsafe(no_mangle)] +fn clear_m2p_hdlr(env: &mut IoctlEnvelope) -> Result { + let req: ClearMcast2PhysReq = env.copy_in_req()?; + + // Validate VNI is DEFAULT_MULTICAST_VNI (77) for fleet-level multicast + if req.vni.as_u32() != DEFAULT_MULTICAST_VNI { + return Err(OpteError::System { + errno: EINVAL, + msg: format!( + "multicast VNI must be DEFAULT_MULTICAST_VNI ({}), got: {}", + DEFAULT_MULTICAST_VNI, + req.vni.as_u32() + ), + }); + } + + let state = get_xde_state(); + state.vpc_map.del_mcast(req.group, req.underlay, req.vni); + Ok(NoResp::default()) +} + #[unsafe(no_mangle)] fn set_v2b_hdlr(env: &mut IoctlEnvelope) -> Result { let req: SetVirt2BoundaryReq = env.copy_in_req()?; @@ -2904,6 +3788,121 @@ fn dump_v2b_hdlr() -> Result { Ok(state.v2b.dump()) } +#[unsafe(no_mangle)] +fn set_mcast_forwarding_hdlr( + env: &mut IoctlEnvelope, +) -> Result { + let req: SetMcastForwardingReq = env.copy_in_req()?; + let state = get_xde_state(); + + // Fleet-level multicast: enforce DEFAULT_MULTICAST_VNI for any next hop + // that will result in underlay forwarding (Underlay/All). 
+ for (nh, rep) in &req.next_hops { + if matches!(rep, Replication::Underlay | Replication::All) + && nh.vni.as_u32() != DEFAULT_MULTICAST_VNI + { + return Err(OpteError::System { + errno: EINVAL, + msg: format!( + "multicast next-hop VNI must be DEFAULT_MULTICAST_VNI ({}), got: {}", + DEFAULT_MULTICAST_VNI, + nh.vni.as_u32() + ), + }); + } + } + + let token = state.management_lock.lock(); + let mut mcast_fwd = token.mcast_fwd.write(); + + // Convert Vec into BTreeMap + let next_hop_map: BTreeMap = + req.next_hops.into_iter().collect(); + + mcast_fwd.insert(req.group, next_hop_map); + + Ok(NoResp::default()) +} + +#[unsafe(no_mangle)] +fn clear_mcast_forwarding_hdlr( + env: &mut IoctlEnvelope, +) -> Result { + let req: ClearMcastForwardingReq = env.copy_in_req()?; + let state = get_xde_state(); + + let token = state.management_lock.lock(); + let mut mcast_fwd = token.mcast_fwd.write(); + + mcast_fwd.remove(&req.group); + + Ok(NoResp::default()) +} + +#[unsafe(no_mangle)] +fn dump_mcast_forwarding_hdlr() -> Result { + let state = get_xde_state(); + + let token = state.management_lock.lock(); + let mcast_fwd = token.mcast_fwd.read(); + + let entries: Vec = mcast_fwd + .iter() + .map(|(group, next_hops)| McastForwardingEntry { + group: *group, + next_hops: next_hops.iter().map(|(nh, rep)| (*nh, *rep)).collect(), + }) + .collect(); + + Ok(DumpMcastForwardingResp { entries }) +} + +#[unsafe(no_mangle)] +fn mcast_subscribe_hdlr(env: &mut IoctlEnvelope) -> Result { + let req: McastSubscribeReq = env.copy_in_req()?; + let state = get_xde_state(); + + // Update under management lock so we can refresh DevMap views used by TX/RX + let token = state.management_lock.lock(); + { + let mut devs = token.devs.write(); + devs.mcast_subscribe(&req.port_name, req.group)?; + refresh_maps( + devs, + token + .underlay + .as_ref() + .expect("underlay must exist while ports exist"), + ); + } + + Ok(NoResp::default()) +} + +#[unsafe(no_mangle)] +fn mcast_unsubscribe_hdlr( + env: &mut 
IoctlEnvelope, +) -> Result { + let req: McastUnsubscribeReq = env.copy_in_req()?; + let state = get_xde_state(); + + // Update under management lock so we can refresh DevMap views used by TX/RX + let token = state.management_lock.lock(); + { + let mut devs = token.devs.write(); + devs.mcast_unsubscribe(&req.port_name, req.group)?; + refresh_maps( + devs, + token + .underlay + .as_ref() + .expect("underlay must exist while ports exist"), + ); + } + + Ok(NoResp::default()) +} + #[unsafe(no_mangle)] fn list_layers_hdlr( env: &mut IoctlEnvelope, From f74d217c0493995b66551a90a00751a035537e31 Mon Sep 17 00:00:00 2001 From: Zeeshan Lakhani Date: Tue, 11 Nov 2025 06:55:15 +0000 Subject: [PATCH 6/7] [review] Address concerns and refactor multicast work Updates all-around for IPv4/IPv6 multicast support with control-plane APIs, kernel TX/RX implementation, dtrace script, and documentation semantics. Includes: - Delivery semantics (leaf-node): - Remove multicast relay logic; OPTE is always a leaf node in the replication tree - Same-sled delivery happens unconditionally on TX for local subscribers - RX-path only handles packets destined to this sled (no forwarding) - Perf (avoid management_lock in datapath): - Move mcast_fwd lookups to per-entry state instead of hitting exclusive management lock during TX replication - Clone Arc references from per-CPU caches instead of holding per-port RwLock guards across packet processing - Use state.devs.read() for concurrent dataplane access - Hold per-CPU copies of mcast_fwd for duration of TX replication - Arbitrary VNI handling: - Use DEFAULT_MULTICAST_VNI (77) for fleet-wide multicast delivery - Remove per-VPC VNI checks in xde.rs; delegate validation to overlay layer - Packets with VNI 77 delivered to all subscribers regardless of VPC - Replication flag clarification: - Replication enum specifies switch behavior on marked packets: - External: Switch replicates to front panel ports (leaving underlay) - Underlay: Switch replicates to 
sleds (within underlay) - Both: Switch does both replications - Used only on TX-path to inform switch behavior, not for RX-path - Routing and MACs: - Now, we set the right nexthop and routing for TX replication (the switch unicast address) - Use derived IPv6 multicast MAC for outer destination - Route lookup determines underlay port selection via next_hop - Simplified underlay routing for admin-scoped (ff04::/16) addresses, matching Omicron currently - Test infra: - MulticastGroup: RAII cleanup for M2P/forwarding entries - SnoopGuard: Prevent leaked snoop processes from holding DLPI devices - Geneve packet verification with replication flag validation - three_node_topology for multi-subscriber scenarios - Proactive zone cleanup - Standardized around updated semantics - Additional refinements: - Updated DTrace script (opte-mcast-delivery.d) - Improved opteadm output formatting for multicast commands - Added anyhow dependency to opte-test-utils - Updated documentation clarifying multicast architecture --- .gitignore | 3 +- Cargo.lock | 1 + bin/opteadm/src/bin/opteadm.rs | 86 +- crates/opte-api/src/cmd.rs | 66 +- crates/opte-api/src/ip.rs | 51 +- crates/opte-api/src/mac.rs | 12 +- dtrace/README.adoc | 10 +- dtrace/opte-mcast-delivery.d | 405 +++++- lib/opte-ioctl/src/lib.rs | 7 + lib/opte-test-utils/Cargo.toml | 1 + lib/opte-test-utils/src/geneve_verify.rs | 20 +- lib/opte-test-utils/src/lib.rs | 34 +- lib/opte/README.adoc | 65 +- lib/opte/src/ddi/sync.rs | 26 + lib/opte/src/engine/predicate.rs | 4 + lib/opte/src/lib.rs | 2 +- lib/oxide-vpc/src/api.rs | 206 ++- lib/oxide-vpc/src/engine/gateway/mod.rs | 62 +- lib/oxide-vpc/src/engine/geneve.rs | 76 +- lib/oxide-vpc/src/engine/overlay.rs | 497 ++++--- lib/oxide-vpc/src/engine/router.rs | 102 +- lib/oxide-vpc/src/print.rs | 35 +- lib/oxide-vpc/tests/integration_tests.rs | 67 +- xde-tests/src/lib.rs | 217 ++- xde-tests/tests/multicast_multi_sub.rs | 500 +++++-- xde-tests/tests/multicast_rx.rs | 448 +++--- 
xde-tests/tests/multicast_validation.rs | 403 +++++- xde/src/dev_map.rs | 51 +- xde/src/postbox.rs | 6 + xde/src/stats.rs | 66 +- xde/src/xde.rs | 1652 +++++++++++++--------- 31 files changed, 3258 insertions(+), 1923 deletions(-) diff --git a/.gitignore b/.gitignore index 5956d6b9..f82d74c0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ *.html target download -scripts -.DS_STORE \ No newline at end of file +.DS_STORE diff --git a/Cargo.lock b/Cargo.lock index eef3630a..b8a65043 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1312,6 +1312,7 @@ dependencies = [ name = "opte-test-utils" version = "0.1.0" dependencies = [ + "anyhow", "opte", "oxide-vpc", "pcap-parser", diff --git a/bin/opteadm/src/bin/opteadm.rs b/bin/opteadm/src/bin/opteadm.rs index 706b14a4..c8a87727 100644 --- a/bin/opteadm/src/bin/opteadm.rs +++ b/bin/opteadm/src/bin/opteadm.rs @@ -61,6 +61,7 @@ use oxide_vpc::api::SetVirt2PhysReq; use oxide_vpc::api::TunnelEndpoint; use oxide_vpc::api::VpcCfg; use oxide_vpc::print::print_mcast_fwd; +use oxide_vpc::print::print_mcast_subs; use oxide_vpc::print::print_v2b; use oxide_vpc::print::print_v2p; use std::io; @@ -232,30 +233,47 @@ enum Command { ClearV2B { prefix: IpCidr, tunnel_endpoint: Vec }, /// Set a multicast forwarding entry + /// + /// Adds or updates a next-hop for the specified underlay multicast address. + /// Multiple next-hops can be configured for the same underlay address by + /// running this command multiple times (like `swadm route add`). If the + /// same next-hop is specified again, its replication mode is updated. + /// + /// OPTE routes to `next_hop` (unicast switch address) to determine which + /// underlay port to use, then sends the packet to underlay (multicast) with + /// multicast MAC. The switch matches the on outer dst IP (multicast) and + /// Geneve replication tag. 
SetMcastFwd { - /// The multicast group address (IPv4 or IPv6) - group: IpAddr, - /// Next hop IPv6 address - next_hop_addr: Ipv6Addr, - /// Next hop VNI (defaults to fleet-level DEFAULT_MULTICAST_VNI) - #[arg(default_value_t = Vni::new(DEFAULT_MULTICAST_VNI).unwrap())] - next_hop_vni: Vni, - /// Delivery mode (replication): - /// - external: local guests in same VNI - /// - underlay: infrastructure via underlay multicast - /// - all: both local and underlay + /// The underlay multicast IPv6 address (admin-local scope ff04::/16). + /// This is the outer IPv6 destination in transmitted packets. + underlay: Ipv6Addr, + /// The unicast IPv6 address of the switch for routing (e.g., fd00::1). + /// OPTE uses this to determine which underlay port to use via the + /// illumos routing table. Multiple next-hops can be added by + /// running this command multiple times with the same underlay address. + next_hop: Ipv6Addr, + /// TX-only replication instruction (tells the switch which port groups to use): + /// - External: front panel ports (decapped, egress to external networks) + /// - Underlay: sled-to-sled ports (underlay multicast replication) + /// - Both: both external and underlay (bifurcated) + /// + /// Local same-sled delivery always happens via subscriptions regardless + /// of this setting. replication: Replication, }, /// Clear a multicast forwarding entry ClearMcastFwd { - /// The multicast group address (IPv4 or IPv6) - group: IpAddr, + /// The underlay multicast IPv6 address (admin-local scope ff04::/16) + underlay: Ipv6Addr, }, /// Dump the multicast forwarding table DumpMcastFwd, + /// Dump multicast subscriptions (group -> ports on this sled) + DumpMcastSubs, + /// Add a new router entry, either IPv4 or IPv6. 
AddRouterEntry { #[command(flatten)] @@ -795,22 +813,38 @@ fn main() -> anyhow::Result<()> { hdl.clear_v2b(&req)?; } - Command::SetMcastFwd { - group, - next_hop_addr, - next_hop_vni, - replication, - } => { - let next_hop = NextHopV6::new(next_hop_addr, next_hop_vni); + Command::SetMcastFwd { underlay, next_hop, replication } => { + // OPTE routes to the next-hop's unicast address to determine which + // underlay port to use via the illumos routing table and DDM. + // + // The packet is then sent to the multicast address with a multicast + // MAC. + // + // The switch matches on the outer dst IP (multicast) and Geneve + // `Replication` tag to determine which port groups to replicate to: + // - External: front panel ports (which get decapped on egress) + // - Underlay: underlay ports (sleds) + // - Both: both (bifurcated) + // + // The Replication type is TX-only, RX ignores it and delivers + // locally based on subscriptions. + // + // Like `swadm route add`, this command can be run multiple times + // with the same underlay address to add multiple next-hops. If the + // same next-hop is specified again, its replication mode is updated. 
+ + // Always use fleet-wide DEFAULT_MULTICAST_VNI + let next_hop_vni = Vni::new(DEFAULT_MULTICAST_VNI).unwrap(); + let next_hop_addr = NextHopV6::new(next_hop, next_hop_vni); let req = SetMcastForwardingReq { - group, - next_hops: vec![(next_hop, replication)], + underlay, + next_hops: vec![(next_hop_addr, replication)], }; hdl.set_mcast_fwd(&req)?; } - Command::ClearMcastFwd { group } => { - let req = ClearMcastForwardingReq { group }; + Command::ClearMcastFwd { underlay } => { + let req = ClearMcastForwardingReq { underlay }; hdl.clear_mcast_fwd(&req)?; } @@ -818,6 +852,10 @@ fn main() -> anyhow::Result<()> { print_mcast_fwd(&hdl.dump_mcast_fwd()?)?; } + Command::DumpMcastSubs => { + print_mcast_subs(&hdl.dump_mcast_subs()?)?; + } + Command::AddRouterEntry { route: RouterRule { port, dest, target, class }, } => { diff --git a/crates/opte-api/src/cmd.rs b/crates/opte-api/src/cmd.rs index de507062..5f8969f7 100644 --- a/crates/opte-api/src/cmd.rs +++ b/crates/opte-api/src/cmd.rs @@ -25,38 +25,39 @@ pub const XDE_IOC_OPTE_CMD: i32 = XDE_IOC as i32 | 0x01; #[derive(Clone, Copy, Debug)] #[repr(C)] pub enum OpteCmd { - ListPorts = 1, // list all ports - AddFwRule = 20, // add firewall rule - RemFwRule = 21, // remove firewall rule - SetFwRules = 22, // set/replace all firewall rules at once - DumpTcpFlows = 30, // dump TCP flows - DumpLayer = 31, // dump the specified Layer - DumpUft = 32, // dump the Unified Flow Table - ListLayers = 33, // list the layers on a given port - ClearUft = 40, // clear the UFT - ClearLft = 41, // clear the given Layer's Flow Table - SetVirt2Phys = 50, // set a v2p mapping - DumpVirt2Phys = 51, // dump the v2p mappings - SetVirt2Boundary = 52, // set a v2b mapping - ClearVirt2Boundary = 53, // clear a v2b mapping - DumpVirt2Boundary = 54, // dump the v2b mappings - ClearVirt2Phys = 55, // clear a v2p mapping - AddRouterEntry = 60, // add a router entry for IP dest - DelRouterEntry = 61, // remove a router entry for IP dest - CreateXde = 
70, // create a new xde device - DeleteXde = 71, // delete an xde device - SetXdeUnderlay = 72, // set xde underlay devices - ClearXdeUnderlay = 73, // clear xde underlay devices - SetExternalIps = 80, // set xde external IPs for a port - AllowCidr = 90, // allow ip block through gateway tx/rx - RemoveCidr = 91, // deny ip block through gateway tx/rx - SetMcastForwarding = 100, // set multicast forwarding entries - ClearMcastForwarding = 101, // clear multicast forwarding entries - DumpMcastForwarding = 102, // dump multicast forwarding table - McastSubscribe = 103, // subscribe a port to a multicast group - McastUnsubscribe = 104, // unsubscribe a port from a multicast group - SetMcast2Phys = 105, // set M2P mapping (group -> underlay mcast) - ClearMcast2Phys = 106, // clear M2P mapping + ListPorts = 1, // list all ports + AddFwRule = 20, // add firewall rule + RemFwRule = 21, // remove firewall rule + SetFwRules = 22, // set/replace all firewall rules at once + DumpTcpFlows = 30, // dump TCP flows + DumpLayer = 31, // dump the specified Layer + DumpUft = 32, // dump the Unified Flow Table + ListLayers = 33, // list the layers on a given port + ClearUft = 40, // clear the UFT + ClearLft = 41, // clear the given Layer's Flow Table + SetVirt2Phys = 50, // set a v2p mapping + DumpVirt2Phys = 51, // dump the v2p mappings + SetVirt2Boundary = 52, // set a v2b mapping + ClearVirt2Boundary = 53, // clear a v2b mapping + DumpVirt2Boundary = 54, // dump the v2b mappings + ClearVirt2Phys = 55, // clear a v2p mapping + AddRouterEntry = 60, // add a router entry for IP dest + DelRouterEntry = 61, // remove a router entry for IP dest + CreateXde = 70, // create a new xde device + DeleteXde = 71, // delete an xde device + SetXdeUnderlay = 72, // set xde underlay devices + ClearXdeUnderlay = 73, // clear xde underlay devices + SetExternalIps = 80, // set xde external IPs for a port + AllowCidr = 90, // allow ip block through gateway tx/rx + RemoveCidr = 91, // deny ip block 
through gateway tx/rx + SetMcastForwarding = 100, // set multicast forwarding entries + ClearMcastForwarding = 101, // clear multicast forwarding entries + DumpMcastForwarding = 102, // dump multicast forwarding table + McastSubscribe = 103, // subscribe a port to a multicast group + McastUnsubscribe = 104, // unsubscribe a port from a multicast group + SetMcast2Phys = 105, // set M2P mapping (group -> underlay mcast) + ClearMcast2Phys = 106, // clear M2P mapping + DumpMcastSubscriptions = 107, // dump multicast subscription table } impl TryFrom for OpteCmd { @@ -96,6 +97,7 @@ impl TryFrom for OpteCmd { 104 => Ok(Self::McastUnsubscribe), 105 => Ok(Self::SetMcast2Phys), 106 => Ok(Self::ClearMcast2Phys), + 107 => Ok(Self::DumpMcastSubscriptions), _ => Err(()), } } diff --git a/crates/opte-api/src/ip.rs b/crates/opte-api/src/ip.rs index 3da20d9c..41e93551 100644 --- a/crates/opte-api/src/ip.rs +++ b/crates/opte-api/src/ip.rs @@ -653,22 +653,24 @@ impl Ipv6Addr { self.inner[0] == 0xFF } - /// Return `true` if this is a multicast IPv6 address with administrative scope - /// (admin-local, site-local, or organization-local) as defined in RFC 4291 and RFC 7346. + /// Return `true` if this is a multicast IPv6 address with the ff04::/16 prefix + /// (admin-local scope with flags=0) as used by Omicron for underlay multicast. /// - /// The three administrative scopes are: - /// - `0x4`: admin-local scope - /// - `0x5`: site-local scope - /// - `0x8`: organization-local scope + /// This specifically checks for the ff04::/16 prefix where: + /// - First byte: 0xFF (all multicast addresses) + /// - Second byte: 0x04 (flags=0, scope=4 admin-local) + /// + /// See [RFC 7346] for details on IPv6 multicast address scopes and + /// how Omicron uses this specific address scope. 
+ /// + /// [RFC 7346]: https://www.rfc-editor.org/rfc/rfc7346.html pub const fn is_admin_scoped_multicast(&self) -> bool { if !self.is_multicast() { return false; } - // Extract the scope field from the lower 4 bits of the second byte - // (first byte is 0xFF for all multicast, second byte contains flags and scope) - let scope = self.inner[1] & 0x0F; - matches!(scope, 0x4 | 0x5 | 0x8) + // Check for ff04::/16 prefix only + self.inner[1] == 0x04 } /// Return the bytes of the address. @@ -1183,24 +1185,18 @@ impl Ipv6Cidr { prefix_len: Ipv6PrefixLen(64), }; + /// IPv6 multicast address range, `ff00::/8`. + pub const MCAST: Self = Self { + ip: Ipv6Addr::from_const([0xff00, 0, 0, 0, 0, 0, 0, 0]), + prefix_len: Ipv6PrefixLen(8), + }; + /// IPv6 admin-local multicast scope prefix, `ff04::/16`. pub const MCAST_ADMIN_LOCAL: Self = Self { ip: Ipv6Addr::from_const([0xff04, 0, 0, 0, 0, 0, 0, 0]), prefix_len: Ipv6PrefixLen(16), }; - /// IPv6 site-local multicast scope prefix, `ff05::/16`. - pub const MCAST_SITE_LOCAL: Self = Self { - ip: Ipv6Addr::from_const([0xff05, 0, 0, 0, 0, 0, 0, 0]), - prefix_len: Ipv6PrefixLen(16), - }; - - /// IPv6 organization-local multicast scope prefix, `ff08::/16`. 
- pub const MCAST_ORG_LOCAL: Self = Self { - ip: Ipv6Addr::from_const([0xff08, 0, 0, 0, 0, 0, 0, 0]), - prefix_len: Ipv6PrefixLen(16), - }; - pub fn new(ip: Ipv6Addr, prefix_len: Ipv6PrefixLen) -> Self { let ip = ip.safe_mask(prefix_len); Ipv6Cidr { ip, prefix_len } @@ -1525,10 +1521,13 @@ mod test { #[test] fn test_ipv6_admin_scoped_multicast() { - // Test the three valid administrative scopes - assert!(to_ipv6("ff04::1").is_admin_scoped_multicast()); // admin-local (0x4) - assert!(to_ipv6("ff05::1").is_admin_scoped_multicast()); // site-local (0x5) - assert!(to_ipv6("ff08::1").is_admin_scoped_multicast()); // organization-local (0x8) + // Test ff04::/16 prefix (admin-local scope used by Omicron) + assert!(to_ipv6("ff04::1").is_admin_scoped_multicast()); + assert!(to_ipv6("ff04:1234:5678:9abc::1").is_admin_scoped_multicast()); + + // Test other administrative scopes (NOT accepted) + assert!(!to_ipv6("ff05::1").is_admin_scoped_multicast()); // site-local + assert!(!to_ipv6("ff08::1").is_admin_scoped_multicast()); // organization-local // Test non-admin scoped multicast addresses assert!(!to_ipv6("ff01::1").is_admin_scoped_multicast()); // interface-local diff --git a/crates/opte-api/src/mac.rs b/crates/opte-api/src/mac.rs index 728774de..1134ae6d 100644 --- a/crates/opte-api/src/mac.rs +++ b/crates/opte-api/src/mac.rs @@ -56,8 +56,16 @@ impl MacAddr { Self { inner: bytes } } - /// Return whether this MAC address is broadcast/multicast. - pub const fn is_broadcast(&self) -> bool { + /// Return whether this MAC address is a group address (I/G bit set). + /// + /// Per IEEE 802, the I/G (Individual/Group) bit is the LSB of the first octet. + /// When set to 1, the address is a group address, which includes both + /// multicast and broadcast (FF:FF:FF:FF:FF:FF) addresses. + /// + /// See [RFC 7042 §2.1] for details on IEEE 802 MAC address structure. 
+ /// + /// [RFC 7042 §2.1]: https://www.rfc-editor.org/rfc/rfc7042#section-2.1 + pub const fn is_group(&self) -> bool { (self.inner[0] & 0b0000_0001) != 0 } } diff --git a/dtrace/README.adoc b/dtrace/README.adoc index 400d1f44..237e67c1 100644 --- a/dtrace/README.adoc +++ b/dtrace/README.adoc @@ -64,7 +64,15 @@ a|`opte-rule-match.d` a|`opte-tcp-flow-state.d` |Track the TCP flow state changes as they happen. Printing the state - transition as well as the flow ID. +transition as well as the flow ID. + +a|`opte-mcast-delivery.d` +|Track multicast TX/RX, local same-sled delivery, underlay forwarding, and + external forwarding. Also tracks multicast control-plane operations (map + set/clear, fwd set/clear, subscribe/unsubscribe, and dumps) to help correlate + config changes with dataplane events. Optional toggles are in the script's + BEGIN block: `flow_debug` (adds xde_mc_tx entry/return), `suppress_output` + (suppress per-event output), and `show_summary` (show aggregations at END). a|`opte-uft-invalidate.d` |Track Unified Flow Table invalidation as it happens. A UFT entry is diff --git a/dtrace/opte-mcast-delivery.d b/dtrace/opte-mcast-delivery.d index 4924012a..8c6b8d83 100644 --- a/dtrace/opte-mcast-delivery.d +++ b/dtrace/opte-mcast-delivery.d @@ -1,97 +1,408 @@ /* - * Track multicast packet delivery. + * Track multicast packet delivery through OPTE/XDE. * - * dtrace -L ./lib -I . -Cqs ./opte-mcast-delivery.d + * Usage: + * dtrace -L ./lib -I . 
-Cqs ./opte-mcast-delivery.d + * + * Configuration (set in BEGIN block): + * suppress_output = 1 - Suppress per-event output, show only aggregations + * flow_debug = 1 - Enable multicast TX/RX function entry/exit tracing + * show_summary = 1 - Show aggregated summary at END (default: enabled) */ #include "common.h" -#define HDR_FMT "%-8s %-6s %-39s %-20s %-10s\n" -#define LINE_FMT "%-8s %-6d %-39s %-20s %-10s\n" +/* Local print formats (avoid colliding with common.h FLOW_FMT macros) */ +#define M_HDR_FMT "%-12s %-6s %-39s %-39s\n" +#define M_LINE_FMT "%-12s %-6u %-39s %-39s\n" +#define M_FWD_HDR_FMT "%-12s %-6s %-39s %-39s\n" +#define M_FWD_LINE_FMT "%-12s %-6u %-39s %-39s\n" +#define DBG_LINE_FMT "%-20s %-30s %s\n" + +/* Macro to reduce code duplication for group address formatting */ +#define MCAST_GROUP_STR(af, ptr) \ + ((af) == AF_INET ? inet_ntoa((ipaddr_t *)(ptr)) : \ + inet_ntoa6((in6_addr_t *)(ptr))) + +/* Configurable header reprint interval */ +#define HEADER_REPRINT_INTERVAL 10 + +/* + * OPTE command numbers for multicast-related ioctls (see crates/opte-api/src/cmd.rs). 
+ */ +#define CMD_SET_MCAST_FWD 100 +#define CMD_CLEAR_MCAST_FWD 101 +#define CMD_DUMP_MCAST_FWD 102 +#define CMD_MCAST_SUBSCRIBE 103 +#define CMD_MCAST_UNSUBSCRIBE 104 +#define CMD_SET_M2P 105 +#define CMD_CLEAR_M2P 106 +#define CMD_DUMP_MCAST_SUBS 107 BEGIN { - printf(HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP", "REPL"); + flow_debug = 0; /* Set to 1 to enable detailed flow debugging */ + suppress_output = 0; /* Set to 1 to suppress per-event output (aggregations only) */ + show_summary = 1; /* Set to 1 to show aggregated summary at END */ + num = 0; + + printf("OPTE Multicast Delivery Tracker\n"); + printf("Configuration:\n"); + printf(" flow_debug = %d\n", flow_debug); + printf(" suppress_output = %d\n", suppress_output); + printf(" show_summary = %d\n", show_summary); + printf("\n"); +} + +BEGIN +/!suppress_output/ +{ + printf(M_HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP"); +} + +/* Multicast TX function entry/exit (optional detailed debugging) */ +xde_mc_tx:entry +/flow_debug/ +{ + printf(DBG_LINE_FMT, "xde_mc_tx-entry", "", ""); } -sdt:xde::mcast-tx { - /* arg0=af, arg1=addr_ptr, arg2=vni, arg3=replication */ +xde_mc_tx:return +/flow_debug/ +{ + printf(DBG_LINE_FMT, "xde_mc_tx-return", "", ""); +} + +mcast-tx { + /* arg0=af, arg1=addr_ptr, arg2=vni */ this->af = arg0; this->group_ptr = arg1; this->vni = arg2; - this->repl = arg3; + this->group_str = MCAST_GROUP_STR(this->af, this->group_ptr); + + /* Always track aggregations (even when suppressing output) */ + @by_event["TX"] = count(); + @by_vni["TX", this->vni] = count(); + @by_group["TX", this->group_str] = count(); +} - if (num >= 10) { - printf(HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP", "REPL"); +mcast-tx +/!suppress_output/ +{ + if (num >= HEADER_REPRINT_INTERVAL) { + printf(M_HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP"); num = 0; } - this->group_str = (this->af == AF_INET) ? 
- inet_ntoa((ipaddr_t *)this->group_ptr) : - inet_ntoa6((in6_addr_t *)this->group_ptr); - this->repl_str = (this->repl == 0) ? "External" : - (this->repl == 1) ? "Underlay" : - (this->repl == 2) ? "All" : "Unknown"; - printf(LINE_FMT, "TX", this->vni, this->group_str, "-", this->repl_str); + printf(M_LINE_FMT, "TX", this->vni, this->group_str, "-"); num++; } -sdt:xde::mcast-rx { - /* arg0=af, arg1=addr_ptr, arg2=vni, arg3=replication */ +mcast-rx { + /* arg0=af, arg1=addr_ptr, arg2=vni */ this->af = arg0; this->group_ptr = arg1; this->vni = arg2; - this->repl = arg3; + this->group_str = MCAST_GROUP_STR(this->af, this->group_ptr); + + /* Always track aggregations (even when suppressing output) */ + @by_event["RX"] = count(); + @by_vni["RX", this->vni] = count(); + @by_group["RX", this->group_str] = count(); +} - if (num >= 10) { - printf(HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP", "REPL"); +mcast-rx +/!suppress_output/ +{ + if (num >= HEADER_REPRINT_INTERVAL) { + printf(M_HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP"); num = 0; } - this->group_str = (this->af == AF_INET) ? - inet_ntoa((ipaddr_t *)this->group_ptr) : - inet_ntoa6((in6_addr_t *)this->group_ptr); - this->repl_str = (this->repl == 0) ? "External" : - (this->repl == 1) ? "Underlay" : - (this->repl == 2) ? 
"All" : "Unknown"; - printf(LINE_FMT, "RX", this->vni, this->group_str, "-", this->repl_str); + printf(M_LINE_FMT, "RX", this->vni, this->group_str, "-"); num++; } -sdt:xde::mcast-local-delivery { +mcast-local-delivery { /* arg0=af, arg1=addr_ptr, arg2=vni, arg3=port */ this->af = arg0; this->group_ptr = arg1; this->vni = arg2; this->port = stringof(arg3); + this->group_str = MCAST_GROUP_STR(this->af, this->group_ptr); - if (num >= 10) { - printf(HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP", "REPL"); + /* Always track aggregations (even when suppressing output) */ + @by_event["DELIVER"] = count(); + @by_vni["DELIVER", this->vni] = count(); + @by_port[this->port] = count(); + @by_group["DELIVER", this->group_str] = count(); +} + +mcast-local-delivery +/!suppress_output/ +{ + if (num >= HEADER_REPRINT_INTERVAL) { + printf(M_HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP"); num = 0; } - this->group_str = (this->af == AF_INET) ? - inet_ntoa((ipaddr_t *)this->group_ptr) : - inet_ntoa6((in6_addr_t *)this->group_ptr); - printf(LINE_FMT, "DELIVER", this->vni, this->group_str, this->port, "-"); + printf(M_LINE_FMT, "DELIVER", this->vni, this->group_str, this->port); num++; } -sdt:xde::mcast-underlay-fwd { - /* arg0=af, arg1=addr_ptr, arg2=vni, arg3=next_hop */ +mcast-underlay-fwd { + /* arg0=af, arg1=addr_ptr (underlay mcast), arg2=vni, arg3=next_hop (unicast switch) */ this->af = arg0; - this->group_ptr = arg1; + this->underlay_ptr = arg1; + this->vni = arg2; + this->next_hop_unicast = (in6_addr_t *)arg3; + this->underlay_str = MCAST_GROUP_STR(this->af, this->underlay_ptr); + this->next_hop_str = inet_ntoa6(this->next_hop_unicast); + + /* Always track aggregations (even when suppressing output) */ + @by_event["UNDERLAY"] = count(); + @by_vni["UNDERLAY", this->vni] = count(); + @by_underlay["UNDERLAY", this->underlay_str] = count(); + @by_nexthop_unicast[this->next_hop_str] = count(); +} + +mcast-underlay-fwd +/!suppress_output/ +{ + if (num >= HEADER_REPRINT_INTERVAL) 
{ + printf(M_FWD_HDR_FMT, "EVENT", "VNI", "UNDERLAY_MCAST", "ROUTE_UNICAST"); + num = 0; + } + + printf(M_FWD_LINE_FMT, "UNDERLAY", this->vni, this->underlay_str, this->next_hop_str); + num++; +} + +mcast-external-fwd { + /* arg0=af, arg1=addr_ptr (underlay mcast), arg2=vni, arg3=next_hop (unicast switch) */ + this->af = arg0; + this->underlay_ptr = arg1; this->vni = arg2; - this->next_hop = (in6_addr_t *)arg3; + this->next_hop_unicast = (in6_addr_t *)arg3; + this->underlay_str = MCAST_GROUP_STR(this->af, this->underlay_ptr); + this->next_hop_str = inet_ntoa6(this->next_hop_unicast); - if (num >= 10) { - printf(HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP", "REPL"); + /* Always track aggregations (even when suppressing output) */ + @by_event["EXTERNAL"] = count(); + @by_vni["EXTERNAL", this->vni] = count(); + @by_underlay["EXTERNAL", this->underlay_str] = count(); + @by_nexthop_unicast[this->next_hop_str] = count(); +} + +mcast-external-fwd +/!suppress_output/ +{ + if (num >= HEADER_REPRINT_INTERVAL) { + printf(M_FWD_HDR_FMT, "EVENT", "VNI", "UNDERLAY_MCAST", "ROUTE_UNICAST"); num = 0; } - this->group_str = (this->af == AF_INET) ? - inet_ntoa((ipaddr_t *)this->group_ptr) : - inet_ntoa6((in6_addr_t *)this->group_ptr); - this->next_hop_str = inet_ntoa6(this->next_hop); - printf(LINE_FMT, "UNDERLAY", this->vni, this->group_str, this->next_hop_str, "-"); + printf(M_FWD_LINE_FMT, "EXTERNAL", this->vni, this->underlay_str, this->next_hop_str); num++; } + +/* Control-plane config operations via ioctl */ +xde_ioc_opte_cmd:entry +{ + this->ioc = (opte_cmd_ioctl_t *)arg0; + this->cmd = this->ioc->cmd; + /* Only track multicast-related commands */ + this->name = + this->cmd == CMD_SET_M2P ? "CFG SET_M2P" : + this->cmd == CMD_CLEAR_M2P ? "CFG CLEAR_M2P" : + this->cmd == CMD_SET_MCAST_FWD ? "CFG SET_FWD" : + this->cmd == CMD_CLEAR_MCAST_FWD ? "CFG CLEAR_FWD" : + this->cmd == CMD_DUMP_MCAST_FWD ? "CFG DUMP_FWD" : + this->cmd == CMD_DUMP_MCAST_SUBS ? 
"CFG DUMP_SUBS" : + this->cmd == CMD_MCAST_SUBSCRIBE ? "CFG SUBSCRIBE" : + this->cmd == CMD_MCAST_UNSUBSCRIBE ? "CFG UNSUBSCRIBE" : + NULL; + + /* Always track aggregations for multicast ops */ + if (this->name != NULL) { + @cfg_counts[this->name] = count(); + } +} + +xde_ioc_opte_cmd:entry +/!suppress_output && this->name != NULL/ +{ + printf(DBG_LINE_FMT, this->name, "", ""); +} + +/* Dedicated control-plane probes (if present) */ +mcast-map-set { + /* arg0=af, arg1=group_ptr, arg2=underlay_ptr, arg3=vni */ + this->af = arg0; + this->group_ptr = arg1; + this->underlay = (in6_addr_t *)arg2; + this->vni = arg3; + + /* Always track aggregations */ + @cfg_counts["MAP_SET"] = count(); +} + +mcast-map-set +/!suppress_output/ +{ + this->group = MCAST_GROUP_STR(this->af, this->group_ptr); + this->ul = inet_ntoa6(this->underlay); + printf(M_LINE_FMT, "CFG MAP-SET", this->vni, this->group, this->ul); +} + +mcast-map-clear { + /* arg0=af, arg1=group_ptr, arg2=underlay_ptr, arg3=vni */ + this->af = arg0; + this->group_ptr = arg1; + this->underlay = (in6_addr_t *)arg2; + this->vni = arg3; + + /* Always track aggregations */ + @cfg_counts["MAP_CLEAR"] = count(); +} + +mcast-map-clear +/!suppress_output/ +{ + this->group = MCAST_GROUP_STR(this->af, this->group_ptr); + this->ul = inet_ntoa6(this->underlay); + printf(M_LINE_FMT, "CFG MAP-CLEAR", this->vni, this->group, this->ul); +} + +mcast-fwd-set { + /* arg0=underlay_ptr, arg1=count, arg2=vni */ + this->underlay = (in6_addr_t *)arg0; + this->count = arg1; + this->vni = arg2; + + /* Always track aggregations */ + @cfg_counts["FWD_SET"] = count(); +} + +mcast-fwd-set +/!suppress_output/ +{ + this->ul = inet_ntoa6(this->underlay); + printf(M_LINE_FMT, "CFG FWD-SET", this->vni, "-", this->ul); +} + +mcast-fwd-clear { + /* arg0=underlay_ptr, arg1=vni */ + this->underlay = (in6_addr_t *)arg0; + this->vni = arg1; + + /* Always track aggregations */ + @cfg_counts["FWD_CLEAR"] = count(); +} + +mcast-fwd-clear +/!suppress_output/ +{ + 
this->ul = inet_ntoa6(this->underlay); + printf(M_LINE_FMT, "CFG FWD-CLEAR", this->vni, "-", this->ul); +} + +mcast-subscribe { + /* arg0=port_cstr, arg1=af, arg2=group_ptr, arg3=vni */ + this->port = stringof(arg0); + this->af = arg1; + this->group_ptr = arg2; + this->vni = arg3; + + /* Always track aggregations */ + @cfg_counts["SUBSCRIBE"] = count(); +} + +mcast-subscribe +/!suppress_output/ +{ + this->group = MCAST_GROUP_STR(this->af, this->group_ptr); + printf(M_LINE_FMT, "SUBSCRIBE", this->vni, this->group, this->port); +} + +mcast-unsubscribe { + /* arg0=port_cstr, arg1=af, arg2=group_ptr, arg3=vni */ + this->port = stringof(arg0); + this->af = arg1; + this->group_ptr = arg2; + this->vni = arg3; + + /* Always track aggregations */ + @cfg_counts["UNSUBSCRIBE"] = count(); +} + +mcast-unsubscribe +/!suppress_output/ +{ + this->group = MCAST_GROUP_STR(this->af, this->group_ptr); + printf(M_LINE_FMT, "UNSUBSCR", this->vni, this->group, this->port); +} + +/* Dataplane failure probes */ +mcast-tx-pullup-fail { + /* arg0=len */ + this->len = arg0; + + /* Always track aggregations */ + @by_event["TX_FAIL"] = count(); +} + +mcast-tx-pullup-fail +/!suppress_output/ +{ + printf(M_LINE_FMT, "TX_FAIL", 0, "-", "-"); +} + +mcast-rx-pullup-fail { + /* arg0=len */ + this->len = arg0; + + /* Always track aggregations */ + @by_event["RX_FAIL"] = count(); +} + +mcast-rx-pullup-fail +/!suppress_output/ +{ + printf(M_LINE_FMT, "RX_FAIL", 0, "-", "-"); +} + +mcast-no-fwd-entry { + /* arg0=underlay_ptr, arg1=vni */ + this->underlay = (in6_addr_t *)arg0; + this->vni = arg1; + + /* Always track aggregations */ + @by_event["NOFWD"] = count(); +} + +mcast-no-fwd-entry +/!suppress_output/ +{ + this->ul = inet_ntoa6(this->underlay); + printf(M_LINE_FMT, "NOFWD", this->vni, "-", this->ul); +} + +/* Print aggregated summary when the script ends (if enabled) */ +END +/show_summary/ +{ + printf("\nSummary by event:\n"); + printa(@by_event); + printf("\nSummary by event and VNI:\n"); + 
printa(@by_vni); + printf("\nSummary by overlay group (TX/RX/DELIVER):\n"); + printa(@by_group); + printf("\nSummary by underlay multicast address (UNDERLAY/EXTERNAL):\n"); + printa(@by_underlay); + printf("\nLocal delivery by port:\n"); + printa(@by_port); + printf("\nForwarding by unicast next-hop (routing address):\n"); + printa(@by_nexthop_unicast); + printf("\nConfig ops:\n"); + printa(@cfg_counts); +} diff --git a/lib/opte-ioctl/src/lib.rs b/lib/opte-ioctl/src/lib.rs index 26fd831f..0adb9935 100644 --- a/lib/opte-ioctl/src/lib.rs +++ b/lib/opte-ioctl/src/lib.rs @@ -37,6 +37,7 @@ use oxide_vpc::api::DelRouterEntryResp; use oxide_vpc::api::DeleteXdeReq; use oxide_vpc::api::DhcpCfg; use oxide_vpc::api::DumpMcastForwardingResp; +use oxide_vpc::api::DumpMcastSubscriptionsResp; use oxide_vpc::api::DumpVirt2BoundaryResp; use oxide_vpc::api::DumpVirt2PhysResp; use oxide_vpc::api::IpCidr; @@ -265,6 +266,12 @@ impl OpteHdl { run_cmd_ioctl(self.device.as_raw_fd(), cmd, None::<&()>) } + /// Dump the multicast subscription table (group -> ports on this sled). + pub fn dump_mcast_subs(&self) -> Result { + let cmd = OpteCmd::DumpMcastSubscriptions; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, None::<&()>) + } + /// Subscribe a port to a multicast group. 
pub fn mcast_subscribe( &self, diff --git a/lib/opte-test-utils/Cargo.toml b/lib/opte-test-utils/Cargo.toml index 2236b8a8..0163aa46 100644 --- a/lib/opte-test-utils/Cargo.toml +++ b/lib/opte-test-utils/Cargo.toml @@ -10,6 +10,7 @@ repository.workspace = true usdt = ["oxide-vpc/usdt"] [dependencies] +anyhow.workspace = true opte = { workspace = true, features = ["std"] } oxide-vpc = { workspace = true, features = ["engine", "std", "test-help"] } pcap-parser = { workspace = true, features = ["serialize"] } diff --git a/lib/opte-test-utils/src/geneve_verify.rs b/lib/opte-test-utils/src/geneve_verify.rs index 9a510548..8d193228 100644 --- a/lib/opte-test-utils/src/geneve_verify.rs +++ b/lib/opte-test-utils/src/geneve_verify.rs @@ -9,6 +9,9 @@ //! This uses the existing OPTE/ingot Geneve types to parse raw packet bytes //! and extract key multicast-related fields for test assertions. +use anyhow::Context; +use anyhow::Result; +use anyhow::bail; use opte::engine::geneve::Vni; use opte::engine::ip::v6::Ipv6Ref; use opte::engine::parse::ValidGeneveOverV6; @@ -30,9 +33,9 @@ pub struct GeneveInfo { /// /// Returns VNI, outer IPv6 destination, and replication mode from Geneve /// options. -pub fn parse_geneve_packet(bytes: &[u8]) -> Result { +pub fn parse_geneve_packet(bytes: &[u8]) -> Result { let (pkt, _, _) = ValidGeneveOverV6::parse(bytes) - .map_err(|e| format!("Failed to parse Geneve/IPv6 packet: {e:?}"))?; + .context("Failed to parse Geneve/IPv6 packet")?; let vni = pkt.outer_encap.vni(); let outer_ipv6_dst = pkt.outer_v6.destination(); @@ -45,15 +48,14 @@ pub fn parse_geneve_packet(bytes: &[u8]) -> Result { /// /// Snoop output with `-x0` flag is hex digits without separators: /// "ffffffffffff001122334455..." 
-pub fn parse_snoop_hex(hex_str: &str) -> Result, String> { +pub fn parse_snoop_hex(hex_str: &str) -> Result> { hex_str .as_bytes() .chunks(2) .map(|chunk| { - let hex_byte = std::str::from_utf8(chunk) - .map_err(|e| format!("Invalid UTF-8: {e}"))?; - u8::from_str_radix(hex_byte, 16) - .map_err(|e| format!("Invalid hex: {e}")) + let hex_byte = + std::str::from_utf8(chunk).context("Invalid UTF-8")?; + u8::from_str_radix(hex_byte, 16).context("Invalid hex") }) .collect() } @@ -68,7 +70,7 @@ pub fn parse_snoop_hex(hex_str: &str) -> Result, String> { /// To avoid false positives from summary lines (e.g., "UDP port 6081"), the /// tokenized fallback triggers only for lines that look like offset-prefixed /// hex dumps. -pub fn extract_snoop_hex(snoop_output: &str) -> Result { +pub fn extract_snoop_hex(snoop_output: &str) -> Result { let mut hex_bytes = String::new(); for line in snoop_output.lines() { @@ -123,7 +125,7 @@ pub fn extract_snoop_hex(snoop_output: &str) -> Result { } if hex_bytes.is_empty() { - return Err("No hex data found in snoop output".to_string()); + bail!("No hex data found in snoop output"); } // Ensure even number of nibbles to form complete bytes. 
diff --git a/lib/opte-test-utils/src/lib.rs b/lib/opte-test-utils/src/lib.rs index efbf2a0d..3fc8cd2d 100644 --- a/lib/opte-test-utils/src/lib.rs +++ b/lib/opte-test-utils/src/lib.rs @@ -85,7 +85,7 @@ pub use oxide_vpc::engine::gateway; pub use oxide_vpc::engine::geneve::OxideOptionType; pub use oxide_vpc::engine::nat; pub use oxide_vpc::engine::overlay; -pub use oxide_vpc::engine::overlay::PerVniMaps; +pub use oxide_vpc::engine::overlay::Mcast2Phys; pub use oxide_vpc::engine::overlay::TUNNEL_ENDPOINT_MAC; pub use oxide_vpc::engine::overlay::Virt2Boundary; pub use oxide_vpc::engine::overlay::Virt2Phys; @@ -255,7 +255,8 @@ fn oxide_net_builder( name: &str, cfg: &oxide_vpc::cfg::VpcCfg, vpc_map: Arc, - vni_state: Arc, + v2p: Arc, + m2p: Arc, v2b: Arc, ) -> PortBuilder { #[allow(clippy::arc_with_non_send_sync)] @@ -270,11 +271,11 @@ fn oxide_net_builder( let dhcp = base_dhcp_config(); firewall::setup(&mut pb, fw_limit).expect("failed to add firewall layer"); - gateway::setup(&pb, cfg, vpc_map.clone(), fw_limit, &dhcp) + gateway::setup(&pb, cfg, vpc_map, fw_limit, &dhcp) .expect("failed to setup gateway layer"); router::setup(&pb, cfg, one_limit).expect("failed to add router layer"); nat::setup(&mut pb, cfg, snat_limit).expect("failed to add nat layer"); - overlay::setup(&pb, cfg, vni_state, vpc_map.clone(), v2b, one_limit) + overlay::setup(&pb, cfg, v2p, m2p, v2b, one_limit) .expect("failed to add overlay layer"); pb } @@ -283,6 +284,7 @@ pub struct PortAndVps { pub port: Port, pub vps: VpcPortState, pub vpc_map: Arc, + pub m2p: Arc, pub cfg: oxide_vpc::cfg::VpcCfg, } @@ -348,6 +350,7 @@ pub fn oxide_net_setup2( let vpc_net = VpcNetwork { cfg: converted_cfg.clone() }; let uft_limit = flow_table_limits.unwrap_or(UFT_LIMIT.unwrap()); let tcp_limit = flow_table_limits.unwrap_or(TCP_LIMIT.unwrap()); + let m2p = Arc::new(Mcast2Phys::new()); let v2b = Arc::new(Virt2Boundary::new()); v2b.set( "0.0.0.0/0".parse().unwrap(), @@ -364,10 +367,16 @@ pub fn oxide_net_setup2( }], ); 
- let port = - oxide_net_builder(name, &converted_cfg, vpc_map.clone(), port_v2p, v2b) - .create(vpc_net, uft_limit, tcp_limit) - .unwrap(); + let port = oxide_net_builder( + name, + &converted_cfg, + vpc_map.clone(), + port_v2p, + m2p.clone(), + v2b, + ) + .create(vpc_net, uft_limit, tcp_limit) + .unwrap(); // Add router entry that allows the guest to send to other guests // on same subnet. @@ -380,7 +389,7 @@ pub fn oxide_net_setup2( .unwrap(); let vps = VpcPortState::new(); - let mut pav = PortAndVps { port, vps, vpc_map, cfg: converted_cfg }; + let mut pav = PortAndVps { port, vps, vpc_map, m2p, cfg: converted_cfg }; let mut updates = vec![ // * Epoch starts at 1, adding router entry bumps it to 2. @@ -435,11 +444,14 @@ pub fn oxide_net_setup2( }); updates.extend_from_slice(&[ + // * IPv4 multicast passthrough + // * IPv6 multicast passthrough // * Allow guest to route to own subnet - "set:router.rules.out=1", + "set:router.rules.out=3", // * Outbound encap // * Inbound decap - "set:overlay.rules.in=1, overlay.rules.out=1", + // * Inbound mcast-vni-validator + "set:overlay.rules.in=2, overlay.rules.out=1", ]); if let Some(val) = custom_updates { diff --git a/lib/opte/README.adoc b/lib/opte/README.adoc index 97f19242..02854ef6 100644 --- a/lib/opte/README.adoc +++ b/lib/opte/README.adoc @@ -214,41 +214,68 @@ all data needed to restart the port without rebuilding the entire flow state. This is achieved by restoring the port based on some payload of save data. -=== Multicast Model +=== Multicast OPTE implements multicast consistent with the rack networking architecture described in [RFD 63](https://rfd.shared.oxide.computer/rfd/0063) -and [RFD 488](https://rfd.shared.oxide.computer/rfd/488). Key points: +and [RFD 488](https://rfd.shared.oxide.computer/rfd/488). 
-Fleet VNI:: All multicast traffic uses a single fleet‑level Geneve VNI +==== Fleet VNI + +All multicast traffic uses a single fleet‑level Geneve VNI (`DEFAULT_MULTICAST_VNI`, currently `77`) rather than per‑tenant VNIs. Mappings from overlay multicast groups to underlay multicast addresses are stored and validated under this VNI. (See `RFD 488` for the rationale behind fleet-level VNI.) -Delivery Modes (Replication):: The Oxide Geneve multicast option carries -the delivery mode as a 2‑bit field in the top two bits of the option -body's first byte: +==== Delivery Modes and Replication + +The `Replication` type is a TX‑only instruction to switches encoded in the Oxide Geneve +multicast option as a 2‑bit field in the top two bits of the option body's first byte. +It tells the switch which ports to replicate the frame to on transmission. On RX, OPTE +ignores the replication field and performs local same‑sled delivery based purely on +subscriptions. The replication mode is not an access control mechanism. + +OPTE always performs local same‑sled delivery for all replication modes and acts as a leaf: + +* _External_ replicates to ports set for external multicast traffic. Switch decaps + and replicates to front panel ports (egress to external networks, leaving the underlay). + OPTE does not create additional multicast copies for other sleds. +* _Underlay_ replicates to ports set for underlay multicast traffic. Switch replicates + to other sleds (using the underlay). The underlay network performs further replication + within the rack. +* _Both_ replicates to both port groups (bifurcated). Combines `External` and `Underlay`: + switch replicates to both front panel and underlay ports. + +For all replication modes, OPTE routes to the next hop's unicast address to determine +reachability and underlay port/MAC. The packet destination (outer IPv6) is the multicast +address from M2P with multicast MAC (RFC 2464). All multicast uses fleet VNI 77. 
+ +==== Encapsulation Path + +On TX, the overlay layer encapsulates packets destined for multicast groups +with a Geneve multicast option initially set to `External` replication mode. +XDE's multicast TX path (`xde_mc_tx`) first delivers the packet locally to +all other ports on the same sled that have subscribed to the multicast group +(within the same VNI), then consults the multicast forwarding table. + +For each next hop in the forwarding table, XDE creates a packet copy and updates its +Geneve multicast option to match that next hop's configured replication mode. +XDE routes to the next hop's unicast address (for all replication modes) to determine +reachability and which underlay port/MAC to use. The packet destination (outer IPv6) +is the multicast address from M2P with multicast MAC (RFC 2464). The Geneve replication +option serves as a TX-only instruction telling switches which port groups to replicate to. -* External — local guest delivery within the same VNI: OPTE decapsulates - and delivers to all local subscribers (guests) on the port map. -* Underlay — infrastructure delivery: OPTE sends Geneve‑encapsulated - packets towards the configured underlay multicast address in fleet - VNI 77. The underlay performs any further replication. -* All — both behaviors above. +==== RX Behavior -Encapsulation Path:: The overlay layer sets `External` in the multicast -option on initial encapsulation. XDE uses its multicast forwarding table -to decide whether to additionally forward to underlay next hops, and, if -so, marks those forwarded copies as `Underlay` or `All` to prevent -re‑relay at downstream receivers. +OPTE acts as a leaf node and does not relay multicast traffic received from the underlay. Constraints & Validation:: * M2P (multicast‑to‑physical) mappings must use `DEFAULT_MULTICAST_VNI`. * Any next hop that causes underlay forwarding must specify VNI 77. 
-* Underlay multicast addresses must be IPv6 admin‑scoped (e.g., - `ff04::/16`, `ff05::/16`, `ff08::/16`). +* Underlay multicast addresses must be IPv6 admin-local multicast (`ff04::/16`) + as defined in https://www.rfc-editor.org/rfc/rfc7346.html[RFC 7346]. === Layers diff --git a/lib/opte/src/ddi/sync.rs b/lib/opte/src/ddi/sync.rs index 70397094..817f8857 100644 --- a/lib/opte/src/ddi/sync.rs +++ b/lib/opte/src/ddi/sync.rs @@ -780,3 +780,29 @@ impl Drop for TokenGuard<'_, T> { assert_eq!(Some(curthread), lock_thread); } } + +/// Clone an Arc from behind a RwLock, dropping the read lock immediately. +/// +/// This pattern is used throughout the datapath to make readers lock-free +/// while keeping snapshots alive via Arc refcounting. The brief lock hold +/// (just the Arc clone) minimizes contention and avoids blocking management +/// operations like `refresh_maps()`. +#[inline(always)] +pub fn clone_from_rwlock( + lock: &KRwLock>, +) -> alloc::sync::Arc { + alloc::sync::Arc::clone(&*lock.read()) +} + +/// Clone an Arc from behind a Mutex, dropping the lock immediately. +/// +/// This pattern is used throughout the datapath to make readers lock-free +/// while keeping snapshots alive via Arc refcounting. The brief lock hold +/// (just the Arc clone) minimizes contention and avoids blocking management +/// operations like `refresh_maps()`. +#[inline(always)] +pub fn clone_from_mutex( + lock: &KMutex>, +) -> alloc::sync::Arc { + alloc::sync::Arc::clone(&*lock.lock()) +} diff --git a/lib/opte/src/engine/predicate.rs b/lib/opte/src/engine/predicate.rs index 551f2179..4527efe9 100644 --- a/lib/opte/src/engine/predicate.rs +++ b/lib/opte/src/engine/predicate.rs @@ -97,12 +97,15 @@ impl Display for EtherTypeMatch { #[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] pub enum EtherAddrMatch { Exact(MacAddr), + /// Match any multicast/broadcast MAC address (LSB of first octet is 1). 
+ Multicast, } impl EtherAddrMatch { fn matches(&self, flow_addr: MacAddr) -> bool { match self { EtherAddrMatch::Exact(addr) => flow_addr == *addr, + EtherAddrMatch::Multicast => flow_addr.is_group(), } } } @@ -113,6 +116,7 @@ impl Display for EtherAddrMatch { match self { Exact(addr) => write!(f, "{addr}"), + Multicast => write!(f, "multicast"), } } } diff --git a/lib/opte/src/lib.rs b/lib/opte/src/lib.rs index 6de57220..6c62d544 100644 --- a/lib/opte/src/lib.rs +++ b/lib/opte/src/lib.rs @@ -200,7 +200,7 @@ mod opte_provider { /// /// Logging levels are provided by [`LogLevel`]. These levels will map /// to the underlying provider with varying degrees of success. -pub trait LogProvider { +pub trait LogProvider: Send + Sync { /// Log a message at the specified level. fn log(&self, level: LogLevel, msg: &str); } diff --git a/lib/oxide-vpc/src/api.rs b/lib/oxide-vpc/src/api.rs index 14f443e8..ba1563ac 100644 --- a/lib/oxide-vpc/src/api.rs +++ b/lib/oxide-vpc/src/api.rs @@ -20,15 +20,26 @@ use serde::Deserialize; use serde::Serialize; use uuid::Uuid; -/// Multicast packet replication strategy. +/// TX-only instruction to switches for multicast packet replication. /// -/// Encoding and scope: -/// - The Geneve Oxide multicast option encodes replication in the top 2 bits -/// of the option body’s first byte (u2). The remaining 30 bits are reserved. -/// - External means local customer-facing delivery within the same VNI -/// - Underlay means Geneve-encapsulated forwarding to underlay infrastructure -/// members using the fleet multicast VNI. -/// - All combines both behaviors. +/// Tells the switch which port groups to replicate outbound multicast packets +/// to. It is a transmit-only setting - on RX, OPTE ignores the replication +/// field and performs local same-sled delivery based purely on subscriptions. +/// The replication mode is not an access control mechanism. 
+/// +/// Routing vs replication: OPTE routes to the [`NextHopV6::addr`] (switch's +/// unicast address) for all modes to determine reachability and which underlay +/// port/MAC to use. +/// +/// The packet destination (outer IPv6) is the multicast address from M2P. This +/// [`Replication`] value tells the switch which port groups to replicate to. +/// +/// - `External`: Switch decaps and replicates to external-facing ports only +/// - `Underlay`: Switch replicates to underlay ports (other sleds) only +/// - `Both`: Switch replicates to both external and underlay ports (bifurcated) +/// +/// Encoding: The Geneve Oxide multicast option encodes the replication strategy in the +/// top 2 bits of the option body's first byte (u2). The remaining 30 bits are reserved. /// /// Current implementation uses a single fleet VNI (DEFAULT_MULTICAST_VNI = 77) /// for all multicast traffic rack-wide (RFD 488 "Multicast across VPCs"). @@ -37,48 +48,25 @@ use uuid::Uuid; )] #[repr(u8)] pub enum Replication { - /// Replicate packets to external/customer-facing members (guest instances). + /// Replicate packets to ports set for external multicast traffic. /// - /// Local delivery within the same VNI. Packets are decapsulated at the - /// switch before delivery to guests. + /// Switch decaps and replicates to front panel ports (egress to external + /// networks, leaving the underlay). #[default] External = 0x00, - /// Replicate packets to underlay/infrastructure members. + /// Replicate packets to ports set for underlay multicast traffic. /// - /// Forwards Geneve-encapsulated packets to underlay destinations for - /// infrastructure delivery (not directly to guest instances). Uses - /// DEFAULT_MULTICAST_VNI (77) for encapsulation. + /// Switch replicates to sleds (using the underlay). Underlay = 0x01, - /// Replicate packets to both external and underlay members (bifurcated). + /// Replicate packets to ports set for underlay and external multicast traffic (bifurcated). 
/// - /// Combines both customer-facing (decapsulated to guests) and infrastructure - /// (encapsulated) delivery modes for comprehensive multicast distribution. - All = 0x02, + /// Switch replicates to both front panel ports (egress to external networks) and sleds. + Both = 0x02, /// Reserved for future use. This value exists to account for all possible /// values in the 2-bit Geneve option field. Reserved = 0x03, } -impl Replication { - /// Merge two replication strategies, preferring the most permissive. - /// - /// Merging rules: - /// - Any `All` -> `All` - /// - `External` + `Underlay` -> `All` - /// - Same values -> keep the value - /// - Default to `All` for unexpected combinations - pub const fn merge(self, other: Self) -> Self { - match (self, other) { - (Self::All, _) | (_, Self::All) => Self::All, - (Self::External, Self::Underlay) - | (Self::Underlay, Self::External) => Self::All, - (a, b) if a as u8 == b as u8 => a, - // Prefer `All` for unexpected combinations - _ => Self::All, - } - } -} - #[cfg(any(feature = "std", test))] impl FromStr for Replication { type Err = String; @@ -87,9 +75,9 @@ impl FromStr for Replication { match s.to_ascii_lowercase().as_str() { "external" => Ok(Self::External), "underlay" => Ok(Self::Underlay), - "all" => Ok(Self::All), + "both" => Ok(Self::Both), lower => Err(format!( - "unexpected replication type {lower} -- expected 'external', 'underlay', or 'all'" + "unexpected replication {lower} -- expected 'external', 'underlay', or 'both'" )), } } @@ -109,10 +97,10 @@ pub const BOUNDARY_SERVICES_VNI: u32 = 99u32; /// All multicast groups (M2P mappings and forwarding entries) must use this VNI. /// OPTE validates that multicast operations specify this VNI and rejects others. 
/// -/// **Security model:** While M2P (Multicast-to-Physical) mappings are stored +/// While M2P (Multicast-to-Physical) mappings are stored /// per-VNI in the code, the enforcement of DEFAULT_MULTICAST_VNI means all /// multicast traffic shares a single namespace across the rack, with no -/// VPC-level isolation (as multicast groups are fleet-wide). +/// VPC-level isolation (as multicast groups are fleet-wide) *as of now*. pub const DEFAULT_MULTICAST_VNI: u32 = 77u32; /// Description of Boundary Services, the endpoint used to route traffic @@ -392,13 +380,24 @@ pub struct PhysNet { } /// Represents an IPv6 next hop for multicast forwarding. +/// +/// OPTE routes to [`NextHopV6::addr`] (the switch's unicast address) for all +/// replication modes to determine reachability and which underlay port/MAC to +/// use. The packet destination (outer IPv6) is always the multicast address +/// from M2P. The associated [`Replication`] mode is a TX-only instruction +/// telling the switch which port groups to replicate to on transmission. +/// Routing is always to the unicast next hop. #[derive( Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq, PartialOrd, Ord, )] pub struct NextHopV6 { - /// The IPv6 address of the next hop + /// The unicast IPv6 address of the switch endpoint (for routing). + /// This determines which underlay port and source MAC to use. + /// The actual packet destination (outer IPv6) is the multicast address. pub addr: Ipv6Addr, - /// The VNI to use for this next hop + /// The VNI to use for Geneve encapsulation. + /// Currently must be DEFAULT_MULTICAST_VNI (77). + /// Future: could support per-VPC VNIs for multicast isolation. pub vni: Vni, } @@ -408,27 +407,6 @@ impl NextHopV6 { } } -/// A next hop for multicast forwarding (supports both IPv4 and IPv6). 
-#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)] -pub struct NextHop { - /// The IP address of the next hop - pub addr: IpAddr, - /// The VNI to use for this next hop - pub vni: Vni, -} - -impl NextHop { - pub fn new(addr: IpAddr, vni: Vni) -> Self { - Self { addr, vni } - } -} - -impl From for NextHop { - fn from(v6: NextHopV6) -> Self { - Self { addr: v6.addr.into(), vni: v6.vni } - } -} - /// A Geneve tunnel endpoint. #[derive(Clone, Copy, Debug, Deserialize, Serialize)] pub struct TunnelEndpoint { @@ -492,18 +470,12 @@ impl From for GuestPhysAddr { /// abstraction, it's simply allowing one subnet to talk to another. /// There is no separate VPC router process, the real routing is done /// by the underlay. -/// -/// * Multicast: Packets matching this entry are multicast traffic. -/// Uses the M2P (Multicast-to-Physical) mapping to determine underlay -/// destinations. Does not apply SNAT; the outer IPv6 underlay source -/// is the physical IP. #[derive(Clone, Debug, Copy, Deserialize, Serialize)] pub enum RouterTarget { Drop, InternetGateway(Option), Ip(IpAddr), VpcSubnet(IpCidr), - Multicast(IpCidr), } #[cfg(any(feature = "std", test))] @@ -535,15 +507,6 @@ impl FromStr for RouterTarget { cidr6s.parse().map(|x| Self::VpcSubnet(IpCidr::Ip6(x))) } - Some(("mcast4", cidr4s)) => { - let cidr4 = cidr4s.parse()?; - Ok(Self::Multicast(IpCidr::Ip4(cidr4))) - } - - Some(("mcast6", cidr6s)) => { - cidr6s.parse().map(|x| Self::Multicast(IpCidr::Ip6(x))) - } - Some(("ig", uuid)) => Ok(Self::InternetGateway(Some( uuid.parse::().map_err(|e| e.to_string())?, ))), @@ -564,12 +527,6 @@ impl Display for RouterTarget { Self::Ip(IpAddr::Ip6(ip6)) => write!(f, "ip6={ip6}"), Self::VpcSubnet(IpCidr::Ip4(sub4)) => write!(f, "sub4={sub4}"), Self::VpcSubnet(IpCidr::Ip6(sub6)) => write!(f, "sub6={sub6}"), - Self::Multicast(IpCidr::Ip4(mcast4)) => { - write!(f, "mcast4={mcast4}") - } - Self::Multicast(IpCidr::Ip6(mcast6)) => { - write!(f, "mcast6={mcast6}") - } } } } 
@@ -676,8 +633,6 @@ pub struct VpcMapResp { pub vni: Vni, pub ip4: Vec<(Ipv4Addr, GuestPhysAddr)>, pub ip6: Vec<(Ipv6Addr, GuestPhysAddr)>, - pub mcast_ip4: Vec<(Ipv4Addr, Ipv6Addr)>, - pub mcast_ip6: Vec<(Ipv6Addr, Ipv6Addr)>, } #[derive(Debug, Deserialize, Serialize)] @@ -714,26 +669,36 @@ pub struct ClearVirt2PhysReq { pub phys: PhysNet, } -/// Set mapping from multicast group to underlay multicast address. +/// Set mapping from (overlay) multicast group to underlay multicast address. +/// +/// Creates a multicast group fleet-wide by mapping an overlay multicast address +/// to an underlay IPv6 multicast address. Ports can then join via `subscribe()`. +/// The M2P mapping is the source of truth - if it exists, the group exists. +/// +/// Ports join and leave with `subscribe()` and `unsubscribe()`, which look up +/// the underlay address via this M2P mapping. Without the mapping, `subscribe()` +/// fails (can't look up underlay), but `unsubscribe()` succeeds +/// (group gone => not subscribed). +/// +/// This handles cleanup races where the control plane deletes the group before +/// sleds finish unsubscribing ports. #[derive(Clone, Debug, Deserialize, Serialize)] pub struct SetMcast2PhysReq { /// Overlay multicast group address pub group: IpAddr, /// Underlay IPv6 multicast address pub underlay: Ipv6Addr, - /// VNI for this mapping - pub vni: Vni, } /// Clear a mapping from multicast group to underlay multicast address. +/// +/// All multicast groups use DEFAULT_MULTICAST_VNI (77) for fleet-wide multicast. #[derive(Clone, Debug, Deserialize, Serialize)] pub struct ClearMcast2PhysReq { /// Overlay multicast group address pub group: IpAddr, /// Underlay IPv6 multicast address pub underlay: Ipv6Addr, - /// VNI for this mapping - pub vni: Vni, } /// Set a mapping from a VPC IP to boundary tunnel endpoint destination. @@ -776,20 +741,36 @@ pub enum DelRouterEntryResp { NotFound, } -/// Set multicast forwarding entries for a multicast group. 
+/// Set multicast forwarding entries for an underlay multicast group. +/// +/// Configures how OPTE forwards multicast packets for a specific underlay group. +/// The forwarding table maps underlay multicast addresses to switch endpoints +/// and TX-only replication instructions. +/// +/// Routing vs destination: OPTE routes to [`NextHopV6::addr`] (switch's unicast +/// address) to determine reachability and which underlay port/MAC to use. The +/// packet is sent to the multicast address (`underlay`) with multicast MAC. The +/// switch uses the multicast destination and Geneve [`Replication`] tag +/// to determine which port groups to replicate to on transmission. +/// +/// Fleet-wide multicast: All multicast uses DEFAULT_MULTICAST_VNI (77) +/// currently. The VNI in NextHopV6 must be 77 - other values are rejected. #[derive(Clone, Debug, Deserialize, Serialize)] pub struct SetMcastForwardingReq { - /// The multicast group address (overlay) - pub group: IpAddr, - /// The next hops (underlay IPv6 addresses) with replication information + /// The underlay IPv6 multicast address (outer IPv6 dst in transmitted packets) + pub underlay: Ipv6Addr, + /// Switch endpoints and TX-only replication instructions. + /// Each NextHopV6.addr is the unicast IPv6 of a switch (for routing). + /// The Replication is a TX-only instruction indicating which port groups + /// the switch should use. pub next_hops: Vec<(NextHopV6, Replication)>, } -/// Clear multicast forwarding entries for a multicast group. +/// Clear multicast forwarding entries for an underlay multicast group. #[derive(Clone, Debug, Deserialize, Serialize)] pub struct ClearMcastForwardingReq { - /// The multicast group address - pub group: IpAddr, + /// The underlay IPv6 multicast address + pub underlay: Ipv6Addr, } /// Response for dumping the multicast forwarding table. @@ -804,14 +785,31 @@ impl CmdOk for DumpMcastForwardingResp {} /// A single multicast forwarding table entry. 
#[derive(Clone, Debug, Deserialize, Serialize)] pub struct McastForwardingEntry { - /// The multicast group address (overlay) - pub group: IpAddr, - /// The next hops (underlay IPv6 addresses) with replication information + /// The underlay IPv6 multicast address + pub underlay: Ipv6Addr, + /// The next hops (underlay IPv6 addresses) with TX-only replication instructions pub next_hops: Vec<(NextHopV6, Replication)>, } impl opte::api::cmd::CmdOk for DelRouterEntryResp {} +/// Response for dumping the multicast subscription table (group -> ports). +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct DumpMcastSubscriptionsResp { + pub entries: Vec, +} + +impl CmdOk for DumpMcastSubscriptionsResp {} + +/// A single multicast subscription entry. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct McastSubscriptionEntry { + /// The underlay IPv6 multicast address (subscription key) + pub underlay: Ipv6Addr, + /// Port names subscribed to this group on this sled + pub ports: Vec, +} + /// Subscribe a port to a multicast group. #[derive(Clone, Debug, Deserialize, Serialize)] pub struct McastSubscribeReq { diff --git a/lib/oxide-vpc/src/engine/gateway/mod.rs b/lib/oxide-vpc/src/engine/gateway/mod.rs index 74ff34bc..b6f8504a 100644 --- a/lib/oxide-vpc/src/engine/gateway/mod.rs +++ b/lib/oxide-vpc/src/engine/gateway/mod.rs @@ -198,26 +198,38 @@ fn setup_ipv4( ])); layer.add_rule(Direction::In, unicast_in.finalize()); - // Multicast prefixes (224.0.0.0/4) + // IPv4 multicast prefixes (224.0.0.0/4) let ipv4_mcast = vec![Ipv4AddrMatch::Prefix(Ipv4Cidr::MCAST)]; - // Outbound multicast - allow from guest's MAC to multicast destinations - let mut mcast_out = Rule::new(1001, Action::Meta(vpc_meta.clone())); - mcast_out.add_predicate(Predicate::InnerDstIp4(ipv4_mcast.clone())); - mcast_out.add_predicate(Predicate::InnerEtherSrc(vec![ + // Outbound IPv4 multicast - allow from guest's MAC to multicast destinations. 
+ + // NOTE: This unconditionally allows any dst IP in 224.0.0.0/4 (all IPv4 multicast). + // The overlay layer enforces M2P mappings and underlay address validation. + // + // Because these gateway rules are unconditional (no destination filtering), + // custom firewall routes can target ANY IP range to a multicast group, + // enabling intra-VPC use cases. + let mut mcast_out_v4 = Rule::new(1001, Action::Meta(vpc_meta.clone())); + mcast_out_v4.add_predicate(Predicate::InnerDstIp4(ipv4_mcast.clone())); + mcast_out_v4.add_predicate(Predicate::InnerEtherSrc(vec![ EtherAddrMatch::Exact(cfg.guest_mac), ])); - layer.add_rule(Direction::Out, mcast_out.finalize()); + layer.add_rule(Direction::Out, mcast_out_v4.finalize()); - // Inbound multicast - allow multicast destinations to guest - let mut mcast_in = Rule::new( + // Inbound IPv4 multicast - rewrite the source MAC to the gateway MAC and allow delivery. + // This mirrors the IPv6 multicast inbound rule to ensure multicast + // delivery to guests is permitted by the gateway layer. 
+ let mut mcast_in_v4 = Rule::new( 1001, Action::Static(Arc::new(RewriteSrcMac { gateway_mac: cfg.gateway_mac, })), ); - mcast_in.add_predicate(Predicate::InnerDstIp4(ipv4_mcast)); - layer.add_rule(Direction::In, mcast_in.finalize()); + mcast_in_v4.add_predicate(Predicate::InnerDstIp4(ipv4_mcast)); + mcast_in_v4.add_predicate(Predicate::InnerEtherDst(vec![ + EtherAddrMatch::Multicast, + ])); + layer.add_rule(Direction::In, mcast_in_v4.finalize()); Ok(()) } @@ -255,30 +267,38 @@ fn setup_ipv6( ])); layer.add_rule(Direction::In, unicast_in.finalize()); - // Admin-/site-/org-scoped multicast prefixes (for underlay forwarding) - let admin_mcast_prefixes = vec![ - Ipv6AddrMatch::Prefix(Ipv6Cidr::MCAST_ADMIN_LOCAL), - Ipv6AddrMatch::Prefix(Ipv6Cidr::MCAST_SITE_LOCAL), - Ipv6AddrMatch::Prefix(Ipv6Cidr::MCAST_ORG_LOCAL), - ]; + // IPv6 multicast prefix (ff00::/8) + // Allow any overlay multicast address - the underlay (ff04::/16) restriction + // is enforced by M2P mappings and multicast forwarding validation, not here. + let ipv6_mcast = vec![Ipv6AddrMatch::Prefix(Ipv6Cidr::MCAST)]; - // Outbound multicast - allow from guest's MAC to multicast destinations + // Outbound multicast - allow from guest's MAC to multicast destinations. + // + // NOTE: This unconditionally allows any dst IP in ff00::/8 (all IPv6 multicast). + // The overlay layer enforces M2P mappings, and only ff04::/16 underlay addresses + // are permitted by the M2P validation. + // + // Because these gateway rules are unconditional (no destination filtering), + // custom firewall routes can target ANY IP range to a multicast group, + // enabling intra-VPC use cases. 
let mut mcast_out = Rule::new(1001, Action::Meta(vpc_meta.clone())); - mcast_out - .add_predicate(Predicate::InnerDstIp6(admin_mcast_prefixes.clone())); + mcast_out.add_predicate(Predicate::InnerDstIp6(ipv6_mcast.clone())); mcast_out.add_predicate(Predicate::InnerEtherSrc(vec![ EtherAddrMatch::Exact(cfg.guest_mac), ])); layer.add_rule(Direction::Out, mcast_out.finalize()); - // Inbound multicast - allow multicast destinations to guest + // Inbound multicast - rewrite source MAC to gateway let mut mcast_in = Rule::new( 1001, Action::Static(Arc::new(RewriteSrcMac { gateway_mac: cfg.gateway_mac, })), ); - mcast_in.add_predicate(Predicate::InnerDstIp6(admin_mcast_prefixes)); + mcast_in.add_predicate(Predicate::InnerDstIp6(ipv6_mcast)); + mcast_in.add_predicate(Predicate::InnerEtherDst(vec![ + EtherAddrMatch::Multicast, + ])); layer.add_rule(Direction::In, mcast_in.finalize()); Ok(()) diff --git a/lib/oxide-vpc/src/engine/geneve.rs b/lib/oxide-vpc/src/engine/geneve.rs index 0cb18be6..08f90812 100644 --- a/lib/oxide-vpc/src/engine/geneve.rs +++ b/lib/oxide-vpc/src/engine/geneve.rs @@ -16,7 +16,6 @@ //! //! - **External** (0x00): Indicates a packet originated from outside the rack //! and was encapsulated by the switch NAT ingress path with Geneve wrapping. -//! OPTE decapsulates before delivering to the guest. //! - **Multicast** (0x01): Carries multicast replication strategy as a 2-bit //! field for coordinating delivery between OPTE and sidecar switch logic. //! - **Mss** (0x02): Carries original TCP MSS for MSS clamping/boosting to @@ -35,28 +34,35 @@ //! └──────────┴────────────────────────────┘ //! │ //! └─> Replication mode: -//! 00 = External (local guest delivery) -//! 01 = Underlay (infrastructure forwarding) -//! 10 = All (both External and Underlay) +//! 00 = External (front panel/customer ports, traffic leaving rack) +//! 01 = Underlay (infrastructure forwarding to other sleds) +//! 10 = Both (both External and Underlay) //! 11 = Reserved //! ``` //! -//! 
### Replication Semantics +//! ### Replication Semantics (TX-only instruction) //! -//! - **External**: Packet should be decapsulated and delivered to local guest -//! instances subscribed to this multicast group. Switch sets `nat_egress_hit` -//! to trigger decapsulation before delivery. -//! - **Underlay**: Packet should remain encapsulated and forwarded to underlay -//! infrastructure destinations. -//! - **All**: Bifurcated delivery to both local guests (decapsulated) and -//! underlay destinations (encapsulated). +//! The [`Replication`] type is a TX-only instruction telling the switch which port groups +//! to replicate outbound multicast packets to. On RX, OPTE ignores the replication field +//! and performs local same-sled delivery based purely on subscriptions. +//! +//! OPTE routes to next hop unicast address (for ALL modes) to determine reachability +//! and underlay port/MAC. Packet destination is multicast ff04::/16 with multicast MAC. +//! +//! - **External**: Switch decaps and replicates to external-facing ports (front panel) +//! - **Underlay**: Switch replicates to underlay ports (other sleds) +//! - **Both**: Switch replicates to both external and underlay port groups (bifurcated) +//! - **Local same-sled delivery**: Always happens regardless of the TX-only replication setting. +//! Not an access control mechanism - local delivery is independent of replication mode. //! //! All multicast packets are encapsulated with fleet VNI 77 (`DEFAULT_MULTICAST_VNI`) //! regardless of replication mode. The replication mode determines delivery behavior, //! not VNI selection. //! -//! The 2-bit encoding allows efficient extraction in P4 programs without complex -//! parsing, aligning with the sidecar pipeline's tag-based routing decisions. +//! The 2-bit encoding allows extraction in P4 programs and aligns with the +//! sidecar pipeline's tag-based routing decisions. +//! +//! [`Replication`]: crate::api::Replication //! //! ## Option Length Encoding //! 
@@ -149,21 +155,12 @@ impl<'a> OptionCast<'a> for ValidOxideOption<'a> { } } -/// Geneve multicast option body carrying replication strategy information. -/// -/// This option encodes the replication scope as a 2-bit field in the top two -/// bits of the first byte of the option body. The remaining 30 bits are -/// reserved for future use. The replication strategy determines whether the -/// packet is delivered to local guest instances (External), underlay -/// infrastructure destinations (Underlay), or both (All). +/// Geneve multicast option body carrying replication information. #[derive(Debug, Clone, Ingot, Eq, PartialEq)] #[ingot(impl_default)] pub struct MulticastInfo { - /// Replication scope encoded as a u2 (top 2 bits of the first byte). - /// Values map to `Replication::{External, Underlay, All, Reserved}`. #[ingot(is = "u2")] pub version: Replication, - /// Reserved bits (remaining 30 bits of the body). rsvd: u30be, } @@ -177,7 +174,7 @@ impl NetworkRepr for Replication { match val { 0 => Replication::External, 1 => Replication::Underlay, - 2 => Replication::All, + 2 => Replication::Both, 3 => Replication::Reserved, _ => unreachable!("u2 value out of range: {val}"), } @@ -219,17 +216,24 @@ pub fn validate_options( } /// Extract multicast replication info from Geneve options. -/// Returns None if no multicast option is present, or Some(Replication) if found. /// -/// Treats Reserved (value 3) as invalid and returns None, implementing fail-closed -/// behavior without crashing the parser. +/// Treats Reserved (value 3) as invalid and returns None, implementing +/// fail-closed behavior. /// -/// Note: This function silently skips options with parse errors (e.g., TooSmall). -/// Call `validate_options()` first if you want parse errors surfaced instead of -/// being silently ignored. +/// This function silently skips options with parse errors (e.g., `TooSmall`). 
+/// Call `validate_options()` first if you want parse errors surfaced and +/// RFC 8926 critical option semantics enforced. This function assumes +/// validation has already been performed. pub fn extract_multicast_replication( pkt: &ValidGeneve, ) -> Option { + // In debug builds, verify validate_options() was called first if critical options present + debug_assert!( + !pkt.flags().contains(GeneveFlags::CRITICAL_OPTS) + || validate_options(pkt).is_ok(), + "extract_multicast_replication() called without prior validation when critical options present" + ); + for opt in OxideOptions::from_raw(pkt) { let Ok(opt) = opt else { continue }; if let Some(ValidOxideOption::Multicast(mc_info)) = opt.option.known() { @@ -269,6 +273,10 @@ mod test { use ingot::types::HeaderParse; use ingot::udp::ValidUdp; + /// Critical bit mask for Geneve option type field (bit 7). + /// Per RFC 8926, unknown options with this bit set must cause packet drop. + const GENEVE_OPT_TYPE_CRITICAL: u8 = 0x80; + #[test] fn parse_single_opt() { // Create a packet with one extension header. 
@@ -346,7 +354,7 @@ mod test { for (rep, expect) in [ (Replication::External, Replication::External), (Replication::Underlay, Replication::Underlay), - (Replication::All, Replication::All), + (Replication::Both, Replication::Both), ] { let buf = build_buf(rep); let (.., rem) = ValidUdp::parse(&buf[..]).unwrap(); @@ -384,7 +392,7 @@ mod test { // experimenter option class 0xff, 0xff, // crt + type - 0x80, + GENEVE_OPT_TYPE_CRITICAL, // rsvd + len 0x00, ]; @@ -422,7 +430,7 @@ mod test { // experimenter option class 0x01, 0x29, // crt + type - 0x80, + GENEVE_OPT_TYPE_CRITICAL, // rsvd + len 0x00, ]; diff --git a/lib/oxide-vpc/src/engine/overlay.rs b/lib/oxide-vpc/src/engine/overlay.rs index 111ccdf9..ea81d9e9 100644 --- a/lib/oxide-vpc/src/engine/overlay.rs +++ b/lib/oxide-vpc/src/engine/overlay.rs @@ -71,6 +71,8 @@ use opte::engine::rule::GenHtError; use opte::engine::rule::GenHtResult; use opte::engine::rule::HdrTransform; use opte::engine::rule::MappingResource; +use opte::engine::rule::MetaAction; +use opte::engine::rule::ModMetaResult; use opte::engine::rule::Resource; use opte::engine::rule::ResourceEntry; use opte::engine::rule::Rule; @@ -82,8 +84,8 @@ pub const OVERLAY_LAYER_NAME: &str = "overlay"; pub fn setup( pb: &PortBuilder, cfg: &VpcCfg, - vni_state: Arc, - vpc_map: Arc, + v2p: Arc, + m2p: Arc, v2b: Arc, ft_limit: core::num::NonZeroU32, ) -> core::result::Result<(), OpteError> { @@ -91,26 +93,39 @@ pub fn setup( let encap = Action::Static(Arc::new(EncapAction::new( cfg.phys_ip, cfg.vni, - vni_state, - vpc_map, + v2p, + m2p, v2b, ))); // Action Index 1 let decap = Action::Static(Arc::new(DecapAction::new())); + // Action Index 2 - Multicast VNI validator + let vni_validator = + Action::Meta(Arc::new(MulticastVniValidator::new(cfg.vni))); + let actions = LayerActions { - actions: vec![encap, decap], + actions: vec![encap, decap, vni_validator], default_in: DefaultAction::Deny, default_out: DefaultAction::Deny, }; let mut layer = 
Layer::new(OVERLAY_LAYER_NAME, pb.name(), actions, ft_limit); + + // Outbound: encapsulation (priority 1) let encap_rule = Rule::match_any(1, layer.action(0).unwrap()); layer.add_rule(Direction::Out, encap_rule); + + // Inbound: decapsulation (priority 1 - runs first, sets ACTION_META_VNI) let decap_rule = Rule::match_any(1, layer.action(1).unwrap()); layer.add_rule(Direction::In, decap_rule); + + // Inbound: VNI validation (priority 2 - runs after decap) + let vni_check_rule = Rule::match_any(2, layer.action(2).unwrap()); + layer.add_rule(Direction::In, vni_check_rule); + // NOTE The First/Last positions cannot fail; perhaps I should // improve the API to avoid the unwrap(). pb.add_layer(layer, Pos::Last) @@ -186,8 +201,8 @@ pub struct EncapAction { // sending data. phys_ip_src: Ipv6Addr, vni: Vni, - vni_state: Arc, - vpc_map: Arc, + v2p: Arc, + m2p: Arc, v2b: Arc, } @@ -195,11 +210,11 @@ impl EncapAction { pub fn new( phys_ip_src: Ipv6Addr, vni: Vni, - vni_state: Arc, - vpc_map: Arc, + v2p: Arc, + m2p: Arc, v2b: Arc, ) -> Self { - Self { phys_ip_src, vni, vni_state, vpc_map, v2b } + Self { phys_ip_src, vni, v2p, m2p, v2b } } } @@ -219,147 +234,145 @@ impl StaticAction for EncapAction { action_meta: &mut ActionMeta, ) -> GenHtResult { let f_hash = flow_id.crc32(); + let dst_ip = flow_id.dst_ip(); - // The router layer determines a RouterTarget and stores it in - // the meta map. We need to map this virtual target to a - // physical one. - let target_str = match action_meta.get(RouterTargetInternal::IP_KEY) { - Some(val) => val, - None => { - // This should never happen. The router should always - // write an entry. However, we currently have no way - // to enforce this in the type system, and thus must - // account for this situation. - return Err(GenHtError::Unexpected { - msg: "no RouterTarget metadata entry found".to_string(), - }); + // Multicast traffic is detected by checking if the inner + // destination IP is a multicast address. 
Multicast operates at the fleet + // level (cross-VPC) and doesn't go through VPC routing, so router + // metadata is not required in that case. + let is_mcast_addr = dst_ip.is_multicast(); + + let (is_internal, phys_target, is_mcast) = if is_mcast_addr { + // Multicast traffic: use M2P mapping to get the multicast underlay address. + // Fleet-level multicast mappings are stored in the dedicated `m2p`. + match self.m2p.get(&dst_ip) { + Some(underlay) => ( + true, + PhysNet { + ether: underlay.0.unchecked_multicast_mac(), + ip: underlay.0, + vni: Vni::new(DEFAULT_MULTICAST_VNI).unwrap(), + }, + true, + ), + None => { + // No M2P mapping configured for this multicast group; deny. + return Ok(AllowOrDeny::Deny); + } } - }; + } else { + // Non-multicast traffic: process through router target. - let target = match RouterTargetInternal::from_meta(target_str) { - Ok(val) => val, - Err(e) => { - return Err(GenHtError::Unexpected { + // The router layer determines a RouterTarget and stores it in + // the meta map. We need to map this virtual target to a + // physical one. + let target_str = match action_meta.get(RouterTargetInternal::IP_KEY) + { + Some(val) => val, + None => { + return Err(GenHtError::Unexpected { + msg: "no RouterTarget metadata entry found".to_string(), + }); + } + }; + + let target = RouterTargetInternal::from_meta(target_str).map_err( + |e| GenHtError::Unexpected { msg: format!( "failed to parse metadata entry '{target_str}': {e}", ), - }); - } - }; + }, + )?; + + match target { + RouterTargetInternal::InternetGateway(_) => { + match self.v2b.get(&dst_ip) { + Some(phys) => { + // Hash the packet onto a route target. This is a very + // rudimentary mechanism. Should level-up to an ECMP + // algorithm with well known statistical properties. 
+ let hash = f_hash as usize; + let target = + match phys.iter().nth(hash % phys.len()) { + Some(target) => target, + None => return Ok(AllowOrDeny::Deny), + }; + ( + false, + PhysNet { + ether: MacAddr::from(TUNNEL_ENDPOINT_MAC), + ip: target.ip, + vni: target.vni, + }, + false, + ) + } + None => return Ok(AllowOrDeny::Deny), + } + } - // Map the router target to a physical network location. - // The router layer has already made the routing decision - we just - // execute it here by looking up the appropriate physical mapping. - let dst_ip = flow_id.dst_ip(); - let (is_internal, phys_target, is_mcast) = match target { - RouterTargetInternal::InternetGateway(_) => { - match self.v2b.get(&dst_ip) { - Some(phys) => { - // Hash the packet onto a route target. This is a very - // rudimentary mechanism. Should level-up to an ECMP - // algorithm with well known statistical properties. - let hash = f_hash as usize; - let target = match phys.iter().nth(hash % phys.len()) { - Some(target) => target, - None => return Ok(AllowOrDeny::Deny), - }; - ( - false, + RouterTargetInternal::Ip(virt_ip) => { + match self.v2p.get(&virt_ip) { + Some(phys) => ( + true, PhysNet { - ether: MacAddr::from(TUNNEL_ENDPOINT_MAC), - ip: target.ip, - vni: target.vni, + ether: phys.ether, + ip: phys.ip, + vni: self.vni, }, false, - ) + ), + + // The router target has specified a VPC IP we do not + // currently know about; this could be for two + // reasons: + // + // 1. No such IP currently exists in the guest's VPC. + // + // 2. The destination IP exists in the guest's VPC, + // but we do not yet have a mapping for it. + // + // We cannot differentiate these cases from the point + // of view of this code without more information from + // the control plane; rather we drop the packet. If we + // are dealing with scenario (2), the control plane + // should eventually provide us with a mapping. 
+ None => return Ok(AllowOrDeny::Deny), } - None => return Ok(AllowOrDeny::Deny), } - } - // Multicast target - use M2P mapping to get the multicast underlay address. - // The router has determined this packet should be multicast forwarded. - RouterTargetInternal::Multicast(_) => { - // Fleet-level multicast mappings live under DEFAULT_MULTICAST_VNI. - // Look up the underlay multicast IPv6 for this group using the - // global VPC mappings and encapsulate with the fleet multicast VNI. - let mvni = Vni::new(DEFAULT_MULTICAST_VNI).unwrap(); - match self.vpc_map.get_mcast_underlay(mvni, dst_ip) { - Some(underlay) => ( - true, - PhysNet { - ether: underlay.dst_mac(), - ip: underlay.0, - vni: mvni, - }, - true, - ), - None => { - // No mapping configured for this group; deny. - return Ok(AllowOrDeny::Deny); + RouterTargetInternal::VpcSubnet(_) => { + match self.v2p.get(&flow_id.dst_ip()) { + Some(phys) => ( + true, + PhysNet { + ether: phys.ether, + ip: phys.ip, + vni: self.vni, + }, + false, + ), + + // The guest is attempting to contact a VPC IP we + // do not currently know about; this could be for + // two reasons: + // + // 1. No such IP currently exists in the guest's VPC. + // + // 2. The destination IP exists in the guest's + // VPC, but we do not yet have a mapping for + // it. + // + // We cannot differentiate these cases from the + // point of view of this code without more + // information from the control plane; rather we + // drop the packet. If we are dealing with + // scenario (2), the control plane should + // eventually provide us with a mapping. + None => return Ok(AllowOrDeny::Deny), } } } - - RouterTargetInternal::Ip(virt_ip) => match self - .vni_state - .v2p - .get(&virt_ip) - { - Some(phys) => ( - true, - PhysNet { ether: phys.ether, ip: phys.ip, vni: self.vni }, - false, - ), - - // The router target has specified a VPC IP we do not - // currently know about; this could be for two - // reasons: - // - // 1. 
No such IP currently exists in the guest's VPC. - // - // 2. The destination IP exists in the guest's VPC, - // but we do not yet have a mapping for it. - // - // We cannot differentiate these cases from the point - // of view of this code without more information from - // the control plane; rather we drop the packet. If we - // are dealing with scenario (2), the control plane - // should eventually provide us with a mapping. - None => return Ok(AllowOrDeny::Deny), - }, - - RouterTargetInternal::VpcSubnet(_) => { - match self.vni_state.v2p.get(&flow_id.dst_ip()) { - Some(phys) => ( - true, - PhysNet { - ether: phys.ether, - ip: phys.ip, - vni: self.vni, - }, - false, - ), - - // The guest is attempting to contact a VPC IP we - // do not currently know about; this could be for - // two reasons: - // - // 1. No such IP currently exists in the guest's VPC. - // - // 2. The destination IP exists in the guest's - // VPC, but we do not yet have a mapping for - // it. - // - // We cannot differentiate these cases from the - // point of view of this code without more - // information from the control plane; rather we - // drop the packet. If we are dealing with - // scenario (2), the control plane should - // eventually provide us with a mapping. - None => return Ok(AllowOrDeny::Deny), - } - } }; action_meta.set_internal_target(is_internal); @@ -371,10 +384,17 @@ impl StaticAction for EncapAction { data: Cow::Borrowed(GENEVE_MSS_SIZE_OPT_BODY), }; - // For multicast originated from this host, we set External replication. - // The actual replication scope will be determined by the mcast_fwd table. + // For multicast originated from this host, we seed the multicast Geneve + // option with `External` replication. XDE will then select the actual + // replication per next-hop based on the rack-wide forwarding table + // (mcast_fwd), which tells the switch which ports to replicate to + // (external, underlay, or bifurcated). 
+ // + // Local same-sled delivery to subscribed guests is always performed by + // OPTE, independent of the replication mode (not an access control mechanism). + // // The first byte encodes Replication in the top 2 bits: - // External=0x00, Underlay=0x40, All=0x80, Reserved=0xC0 + // External=0x00, Underlay=0x40, Both=0x80, Reserved=0xC0 const REPLICATION_EXTERNAL_BYTE: u8 = (Replication::External as u8) << 6; static GENEVE_MCAST_OPT_BODY: &[u8] = &[ @@ -412,12 +432,7 @@ impl StaticAction for EncapAction { proto: Protocol::UDP, exts: Cow::Borrowed(&[]), }); - match Valid::validated(ip_push) { - Ok(v) => v, - Err(e) => { - return Err(e.into()); - } - } + Valid::validated(ip_push)? }), // XXX Geneve uses the UDP source port as a flow label // value for the purposes of ECMP -- a hash of the @@ -569,8 +584,71 @@ impl StaticAction for DecapAction { } } +/// Validate VNI for inbound multicast traffic in the overlay layer. +/// +/// All outbound multicast packets are currently encapsulated with VNI 77 +/// (DEFAULT_MULTICAST_VNI) for fleet-wide delivery. See [`EncapAction::gen_ht`]. +/// +/// ## Validation Policy on RX Path +/// This validator accepts multicast packets with either of two VNI values: +/// - **VNI 77 (DEFAULT_MULTICAST_VNI)**: Fleet-wide multicast, accepted by all +/// ports regardless of VPC. This enables rack-wide multicast delivery. +/// - **Guest's VPC VNI**: Enables per-VPC multicast isolation **in the future**. +/// +/// The validator enforces VPC isolation by rejecting multicast packets with +/// VNI values that don't match either the fleet-wide VNI or this port's VPC. 
+struct MulticastVniValidator {
+    my_vni: Vni,
+}
+
+impl MulticastVniValidator {
+    fn new(vni: Vni) -> Self {
+        Self { my_vni: vni }
+    }
+}
+
+impl MetaAction for MulticastVniValidator {
+    fn mod_meta(
+        &self,
+        flow: &InnerFlowId,
+        action_meta: &mut ActionMeta,
+    ) -> ModMetaResult {
+        // Only validate if this is multicast traffic
+        if !flow.dst_ip().is_multicast() {
+            return Ok(AllowOrDeny::Allow(()));
+        }
+
+        // Check VNI from action metadata (set by DecapAction)
+        if let Some(vni_str) = action_meta.get(ACTION_META_VNI)
+            && let Ok(vni_val) = vni_str.parse::()
+            && let Ok(pkt_vni) = Vni::new(vni_val)
+        {
+            let mcast_vni = Vni::new(DEFAULT_MULTICAST_VNI).unwrap();
+            // Allow if VNI matches this VPC or fleet-wide multicast VNI
+            if pkt_vni == self.my_vni || pkt_vni == mcast_vni {
+                return Ok(AllowOrDeny::Allow(()));
+            }
+            // VNI mismatch - deny (fail closed for cross-VPC multicast)
+            return Ok(AllowOrDeny::Deny);
+        }
+        // No VNI in metadata (or an unparsable value) - allow
+        // (external packets don't have ACTION_META_VNI set per DecapAction logic)
+        Ok(AllowOrDeny::Allow(()))
+    }
+
+    fn implicit_preds(&self) -> (Vec, Vec) {
+        (vec![], vec![])
+    }
+}
+
+impl fmt::Display for MulticastVniValidator {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "mcast-vni-validator")
+    }
+}
+
 pub struct VpcMappings {
-    inner: KMutex>>,
+    inner: KMutex>>,
 }
 
 impl VpcMappings {
@@ -581,16 +659,16 @@ impl VpcMappings {
 
     /// Add a new mapping from VIP to [`PhysNet`], returning a pointer
     /// to the [`Virt2Phys`] this mapping belongs to.
-    pub fn add(&self, vip: IpAddr, phys: PhysNet) -> Arc {
+    pub fn add(&self, vip: IpAddr, phys: PhysNet) -> Arc {
         // We convert to GuestPhysAddr because it saves us from
        // redundant storage of the VNI. 
let guest_phys = GuestPhysAddr::from(phys); let mut lock = self.inner.lock(); - let state = lock.entry(phys.vni).or_default(); - state.v2p.set(vip, guest_phys); + let v2p = lock.entry(phys.vni).or_default(); + v2p.set(vip, guest_phys); - state.clone() + v2p.clone() } /// Delete the mapping for the given VIP in the given VNI. @@ -598,7 +676,7 @@ impl VpcMappings { /// Return the existing entry, if there is one. pub fn del(&self, vip: &IpAddr, phys: &PhysNet) -> Option { match self.inner.lock().get(&phys.vni) { - Some(state) => state.v2p.remove(vip).map(|guest_phys| PhysNet { + Some(v2p) => v2p.remove(vip).map(|guest_phys| PhysNet { ether: guest_phys.ether, ip: guest_phys.ip, vni: phys.vni, @@ -613,13 +691,11 @@ impl VpcMappings { let mut mappings = Vec::new(); let lock = self.inner.lock(); - for (vni, state) in lock.iter() { + for (vni, v2p) in lock.iter() { mappings.push(VpcMapResp { vni: *vni, - ip4: state.v2p.dump_ip4(), - ip6: state.v2p.dump_ip6(), - mcast_ip4: state.m2p.dump_ip4(), - mcast_ip6: state.m2p.dump_ip6(), + ip4: v2p.dump_ip4(), + ip6: v2p.dump_ip6(), }); } @@ -633,72 +709,14 @@ impl VpcMappings { /// assumption is enforced by the control plane; making sure that /// peered VPCs do not overlap their VIP ranges. pub fn ip_to_vni(&self, vip: &IpAddr) -> Option { - for (vni, state) in self.inner.lock().iter() { - if state.v2p.get(vip).is_some() { + for (vni, v2p) in self.inner.lock().iter() { + if v2p.get(vip).is_some() { return Some(*vni); } } None } - - /// Add a multicast forwarding entry from a multicast group IP to a physical - /// underlay IP. 
- /// - /// Returns an error if: - /// - The VNI is not DEFAULT_MULTICAST_VNI - /// - The underlay address is not a valid IPv6 multicast address - pub fn add_mcast( - &self, - group: IpAddr, - underlay: Ipv6Addr, - vni: Vni, - ) -> Result, OpteError> { - // Validate VNI is DEFAULT_MULTICAST_VNI for fleet-level multicast - if vni.as_u32() != DEFAULT_MULTICAST_VNI { - return Err(OpteError::System { - errno: illumos_sys_hdrs::EINVAL, - msg: format!( - "multicast VNI must be DEFAULT_MULTICAST_VNI ({DEFAULT_MULTICAST_VNI}), got: {}", - vni.as_u32() - ), - }); - } - - let mut lock = self.inner.lock(); - let state = lock.entry(vni).or_default(); - - let mcast_underlay = MulticastUnderlay::new(underlay).ok_or_else(|| { - OpteError::InvalidUnderlayMulticast(format!( - "underlay address must be an administratively-scoped multicast address \ - (scope 0x4/admin-local, 0x5/site-local, or 0x8/organization-local): {underlay}", - )) - })?; - - state.m2p.set(group, mcast_underlay); - Ok(state.clone()) - } - - /// Delete a multicast forwarding entry. - pub fn del_mcast(&self, group: IpAddr, _underlay: Ipv6Addr, vni: Vni) { - let mut lock = self.inner.lock(); - if let Some(state) = lock.get_mut(&vni) { - state.m2p.remove(&group); - } - } - - /// Get the underlay multicast for a given VNI and overlay multicast group. - pub fn get_mcast_underlay( - &self, - vni: Vni, - group: IpAddr, - ) -> Option { - let lock = self.inner.lock(); - lock.get(&vni).and_then(|state| match group { - IpAddr::Ip4(ip4) => state.m2p.ip4.lock().get(&ip4).copied(), - IpAddr::Ip6(ip6) => state.m2p.ip6.lock().get(&ip6).copied(), - }) - } } impl Default for VpcMappings { @@ -749,29 +767,15 @@ pub struct Virt2Boundary { pt6: KRwLock>>, } -// NOTE: This is structurally similar to V2P mapping, but maps to MulticastUnderlay -// which wraps only an IPv6 address. The destination MAC is derived algorithmically -// from the IPv6 multicast address rather than stored explicitly. 
/// A mapping from inner multicast destination IPs to underlay multicast groups. /// -/// Validation is enforced through the `MulticastUnderlay` newtype wrapper, which -/// ensures only valid IPv6 multicast addresses can be stored. +/// Validation is enforced at the API boundary (see xde.rs set_m2p_hdlr) to ensure +/// only valid admin-local IPv6 multicast addresses (ff04::/16) are stored. pub struct Mcast2Phys { ip4: KMutex>, ip6: KMutex>, } -/// Per-VNI mapping state containing both unicast and multicast address mappings. -/// -/// This struct holds all address-to-physical mappings organized by VNI: -/// - `v2p`: Unicast virtual IPs to physical locations -/// - `m2p`: Multicast group IPs to physical underlay addresses -#[derive(Default)] -pub struct PerVniMaps { - pub v2p: Virt2Phys, - pub m2p: Mcast2Phys, -} - pub const TUNNEL_ENDPOINT_MAC: [u8; 6] = [0xA8, 0x40, 0x25, 0x77, 0x77, 0x77]; impl Virt2Boundary { @@ -1021,35 +1025,12 @@ impl Default for Mcast2Phys { } } -/// An overlay multicast group address mapped to the underlay (outer) IPv6 multicast address. -/// -/// This type ensures that the wrapped IPv6 address is a valid multicast address -/// with administrative scope (admin-local, site-local, or organization-local). +/// Transparent wrapper for underlay IPv6 multicast addresses. /// -/// Administrative scopes per RFC 4291 and RFC 7346: -/// - `0x4`: admin-local scope -/// - `0x5`: site-local scope -/// - `0x8`: organization-local scope +/// This newtype exists only to satisfy the orphan rule for implementing +/// `ResourceEntry`. Validation is performed at the API boundary (xde.rs). #[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)] -pub struct MulticastUnderlay(Ipv6Addr); - -impl MulticastUnderlay { - /// Create a new `MulticastUnderlay` if the address is a valid - /// administratively-scoped multicast IPv6 address (scope 0x4, 0x5, or 0x8). 
- pub fn new(addr: Ipv6Addr) -> Option { - if addr.is_admin_scoped_multicast() { Some(Self(addr)) } else { None } - } - - /// Return the underlying IPv6 multicast address. - pub fn addr(&self) -> Ipv6Addr { - self.0 - } - - /// Return the destination MAC address derived from the IPv6 multicast address. - fn dst_mac(&self) -> MacAddr { - self.0.unchecked_multicast_mac() - } -} +pub struct MulticastUnderlay(pub Ipv6Addr); impl Resource for Mcast2Phys {} impl ResourceEntry for MulticastUnderlay {} diff --git a/lib/oxide-vpc/src/engine/router.rs b/lib/oxide-vpc/src/engine/router.rs index 11263c63..a42b8120 100644 --- a/lib/oxide-vpc/src/engine/router.rs +++ b/lib/oxide-vpc/src/engine/router.rs @@ -65,7 +65,6 @@ pub enum RouterTargetInternal { InternetGateway(Option), Ip(IpAddr), VpcSubnet(IpCidr), - Multicast(IpCidr), } impl RouterTargetInternal { @@ -87,7 +86,6 @@ impl RouterTargetInternal { } RouterTargetInternal::Ip(_) => RouterTargetClass::Ip, RouterTargetInternal::VpcSubnet(_) => RouterTargetClass::VpcSubnet, - RouterTargetInternal::Multicast(_) => RouterTargetClass::Multicast, } } } @@ -119,16 +117,6 @@ impl ActionMetaValue for RouterTargetInternal { Ok(Self::VpcSubnet(IpCidr::Ip6(cidr6))) } - Some(("mcast4", cidr4_s)) => { - let cidr4 = cidr4_s.parse::()?; - Ok(Self::Multicast(IpCidr::Ip4(cidr4))) - } - - Some(("mcast6", cidr6_s)) => { - let cidr6 = cidr6_s.parse::()?; - Ok(Self::Multicast(IpCidr::Ip6(cidr6))) - } - Some(("ig", ig)) => { let ig = ig.parse::().map_err(|e| e.to_string())?; Ok(Self::InternetGateway(Some(ig))) @@ -153,12 +141,6 @@ impl ActionMetaValue for RouterTargetInternal { Self::VpcSubnet(IpCidr::Ip6(cidr6)) => { format!("sub6={cidr6}").into() } - Self::Multicast(IpCidr::Ip4(mcast4)) => { - format!("mcast4={mcast4}").into() - } - Self::Multicast(IpCidr::Ip6(mcast6)) => { - format!("mcast6={mcast6}").into() - } } } } @@ -169,7 +151,6 @@ impl fmt::Display for RouterTargetInternal { Self::InternetGateway(addr) => format!("IG({addr:?})"), 
Self::Ip(addr) => format!("IP: {addr}"), Self::VpcSubnet(sub) => format!("Subnet: {sub}"), - Self::Multicast(mcast) => format!("Multicast: {mcast}"), }; write!(f, "{s}") } @@ -180,7 +161,6 @@ pub enum RouterTargetClass { InternetGateway, Ip, VpcSubnet, - Multicast, } impl ActionMetaValue for RouterTargetClass { @@ -191,7 +171,6 @@ impl ActionMetaValue for RouterTargetClass { "ig" => Ok(Self::InternetGateway), "ip" => Ok(Self::Ip), "subnet" => Ok(Self::VpcSubnet), - "mcast" => Ok(Self::Multicast), _ => Err(format!("bad router target class: {s}")), } } @@ -201,7 +180,6 @@ impl ActionMetaValue for RouterTargetClass { Self::InternetGateway => "ig".into(), Self::Ip => "ip".into(), Self::VpcSubnet => "subnet".into(), - Self::Multicast => "mcast".into(), } } } @@ -212,7 +190,6 @@ impl fmt::Display for RouterTargetClass { Self::InternetGateway => write!(f, "IG"), Self::Ip => write!(f, "IP"), Self::VpcSubnet => write!(f, "Subnet"), - Self::Multicast => write!(f, "Multicast"), } } } @@ -290,7 +267,27 @@ pub fn setup( default_out: DefaultAction::Deny, }; - let layer = Layer::new(ROUTER_LAYER_NAME, pb.name(), actions, ft_limit); + let mut layer = Layer::new(ROUTER_LAYER_NAME, pb.name(), actions, ft_limit); + + // Allow IPv6 multicast (ff00::/8) to bypass route lookup. + // Multicast operates fleet-wide via M2P mappings, not through VPC routing. + // The overlay addresses use any valid multicast prefix; underlay restriction + // to ff04::/16 is enforced by M2P mapping validation. + let mut mcast_out = + Rule::new(0, Action::Meta(Arc::new(MulticastPassthrough))); + mcast_out.add_predicate(Predicate::InnerDstIp6(vec![ + Ipv6AddrMatch::Prefix(Ipv6Cidr::MCAST), + ])); + layer.add_rule(Direction::Out, mcast_out.finalize()); + + // Allow IPv4 multicast (224.0.0.0/4) to bypass route lookup. 
+ let mut mcast_out_v4 = + Rule::new(0, Action::Meta(Arc::new(MulticastPassthrough))); + mcast_out_v4.add_predicate(Predicate::InnerDstIp4(vec![ + Ipv4AddrMatch::Prefix(Ipv4Cidr::MCAST), + ])); + layer.add_rule(Direction::Out, mcast_out_v4.finalize()); + pb.add_layer(layer, Pos::After(fw::FW_LAYER_NAME)) } @@ -301,8 +298,6 @@ fn valid_router_dest_target_pair(dest: &IpCidr, target: &RouterTarget) -> bool { (_, RouterTarget::Drop) | // Internet gateways are valid for any IP family. (_, RouterTarget::InternetGateway(_)) | - // Multicast targets are valid for any IP family - (_, RouterTarget::Multicast(_)) | // IPv4 destination, IPv4 address (IpCidr::Ip4(_), RouterTarget::Ip(IpAddr::Ip4(_))) | // IPv4 destination, IPv4 subnet @@ -319,6 +314,22 @@ fn make_rule( target: RouterTarget, class: RouterClass, ) -> Result, OpteError> { + // Reject router entries with multicast destination CIDRs. + // Multicast operates fleet-wide via M2P mappings and subscriptions, + // not through VPC routing. Router layer allows multicast through + // unconditionally without route lookup. 
+ let is_mcast_dst = match dest { + IpCidr::Ip4(cidr) => cidr.ip().is_multicast(), + IpCidr::Ip6(cidr) => cidr.ip().is_multicast(), + }; + if is_mcast_dst { + return Err(OpteError::InvalidRouterEntry { + dest, + target: "multicast destinations not allowed in router entries" + .to_string(), + }); + } + if !valid_router_dest_target_pair(&dest, &target) { return Err(OpteError::InvalidRouterEntry { dest, @@ -387,22 +398,6 @@ fn make_rule( ))); (predicate, action) } - - RouterTarget::Multicast(mcast) => { - let predicate = match dest { - IpCidr::Ip4(ip4) => { - Predicate::InnerDstIp4(vec![Ipv4AddrMatch::Prefix(ip4)]) - } - - IpCidr::Ip6(ip6) => { - Predicate::InnerDstIp6(vec![Ipv6AddrMatch::Prefix(ip6)]) - } - }; - let action = Action::Meta(Arc::new(RouterAction::new( - RouterTargetInternal::Multicast(mcast), - ))); - (predicate, action) - } }; let priority = compute_rule_priority(&dest, class); @@ -462,6 +457,29 @@ pub fn replace( Ok(NoResp::default()) } +/// Passthrough action for multicast traffic that bypasses route lookup. +struct MulticastPassthrough; + +impl fmt::Display for MulticastPassthrough { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "multicast-passthrough") + } +} + +impl MetaAction for MulticastPassthrough { + fn implicit_preds(&self) -> (Vec, Vec) { + (vec![], vec![]) + } + + fn mod_meta( + &self, + _flow_id: &InnerFlowId, + _meta: &mut ActionMeta, + ) -> ModMetaResult { + Ok(AllowOrDeny::Allow(())) + } +} + // TODO I may want to have different types of rule/flow tables a layer // can have. Up to this point the tables consist of `Rule` entires; // matching arbitrary header predicates to a `RuleAction`. I may want diff --git a/lib/oxide-vpc/src/print.rs b/lib/oxide-vpc/src/print.rs index f69a8b4c..5a014702 100644 --- a/lib/oxide-vpc/src/print.rs +++ b/lib/oxide-vpc/src/print.rs @@ -10,6 +10,7 @@ //! can be used by both opteadm and integration tests. 
use crate::api::DumpMcastForwardingResp; +use crate::api::DumpMcastSubscriptionsResp; use crate::api::DumpVirt2BoundaryResp; use crate::api::DumpVirt2PhysResp; use crate::api::GuestPhysAddr; @@ -164,10 +165,42 @@ pub fn print_mcast_fwd_into( writeln!( t, "{}\t{}\t{}\t{replication:?}", - entry.group, next_hop.addr, next_hop.vni + entry.underlay, next_hop.addr, next_hop.vni )?; } } writeln!(t)?; t.flush() } + +/// Print the header for the [`print_mcast_subs()`] output. +fn print_mcast_subs_header(t: &mut impl Write) -> std::io::Result<()> { + writeln!(t, "UNDERLAY GROUP\tSUBSCRIBED PORTS") +} + +/// Print a [`DumpMcastSubscriptionsResp`]. +pub fn print_mcast_subs( + resp: &DumpMcastSubscriptionsResp, +) -> std::io::Result<()> { + print_mcast_subs_into(&mut std::io::stdout(), resp) +} + +/// Print a [`DumpMcastSubscriptionsResp`] into a given writer. +pub fn print_mcast_subs_into( + writer: &mut impl Write, + resp: &DumpMcastSubscriptionsResp, +) -> std::io::Result<()> { + let mut t = TabWriter::new(writer); + writeln!(t, "Multicast Subscriptions")?; + write_hrb(&mut t)?; + writeln!(t)?; + print_mcast_subs_header(&mut t)?; + write_hr(&mut t)?; + + for entry in &resp.entries { + let ports = entry.ports.join(", "); + writeln!(t, "{}\t{ports}", entry.underlay)?; + } + writeln!(t)?; + t.flush() +} diff --git a/lib/oxide-vpc/tests/integration_tests.rs b/lib/oxide-vpc/tests/integration_tests.rs index 57a1d541..8b8a45e5 100644 --- a/lib/oxide-vpc/tests/integration_tests.rs +++ b/lib/oxide-vpc/tests/integration_tests.rs @@ -47,6 +47,7 @@ use opte::engine::parse::ValidUlp; use opte::engine::port::DropReason; use opte::engine::port::ProcessError; use opte::engine::port::ProcessResult; +use opte::engine::rule::MappingResource; use opte::engine::tcp::TIME_WAIT_EXPIRE_SECS; use opte::ingot::ethernet::Ethertype; use opte::ingot::geneve::GeneveRef; @@ -498,7 +499,7 @@ fn guest_to_guest_no_route() { g1.vpc_map.add(g2_cfg.ipv4().private_ip.into(), g2_cfg.phys_addr()); g1.port.start(); 
set!(g1, "port_state=running"); - // Make sure the router is configured to drop all packets. + // Make sure the router is configured to drop all packets except multicast. router::del_entry( &g1.port, IpCidr::Ip4(g1_cfg.ipv4().vpc_subnet), @@ -506,7 +507,7 @@ fn guest_to_guest_no_route() { RouterClass::System, ) .unwrap(); - update!(g1, ["incr:epoch", "set:router.rules.out=0"]); + update!(g1, ["incr:epoch", "set:router.rules.out=2"]); let mut pkt1_m = http_syn(&g1_cfg, &g2_cfg); let pkt1 = parse_outbound(&mut pkt1_m, VpcParser {}).unwrap(); let res = g1.port.process(Out, pkt1); @@ -2543,8 +2544,8 @@ fn test_gateway_neighbor_advert_reply() { .unwrap_or_else(|| String::from("Drop")); panic!( "Generated unexpected packet from NS: {}\n\ - Result: {:?}\nExpected: {}", - d.ns, res, na, + Result: {res:?}\nExpected: {na}", + d.ns ); } }; @@ -4837,24 +4838,19 @@ fn test_ipv6_multicast_encapsulation() { ]); // Add multicast forwarding entry BEFORE starting the port - let mcast_vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI).unwrap(); - g1.vpc_map.add_mcast(mcast_dst.into(), mcast_underlay, mcast_vni).unwrap(); + g1.m2p.set( + mcast_dst.into(), + oxide_vpc::engine::overlay::MulticastUnderlay(mcast_underlay), + ); g1.port.start(); set!(g1, "port_state=running"); - // Add router entry for IPv6 multicast traffic (ff00::/8) via Multicast target - router::add_entry( - &g1.port, - IpCidr::Ip6("ff00::/8".parse().unwrap()), - RouterTarget::Multicast(IpCidr::Ip6("ff00::/8".parse().unwrap())), - RouterClass::System, - ) - .unwrap(); - incr!(g1, ["epoch", "router.rules.out"]); + // Multicast traffic is now detected automatically by checking if the destination + // IP is a multicast address. No router entries are needed for multicast since it + // operates at the fleet level (cross-VPC) rather than within VPC routing. 
// Build a UDP packet to the multicast address - // (TCP + multicast is incompatible and would be denied) let eth = Ethernet { destination: MacAddr::from([0x33, 0x33, 0x00, 0x01, 0x00, 0x03]), source: g1_cfg.guest_mac, @@ -4877,10 +4873,10 @@ fn test_ipv6_multicast_encapsulation() { let mut pkt_m = ulp_pkt(eth, ip, udp, &[]); let pkt = parse_outbound(&mut pkt_m, GenericUlp {}).unwrap(); - let res = g1.port.process(Out, pkt); + let res = g1.port.process(Out, pkt).expect("process should succeed"); // Verify packet was encapsulated - let Ok(Modified(spec)) = res else { + let Modified(spec) = res else { panic!("Expected Modified result, got {res:?}"); }; let mut pkt_m = spec.apply(pkt_m); @@ -4904,7 +4900,8 @@ fn test_ipv6_multicast_encapsulation() { ); // Verify the outer Ethernet destination MAC is the IPv6 multicast MAC - // For IPv6 multicast, MAC is 33:33:xx:xx:xx:xx where xx:xx:xx:xx are the last 4 bytes of the IPv6 address + // For IPv6 multicast, MAC is 33:33:xx:xx:xx:xx where xx:xx:xx:xx are the + // last 4 bytes of the IPv6 address let expected_outer_mac = mcast_underlay.multicast_mac().unwrap(); assert_eq!( meta.outer_eth.destination(), @@ -4915,7 +4912,7 @@ fn test_ipv6_multicast_encapsulation() { // Verify we have Geneve encapsulation with the correct VNI (fleet multicast VNI) assert_eq!( meta.outer_encap.vni(), - mcast_vni, + Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI).unwrap(), "Geneve VNI should match DEFAULT_MULTICAST_VNI" ); @@ -4946,21 +4943,14 @@ fn test_tcp_multicast_denied() { 0x00, 0x01, 0xff, 0xff, ]); - let mcast_vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI).unwrap(); - g1.vpc_map.add_mcast(mcast_dst.into(), mcast_underlay, mcast_vni).unwrap(); + g1.m2p.set( + mcast_dst.into(), + oxide_vpc::engine::overlay::MulticastUnderlay(mcast_underlay), + ); g1.port.start(); set!(g1, "port_state=running"); - router::add_entry( - &g1.port, - IpCidr::Ip6("ff00::/8".parse().unwrap()), - 
RouterTarget::Multicast(IpCidr::Ip6("ff00::/8".parse().unwrap())), - RouterClass::System, - ) - .unwrap(); - incr!(g1, ["epoch", "router.rules.out"]); - // Build a TCP packet to the multicast address (should be denied) let mut pkt_m = http_syn3( g1_cfg.guest_mac, @@ -4975,14 +4965,13 @@ fn test_tcp_multicast_denied() { let res = g1.port.process(Out, pkt); // Verify packet was denied (TCP + multicast is incompatible) - match res { - Ok(Hairpin(_)) => panic!("Expected packet to be denied, got Hairpin"), - Ok(Modified(_)) => panic!("Expected packet to be denied, got Modified"), - Ok(ProcessResult::Drop { reason: DropReason::Layer { .. } }) => { - // Expected - TCP + multicast is denied by overlay layer - } - other => panic!("Expected Drop with Layer reason, got: {:?}", other), - } + assert!( + matches!( + res, + Ok(ProcessResult::Drop { reason: DropReason::Layer { .. } }) + ), + "Expected Drop with Layer reason, got: {res:?}" + ); } // Ensure packets with unknown critical Geneve options are rejected during diff --git a/xde-tests/src/lib.rs b/xde-tests/src/lib.rs index d2908fe3..0b4a25bd 100644 --- a/xde-tests/src/lib.rs +++ b/xde-tests/src/lib.rs @@ -53,6 +53,47 @@ use std::time::Instant; use zone::Zlogin; pub use ztest::*; +/// Ensure a zone with the given name is not present. +/// +/// Best-effort: attempt halt and uninstall, then poll until the zone +/// disappears from `zoneadm list -cv` (bounded timeout). 
+fn ensure_zone_absent(name: &str) -> Result<()> { + // Try to halt if running; ignore failures and suppress stderr + let _ = Command::new("pfexec") + .arg("zoneadm") + .args(["-z", name, "halt"]) + .stderr(Stdio::null()) + .status(); + + // Try to uninstall; ignore failures and suppress stderr + let _ = Command::new("pfexec") + .arg("zoneadm") + .args(["-z", name, "uninstall", "-F"]) + .stderr(Stdio::null()) + .status(); + + // Poll for disappearance up to 10 seconds + let deadline = Instant::now() + Duration::from_secs(10); + loop { + let out = Command::new("pfexec") + .arg("zoneadm") + .args(["list", "-cv"]) + .output()?; + let stdout = String::from_utf8_lossy(&out.stdout).to_string(); + if !stdout.contains(name) { + break; + } + if Instant::now() >= deadline { + bail!( + "zone '{name}' still present after uninstall attempts; stdout: {stdout}" + ); + } + std::thread::sleep(Duration::from_millis(100)); + } + + Ok(()) +} + /// The IPv4 overlay network used in all tests. pub const OVERLAY_NET: &str = "10.0.0.0/24"; /// The IPv4 overlay OPTE gateway used in all tests. @@ -73,6 +114,9 @@ impl OpteZone { /// of interfaces. In illumos parlance, the interfaces are data link /// devices. fn new(name: &str, zfs: &Zfs, ifx: &[&str], brand: &str) -> Result { + // Ensure any prior zone with this name is fully removed before creating + // a new one, to avoid flakes from leftover state. + let _ = ensure_zone_absent(name); let zone = Zone::new(name, brand, zfs, ifx, &[])?; Ok(Self { zone }) } @@ -80,11 +124,15 @@ impl OpteZone { /// Wait for the network to come up, then set up the IPv4 overlay network. 
fn setup(&self, devname: &str, addr: String) -> Result<()> { self.zone.wait_for_network()?; - // Configure IPv4 via DHCP - self.zone - .zexec(&format!("ipadm create-addr -t -T dhcp {devname}/test"))?; + // Configure IPv4 with static address (immediate, no DHCP wait) + self.zone.zexec(&format!( + "ipadm create-addr -t -T static -a {addr}/24 {devname}/test" + ))?; + self.zone.zexec(&format!("route add -iface {OVERLAY_GW} {addr}"))?; self.zone.zexec(&format!("route add {OVERLAY_NET} {OVERLAY_GW}"))?; + // Add multicast route so multicast traffic goes through the OPTE gateway + self.zone.zexec(&format!("route add 224.0.0.0/4 {OVERLAY_GW}"))?; Ok(()) } @@ -96,9 +144,10 @@ impl OpteZone { ipv6_addr: String, ) -> Result<()> { self.zone.wait_for_network()?; - // Configure IPv4 via DHCP - self.zone - .zexec(&format!("ipadm create-addr -t -T dhcp {devname}/testv4"))?; + // Configure IPv4 with static address (immediate, no DHCP wait) + self.zone.zexec(&format!( + "ipadm create-addr -t -T static -a {ipv4_addr}/24 {devname}/testv4" + ))?; self.zone .zexec(&format!("route add -iface {OVERLAY_GW} {ipv4_addr}"))?; self.zone.zexec(&format!("route add {OVERLAY_NET} {OVERLAY_GW}"))?; @@ -119,6 +168,41 @@ impl OpteZone { self.zone.zexec(&format!( "route add -inet6 {OVERLAY_NET_V6} {OVERLAY_GW_V6}" ))?; + // Add multicast routes so multicast traffic goes through the OPTE gateway + self.zone.zexec(&format!("route add 224.0.0.0/4 {OVERLAY_GW}"))?; + self.zone + .zexec(&format!("route add -inet6 ff04::/16 {OVERLAY_GW_V6}"))?; + Ok(()) + } + + /// Send a single UDP packet (IPv4) from this zone using netcat. + /// Pins the source address with `-s` for deterministic egress selection. + pub fn send_udp_v4( + &self, + src_ip: &str, + dst_ip: &str, + port: u16, + payload: &str, + ) -> Result<()> { + let cmd = + format!("echo '{payload}' | nc -u -s {src_ip} -w1 {dst_ip} {port}"); + self.zone.zexec(&cmd)?; + Ok(()) + } + + /// Send a single UDP packet (IPv6) from this zone using netcat. 
+ /// Uses `-s` with the IPv6 source for deterministic egress. + /// Avoids `-6` for illumos netcat compatibility (destination selects family). + pub fn send_udp_v6( + &self, + src_ip: &str, + dst_ip: &str, + port: u16, + payload: &str, + ) -> Result<()> { + let cmd = + format!("echo '{payload}' | nc -u -s {src_ip} -w1 {dst_ip} {port}"); + self.zone.zexec(&cmd)?; Ok(()) } } @@ -227,7 +311,7 @@ impl OptePort { let adm = OpteHdl::open()?; adm.add_router_entry(&AddRouterEntryReq { port_name: self.name.clone(), - dest: IpCidr::Ip4(format!("{}/32", dest).parse().unwrap()), + dest: IpCidr::Ip4(format!("{dest}/32").parse().unwrap()), target: RouterTarget::Ip(dest.parse().unwrap()), class: RouterClass::System, })?; @@ -308,15 +392,14 @@ impl OptePort { Ok(()) } - /// Add a multicast router entry for this port. + /// Allow multicast CIDR traffic for this port. + /// + /// Multicast is handled automatically by the gateway layer, so we just + /// need to allow the CIDR through the firewall in both directions. 
pub fn add_multicast_router_entry(&self, cidr: IpCidr) -> Result<()> { - let adm = OpteHdl::open()?; - adm.add_router_entry(&AddRouterEntryReq { - port_name: self.name.clone(), - dest: cidr, - target: RouterTarget::Multicast(cidr), - class: RouterClass::System, - })?; + // Allow multicast traffic in both directions + self.allow_cidr(cidr, Direction::In)?; + self.allow_cidr(cidr, Direction::Out)?; Ok(()) } @@ -334,12 +417,14 @@ impl Drop for OptePort { let adm = match OpteHdl::open() { Ok(adm) => adm, Err(e) => { - eprintln!("failed to open xde device on drop: {}", e); + eprintln!("failed to open xde device on drop: {e}"); return; } }; // Clean up multicast subscriptions + // Note: unsubscribe is now idempotent with respect to M2P mappings, + // so we only need to handle actual errors (e.g., port doesn't exist) let subscriptions = self.mcast_subscriptions.borrow().clone(); for group in subscriptions { if let Err(e) = adm.mcast_unsubscribe(&McastUnsubscribeReq { @@ -391,8 +476,33 @@ impl Drop for Xde { fn drop(&mut self) { // Clear underlay to release references to simnet/vnic devices, // allowing their cleanup to proceed. Driver remains loaded. + // + // Retry with backoff if EBUSY (in-flight TX may briefly hold refs). + // After cache clearing + siphon quiesce, refs should drain quickly. 
if let Ok(adm) = OpteHdl::open() { - let _ = adm.clear_xde_underlay(); + for attempt in 1..=10 { + match adm.clear_xde_underlay() { + Ok(_) => { + if attempt > 1 { + eprintln!( + "clear_xde_underlay succeeded on attempt {attempt}" + ); + } + return; + } + Err(e) if e.to_string().contains("EBUSY") => { + eprintln!( + "clear_xde_underlay returned EBUSY on attempt {attempt}/10; retrying after 10ms" + ); + std::thread::sleep(Duration::from_millis(10)); + } + Err(e) => { + eprintln!("failed to clear xde underlay: {e}"); + return; + } + } + } + eprintln!("failed to clear xde underlay after 10 retries (EBUSY)"); } } } @@ -406,12 +516,13 @@ pub struct SnoopGuard { } impl SnoopGuard { - /// Start a `snoop` capture on `dev_name` with the provided BPF-like `filter`. + /// Start a `snoop` capture on `dev_name` with the provided packet `filter`. + /// Filter syntax matches snoop conventions (e.g., "udp and port 5353"). /// Captures a single packet (`-c 1`) and dumps hex output (`-x0`). /// Uses `-r` to disable name resolution for deterministic numeric output. pub fn start(dev_name: &str, filter: &str) -> anyhow::Result { let child = Command::new("pfexec") - .args(&[ + .args([ "snoop", "-r", "-d", dev_name, "-c", "1", "-P", "-x0", filter, ]) .stdout(Stdio::piped()) @@ -451,32 +562,56 @@ impl SnoopGuard { impl Drop for SnoopGuard { fn drop(&mut self) { - if let Some(child) = &mut self.child { - if let Ok(None) = child.try_wait() { - let _ = child.kill(); - let _ = child.wait(); - } + if let Some(child) = &mut self.child + && let Ok(None) = child.try_wait() + { + let _ = child.kill(); + let _ = child.wait(); + } + } +} + +/// Ensure the host has an IPv6 multicast route for admin-local scope +/// (ff04::/16) pointing to the provided interface. This helps the underlay +/// forwarding tests route multicast packets deterministically. +/// +/// Returns Ok even if the route already exists or if the command fails at +/// runtime; logs a warning on non-successful route add attempts. 
+pub fn ensure_underlay_admin_scoped_route_v6(interface: &str) -> Result<()> { + let out = std::process::Command::new("pfexec") + .args(["route", "add", "-inet6", "ff04::/16", "-iface", interface]) + .output()?; + + if !out.status.success() { + let stderr = String::from_utf8_lossy(&out.stderr); + // Treat "File exists" as benign; otherwise, just warn and continue. + if !stderr.to_lowercase().contains("file exists") { + eprintln!( + "Warning: failed to add IPv6 multicast route ff04::/16 on {interface}: {stderr}" + ); } } + Ok(()) } /// Global multicast group state that cleans up M2P mappings and forwarding /// entries on drop. Port-specific subscriptions are handled automatically by -/// OptePort::drop(). +/// [`OptePort::drop()`]. /// /// Use this to set up multicast groups in tests. Port subscriptions should use /// `port.subscribe_multicast(group)` which tracks cleanup automatically. +/// +/// All multicast groups use DEFAULT_MULTICAST_VNI (77) for fleet-wide multicast. pub struct MulticastGroup { pub group: IpAddr, pub underlay: Ipv6Addr, - pub vni: Vni, } impl MulticastGroup { - pub fn new(group: IpAddr, underlay: Ipv6Addr, vni: Vni) -> Result { + pub fn new(group: IpAddr, underlay: Ipv6Addr) -> Result { let hdl = OpteHdl::open()?; - hdl.set_m2p(&SetMcast2PhysReq { group, underlay, vni })?; - Ok(Self { group, underlay, vni }) + hdl.set_m2p(&SetMcast2PhysReq { group, underlay })?; + Ok(Self { group, underlay }) } /// Set multicast forwarding entries for this group. 
@@ -489,7 +624,7 @@ impl MulticastGroup { ) -> Result<()> { let hdl = OpteHdl::open()?; hdl.set_mcast_fwd(&SetMcastForwardingReq { - group: self.group, + underlay: self.underlay, next_hops, })?; Ok(()) @@ -504,18 +639,20 @@ impl Drop for MulticastGroup { }; // Clear forwarding entry - let group = self.group; - if let Err(e) = - hdl.clear_mcast_fwd(&ClearMcastForwardingReq { group: self.group }) - { - eprintln!("failed to clear multicast forwarding for {group}: {e}"); + let underlay = self.underlay; + if let Err(e) = hdl.clear_mcast_fwd(&ClearMcastForwardingReq { + underlay: self.underlay, + }) { + eprintln!( + "failed to clear multicast forwarding for {underlay}: {e}" + ); } // Clear M2P mapping + let group = self.group; if let Err(e) = hdl.clear_m2p(&ClearMcast2PhysReq { group: self.group, underlay: self.underlay, - vni: self.vni, }) { eprintln!("failed to clear M2P mapping for {group}: {e}"); } @@ -886,7 +1023,7 @@ pub fn get_linklocal_addr(link_name: &str) -> Result { let text = std::str::from_utf8(&out.stdout)?; if !out.status.success() || text.lines().count() == 1 { - bail!("could not find address {target_addr}"); + anyhow::bail!("could not find address {target_addr}"); } let mut maybe_addr = text @@ -919,7 +1056,7 @@ pub fn single_node_over_real_nic( let max_macs = (1 << 20) - peers.len() - 1; if null_port_count > max_macs as u32 { - bail!( + anyhow::bail!( "Cannot allocate {null_port_count} ports: \ Oxide MAC space admits {max_macs} accounting for peers" ); @@ -930,7 +1067,7 @@ pub fn single_node_over_real_nic( // This is an absurd preallocation (~6MiB?) -- but it is deterministic, // and if we want to test A Lot of ports then we can. 
let forbidden_macs: HashSet<_> = - (&[my_info]).iter().chain(peers).map(|v| v.mac).collect(); + [my_info].iter().chain(peers).map(|v| v.mac).collect(); let mut usable_macs: Vec = (0..(1 << 20)) .filter_map(|n: u32| { let raw = n.to_be_bytes(); @@ -958,7 +1095,7 @@ pub fn single_node_over_real_nic( // VIP reuse is not an issue, we aren't using these ports for communication. null_ports.push(OptePort::new( &format!("opte{}", null_ports.len()), - &"172.20.0.1", + "172.20.0.1", &taken_mac, &underlay_addr, )?); diff --git a/xde-tests/tests/multicast_multi_sub.rs b/xde-tests/tests/multicast_multi_sub.rs index 64f586d9..978b86c3 100644 --- a/xde-tests/tests/multicast_multi_sub.rs +++ b/xde-tests/tests/multicast_multi_sub.rs @@ -5,8 +5,19 @@ // Copyright 2025 Oxide Computer Company //! XDE multicast multiple subscriber tests. +//! +//! These validate TX fanout and forwarding semantics across replication modes: +//! - Same-sled delivery (DELIVER action) is based purely on subscriptions and +//! independent of Replication mode set for TX. +//! - External replication sends Geneve to the multicast underlay address for +//! delivery to the boundary switch, which then replicates to front-panel ports. +//! - Underlay replication sends Geneve to ff04::/16 multicast address for +//! sled-to-sled delivery; receiving sleds perform same-sled delivery based on +//! local subscriptions. +//! - "Both" replication instructs TX to set bifurcated replication flags +//! (External + Underlay) in the Geneve header for switch-side handling, while +//! same-sled delivery still occurs independently based on subscriptions. 
-use anyhow::Context; use anyhow::Result; use opte_ioctl::OpteHdl; use opte_test_utils::geneve_verify; @@ -40,18 +51,23 @@ fn test_multicast_multiple_local_subscribers() -> Result<()> { ]); // Set up multicast state with automatic cleanup on drop - let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay, vni)?; + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; - // Node B's underlay address for forwarding - let node_b_underlay = Ipv6Addr::from([ - 0xfd, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x01, - ]); - - // Set up multicast forwarding with External replication - // This will deliver to all local subscribers in the same VNI + // Use node B's underlay address as the switch unicast address for routing. + // + // Note: This is a single-sled test - all nodes share one underlay. + // In production, XDE would route toward this switch address to determine the + // underlay port/MAC, but the packet dst would be the multicast address. + // This test validates packet formatting, not actual multi-sled routing. + let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); + + // Set up TX forwarding with External replication mode. + // TX behavior: packet sent to underlay with Replication::External flag. + // In production, switch receives this flag and replicates to front-panel ports. + // RX behavior: same-sled delivery is controlled by subscriptions, independent + // of the Replication mode. 
mcast.set_forwarding(vec![( - NextHopV6::new(node_b_underlay, vni), + NextHopV6::new(fake_switch_addr, vni), Replication::External, )])?; @@ -59,9 +75,30 @@ fn test_multicast_multiple_local_subscribers() -> Result<()> { let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); for node in &topol.nodes { node.port.add_multicast_router_entry(mcast_cidr)?; - node.port.subscribe_multicast(mcast_group.into())?; + node.port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe should succeed"); } + // Assert subscription table reflects all three subscribers + let hdl = OpteHdl::open()?; + let subs = hdl.dump_mcast_subs()?; + let s_entry = subs + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("missing multicast subscription entry for underlay group"); + let p0 = topol.nodes[0].port.name().to_string(); + let p1 = topol.nodes[1].port.name().to_string(); + let p2 = topol.nodes[2].port.name().to_string(); + assert!( + s_entry.ports.contains(&p0) + && s_entry.ports.contains(&p1) + && s_entry.ports.contains(&p2), + "expected {p0}, {p1}, {p2} to be subscribed; got {:?}", + s_entry.ports + ); + // Start snoops on nodes B and C using SnoopGuard let dev_name_b = topol.nodes[1].port.name().to_string(); let dev_name_c = topol.nodes[2].port.name().to_string(); @@ -70,30 +107,25 @@ fn test_multicast_multiple_local_subscribers() -> Result<()> { let mut snoop_b = SnoopGuard::start(&dev_name_b, &filter)?; let mut snoop_c = SnoopGuard::start(&dev_name_c, &filter)?; - // Also snoop underlay to verify NO underlay forwarding with External mode + // Also snoop underlay to verify unicast Geneve TX to boundary let underlay_dev = "xde_test_sim1"; let mut snoop_underlay = SnoopGuard::start(underlay_dev, "ip6 and udp port 6081")?; // Send multicast packet from node A let payload = "fanout test"; - let send_cmd = - format!("echo '{payload}' | nc -u -w1 {mcast_group} {MCAST_PORT}"); - topol.nodes[0] - .zone - .zone - .zexec(&send_cmd) - .context("Failed to send 
multicast UDP packet")?; + let sender_v4 = topol.nodes[0].port.ip(); + topol.nodes[0].zone.send_udp_v4( + &sender_v4, + &mcast_group.to_string(), + MCAST_PORT, + payload, + )?; // Wait for both snoops to capture packets - let snoop_output_b = snoop_b - .wait_with_timeout(Duration::from_secs(5)) - .context("Timeout waiting for snoop on node B")?; - let snoop_output_c = snoop_c - .wait_with_timeout(Duration::from_secs(5)) - .context("Timeout waiting for snoop on node C")?; - - // Verify both nodes received the packet + let snoop_output_b = snoop_b.wait_with_timeout(Duration::from_secs(5))?; + let snoop_output_c = snoop_c.wait_with_timeout(Duration::from_secs(5))?; + let stdout_b = String::from_utf8_lossy(&snoop_output_b.stdout); assert!( snoop_output_b.status.success() && stdout_b.contains("UDP"), @@ -106,14 +138,42 @@ fn test_multicast_multiple_local_subscribers() -> Result<()> { "Expected to capture multicast UDP packet on node C, snoop output:\n{stdout_c}" ); - // Verify NO underlay forwarding (External mode = local-only) - if let Ok(output) = snoop_underlay.wait_with_timeout(Duration::from_secs(2)) - { - let stdout = String::from_utf8_lossy(&output.stdout); - panic!( - "External mode should NOT forward to underlay, but captured:\n{stdout}" - ); - } + // Verify underlay multicast forwarding (External mode) + // Parse the captured Geneve packet and assert: + // - VNI == DEFAULT_MULTICAST_VNI + // - Outer IPv6 dst == mcast_underlay (multicast group) + // - Replication == External + // Note: In production, the switch would see this External tag and replicate + // to front panel. This test verifies the Geneve header is correctly formed. 
+ let snoop_underlay_out = + snoop_underlay.wait_with_timeout(Duration::from_secs(5))?; + let stdout_underlay = String::from_utf8_lossy(&snoop_underlay_out.stdout); + assert!( + snoop_underlay_out.status.success() && stdout_underlay.contains("UDP"), + "Expected to capture Geneve packet on underlay for External replication, output:\n{stdout_underlay}" + ); + + let hex_str = geneve_verify::extract_snoop_hex(&stdout_underlay) + .expect("Failed to extract hex from snoop output"); + let packet_bytes = geneve_verify::parse_snoop_hex(&hex_str) + .expect("Failed to parse hex string"); + let geneve_info = geneve_verify::parse_geneve_packet(&packet_bytes) + .expect("Failed to parse Geneve packet"); + + assert_eq!( + geneve_info.vni, vni, + "Geneve VNI should be DEFAULT_MULTICAST_VNI ({})", + DEFAULT_MULTICAST_VNI + ); + assert_eq!( + geneve_info.outer_ipv6_dst, mcast_underlay, + "External replication should use multicast address (outer IPv6 dst)" + ); + assert_eq!( + geneve_info.replication, + Some(Replication::External), + "Geneve replication mode should be External" + ); Ok(()) } @@ -134,61 +194,42 @@ fn test_multicast_underlay_replication() -> Result<()> { 224, 1, 2, 4, ]); - let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay, vni)?; + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; - // Debug: dump V2P/M2P mappings to verify M2P is set correctly let hdl = OpteHdl::open()?; - let v2p_dump = hdl.dump_v2p()?; - println!("\n=== V2P/M2P Mappings ==="); - for vpc_map in &v2p_dump.mappings { - println!(" VNI {}: ", vpc_map.vni.as_u32()); - println!(" Unicast IPv4 mappings: {:?}", vpc_map.ip4); - println!(" Multicast IPv4 mappings: {:?}", vpc_map.mcast_ip4); - println!(" Multicast IPv6 mappings: {:?}", vpc_map.mcast_ip6); - } - println!("=== End V2P/M2P Mappings ===\n"); - // Node B's underlay address - let node_b_underlay = Ipv6Addr::from([ - 0xfd, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x01, - 
]); + // Use node B's underlay address as the switch unicast address for routing. + let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); - // Set up multicast forwarding with Underlay replication ONLY - // This should forward to underlay but NOT deliver to local ports + // Set up TX forwarding with Underlay replication mode. + // TX behavior: forward to underlay with multicast encapsulation. + // RX behavior: same-sled delivery to subscribers (none in this test). mcast.set_forwarding(vec![( - NextHopV6::new(node_b_underlay, vni), + NextHopV6::new(fake_switch_addr, vni), Replication::Underlay, )])?; // Allow IPv4 multicast traffic via Multicast target // - // Note: We deliberately do NOT subscribe any nodes to verify that Underlay mode - // forwards to underlay regardless of local subscription state (zero subscribers) + // Note: We deliberately do NOT subscribe any nodes. This tests TX forwarding + // with zero local subscribers (RX delivery is based on subscriptions, not + // Replication) let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); for node in &topol.nodes { node.port.add_multicast_router_entry(mcast_cidr)?; } + // Assert there are no local subscribers for this group + let subs = hdl.dump_mcast_subs()?; + assert!( + !subs.entries.iter().any(|e| e.underlay == mcast_underlay), + "expected no local subscribers for {mcast_underlay}, got: {:?}", + subs.entries + ); + // Add IPv6 multicast route for admin-scoped multicast (ff04::/16) // This tells the kernel to route multicast packets through the underlay interface - let route_add_result = std::process::Command::new("pfexec") - .args(&[ - "route", - "add", - "-inet6", - "ff04::/16", - "-interface", - "xde_test_vnic0", - ]) - .output() - .context("Failed to add IPv6 multicast route")?; - if !route_add_result.status.success() { - println!( - "Warning: Failed to add IPv6 multicast route: {}", - String::from_utf8_lossy(&route_add_result.stderr) - ); - } + 
xde_tests::ensure_underlay_admin_scoped_route_v6("xde_test_vnic0")?; // Start snoop on the UNDERLAY simnet device (not the OPTE port) // to verify the packet is forwarded to the underlay @@ -196,16 +237,6 @@ fn test_multicast_underlay_replication() -> Result<()> { let mut snoop_underlay = SnoopGuard::start(underlay_dev, "ip6 and udp port 6081")?; // Geneve port - // Debug: dump forwarding table to verify configuration - let mfwd = hdl.dump_mcast_fwd()?; - println!("\n=== Multicast forwarding table (Underlay test) ==="); - for entry in &mfwd.entries { - println!( - " Group: {:?}, Next hops: {:?}", - entry.group, entry.next_hops - ); - } - // Also snoop node B's OPTE port to verify NO local delivery with Underlay mode let dev_name_b = topol.nodes[1].port.name().to_string(); let filter_local = @@ -217,20 +248,18 @@ fn test_multicast_underlay_replication() -> Result<()> { // Send multicast packet from node A let payload = "underlay test"; - let send_cmd = - format!("echo '{payload}' | nc -u -w1 {mcast_group} {MCAST_PORT}"); - topol.nodes[0] - .zone - .zone - .zexec(&send_cmd) - .context("Failed to send multicast UDP packet")?; - - // Wait for snoop to capture the underlay packet - let snoop_output_underlay = snoop_underlay - .wait_with_timeout(Duration::from_secs(5)) - .context("Timeout waiting for snoop on underlay")?; - - // Verify packet was forwarded to underlay + let sender_v4 = topol.nodes[0].port.ip(); + topol.nodes[0].zone.send_udp_v4( + &sender_v4, + &mcast_group.to_string(), + MCAST_PORT, + payload, + )?; + + // Wait for snoop to capture the underlay packet (one send expected) + let snoop_output_underlay = + snoop_underlay.wait_with_timeout(Duration::from_secs(5))?; + let stdout_underlay = String::from_utf8_lossy(&snoop_output_underlay.stdout); @@ -265,11 +294,25 @@ fn test_multicast_underlay_replication() -> Result<()> { "Geneve replication mode should be Underlay" ); - // Verify NO local delivery (Underlay mode = remote-only) + // Verify NO same-sled 
delivery (no subscribers = no delivery) + // Note: RX delivery is independent of Replication mode - it's based on subscriptions if let Ok(output) = snoop_local.wait_with_timeout(Duration::from_secs(2)) { let stdout = String::from_utf8_lossy(&output.stdout); panic!( - "Underlay mode should NOT deliver locally, but captured:\n{stdout}" + "Expected no same-sled delivery (zero subscribers), but captured:\n{stdout}" + ); + } + + // Leaf-only RX assertion: start a second underlay snoop and ensure there + // is no additional multicast re-relay after RX. We expect only the single + // TX underlay packet captured above. + let mut snoop_underlay_2 = + SnoopGuard::start(underlay_dev, "ip6 and udp port 6081")?; + if let Ok(out) = snoop_underlay_2.wait_with_timeout(Duration::from_secs(2)) + { + let stdout = String::from_utf8_lossy(&out.stdout); + panic!( + "Expected leaf-only RX (no further underlay relay), got:\n{stdout}" ); } @@ -277,8 +320,9 @@ fn test_multicast_underlay_replication() -> Result<()> { } #[test] -fn test_multicast_all_replication() -> Result<()> { - // Create 3-node topology to test All replication mode (bifurcated delivery) +fn test_multicast_both_replication() -> Result<()> { + // Test "Both" replication mode: validates that egress TX (External + Underlay) + // and local same-sled delivery both occur. let topol = xde_tests::three_node_topology_named("omicron1", "ara", "arb", "arc")?; @@ -293,28 +337,50 @@ fn test_multicast_all_replication() -> Result<()> { 224, 1, 2, 5, ]); - let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay, vni)?; + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; - // Node B's underlay address for underlay forwarding - let node_b_underlay = Ipv6Addr::from([ - 0xfd, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x01, - ]); + // Use node B's underlay address as the switch unicast address for routing. 
+ let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); - // Set up multicast forwarding with All replication - // This should deliver BOTH to local subscribers AND forward to underlay + // Set up TX forwarding with "Both" replication (drives egress encapsulation only) + // TX behavior: packet sent to underlay with Replication::Both flag set. + // In production, switch receives this and bifurcates: External (to front panel) + // + Underlay (sled-to-sled multicast). + // RX behavior: same-sled local delivery occurs independently, driven purely by + // port subscriptions (not the replication mode). mcast.set_forwarding(vec![( - NextHopV6::new(node_b_underlay, vni), - Replication::All, + NextHopV6::new(fake_switch_addr, vni), + Replication::Both, )])?; // Allow IPv4 multicast traffic via Multicast target and subscribe to the group let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); for node in &topol.nodes { node.port.add_multicast_router_entry(mcast_cidr)?; - node.port.subscribe_multicast(mcast_group.into())?; + node.port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe should succeed"); } + // Assert subscription table reflects all three subscribers + let hdl = OpteHdl::open()?; + let subs = hdl.dump_mcast_subs()?; + let s_entry = subs + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("missing multicast subscription entry for underlay group"); + let p0 = topol.nodes[0].port.name().to_string(); + let p1 = topol.nodes[1].port.name().to_string(); + let p2 = topol.nodes[2].port.name().to_string(); + assert!( + s_entry.ports.contains(&p0) + && s_entry.ports.contains(&p1) + && s_entry.ports.contains(&p2), + "expected {p0}, {p1}, {p2} to be subscribed; got {:?}", + s_entry.ports + ); + // Start snoop on node B (local delivery) and underlay (underlay forwarding) let dev_name_b = topol.nodes[1].port.name().to_string(); let filter_local = @@ -327,37 +393,209 @@ fn test_multicast_all_replication() -> Result<()> { // 
Send multicast packet from node A let payload = "all replication test"; - let send_cmd = - format!("echo '{payload}' | nc -u -w1 {mcast_group} {MCAST_PORT}"); - topol.nodes[0] - .zone - .zone - .zexec(&send_cmd) - .context("Failed to send multicast UDP packet")?; + let sender_v4 = topol.nodes[0].port.ip(); + topol.nodes[0].zone.send_udp_v4( + &sender_v4, + &mcast_group.to_string(), + MCAST_PORT, + payload, + )?; // Wait for both snoops to capture packets - let snoop_output_local = snoop_local - .wait_with_timeout(Duration::from_secs(5)) - .context("Timeout waiting for local delivery snoop")?; - let snoop_output_underlay = snoop_underlay - .wait_with_timeout(Duration::from_secs(5)) - .context("Timeout waiting for underlay snoop")?; - - // Verify local delivery happened + let snoop_output_local = + snoop_local.wait_with_timeout(Duration::from_secs(5))?; + let snoop_output_underlay = + snoop_underlay.wait_with_timeout(Duration::from_secs(5))?; + + // Verify same-sled local delivery (DELIVER action based on subscription) let stdout_local = String::from_utf8_lossy(&snoop_output_local.stdout); assert!( snoop_output_local.status.success() && stdout_local.contains("UDP"), - "Expected local delivery to node B, snoop output:\n{stdout_local}" + "Expected same-sled delivery to subscribed node B, snoop output:\n{stdout_local}" ); - // Verify underlay forwarding happened + // Verify egress underlay forwarding with "Both" replication flag let stdout_underlay = String::from_utf8_lossy(&snoop_output_underlay.stdout); assert!( snoop_output_underlay.status.success() && stdout_underlay.contains("UDP"), - "Expected underlay forwarding, snoop output:\n{stdout_underlay}" + "Expected egress underlay packet with 'Both' replication, snoop output:\n{stdout_underlay}" + ); + + // Parse the Geneve packet and verify the "Both" replication flag is set + let hex_str = geneve_verify::extract_snoop_hex(&stdout_underlay) + .expect("Failed to extract hex from snoop output"); + let packet_bytes = 
geneve_verify::parse_snoop_hex(&hex_str) + .expect("Failed to parse hex string"); + let geneve_info = geneve_verify::parse_geneve_packet(&packet_bytes) + .expect("Failed to parse Geneve packet"); + + assert_eq!( + geneve_info.vni, vni, + "Geneve VNI should be DEFAULT_MULTICAST_VNI ({})", + DEFAULT_MULTICAST_VNI + ); + assert_eq!( + geneve_info.outer_ipv6_dst, mcast_underlay, + "Outer IPv6 dst should be multicast underlay address" + ); + assert_eq!( + geneve_info.replication, + Some(Replication::Both), + "Geneve replication mode should be Both" + ); + + Ok(()) +} + +#[test] +fn test_partial_unsubscribe() -> Result<()> { + // Test selective unsubscribe: subscribe 3 nodes, unsubscribe 1, verify + // only the remaining 2 receive packets while forwarding state is unchanged. + let topol = + xde_tests::three_node_topology_named("omicron1", "pua", "pub", "puc")?; + + let mcast_group = Ipv4Addr::from([224, 1, 2, 6]); + const MCAST_PORT: u16 = 9999; + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + + let mcast_underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 6, + ]); + + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; + + // Use node B's underlay address as the switch unicast address for routing. 
+ let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); + + mcast.set_forwarding(vec![( + NextHopV6::new(fake_switch_addr, vni), + Replication::External, + )])?; + + let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + node.port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe should succeed"); + } + + let hdl = OpteHdl::open()?; + let p0 = topol.nodes[0].port.name().to_string(); + let p1 = topol.nodes[1].port.name().to_string(); + let p2 = topol.nodes[2].port.name().to_string(); + + let subs = hdl.dump_mcast_subs()?; + let s_entry = subs + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("missing multicast subscription entry"); + assert!( + s_entry.ports.contains(&p0) + && s_entry.ports.contains(&p1) + && s_entry.ports.contains(&p2), + "expected all 3 ports subscribed initially; got {:?}", + s_entry.ports + ); + + // Send packet and verify B and C receive (A is sender, won't receive its own) + let dev_name_b = topol.nodes[1].port.name().to_string(); + let dev_name_c = topol.nodes[2].port.name().to_string(); + let filter = format!("udp and ip dst {mcast_group} and port {MCAST_PORT}"); + + let mut snoop_b = SnoopGuard::start(&dev_name_b, &filter)?; + let mut snoop_c = SnoopGuard::start(&dev_name_c, &filter)?; + + let payload = "all three"; + let sender_v4 = topol.nodes[0].port.ip(); + topol.nodes[0].zone.send_udp_v4( + &sender_v4, + &mcast_group.to_string(), + MCAST_PORT, + payload, + )?; + + // B and C should receive (A is sender, won't see its own packet) + let snoop_b_out = snoop_b.wait_with_timeout(Duration::from_secs(5))?; + let snoop_c_out = snoop_c.wait_with_timeout(Duration::from_secs(5))?; + + assert!( + String::from_utf8_lossy(&snoop_b_out.stdout).contains("UDP"), + "Node B should receive first packet" ); + assert!( + String::from_utf8_lossy(&snoop_c_out.stdout).contains("UDP"), + "Node C should receive first packet" 
+ ); + + // Unsubscribe node B (middle node) + topol.nodes[1] + .port + .unsubscribe_multicast(mcast_group.into()) + .expect("unsubscribe should succeed"); + + // Verify subscription table now shows only A and C + let subs2 = hdl.dump_mcast_subs()?; + let s_entry2 = subs2 + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("subscription entry should still exist"); + assert!( + s_entry2.ports.contains(&p0) && s_entry2.ports.contains(&p2), + "expected p0 and p2 to remain subscribed; got {:?}", + s_entry2.ports + ); + assert!( + !s_entry2.ports.contains(&p1), + "expected p1 to be unsubscribed; got {:?}", + s_entry2.ports + ); + + // Verify forwarding table unchanged (forwarding is independent of local subs) + let fwd = hdl.dump_mcast_fwd()?; + let fwd_entry = fwd + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("forwarding entry should still exist"); + assert!( + fwd_entry.next_hops.iter().any(|(nexthop, rep)| { + *rep == Replication::External + && nexthop.addr == fake_switch_addr + && nexthop.vni == vni + }), + "forwarding table should be unchanged" + ); + + // Send another packet - only C should receive (A is sender, B unsubscribed) + let mut snoop_b2 = SnoopGuard::start(&dev_name_b, &filter)?; + let mut snoop_c2 = SnoopGuard::start(&dev_name_c, &filter)?; + + let payload2 = "only two"; + topol.nodes[0].zone.send_udp_v4( + &sender_v4, + &mcast_group.to_string(), + MCAST_PORT, + payload2, + )?; + + // C should receive + let snoop_c2_out = snoop_c2.wait_with_timeout(Duration::from_secs(5))?; + assert!( + String::from_utf8_lossy(&snoop_c2_out.stdout).contains("UDP"), + "Node C should receive second packet" + ); + + // B should NOT receive (timeout expected) + if let Ok(out) = snoop_b2.wait_with_timeout(Duration::from_millis(800)) { + let stdout = String::from_utf8_lossy(&out.stdout); + panic!("Node B should not receive after unsubscribe; got:\n{stdout}"); + } Ok(()) } diff --git a/xde-tests/tests/multicast_rx.rs 
b/xde-tests/tests/multicast_rx.rs index f29d1697..69fa4c84 100644 --- a/xde-tests/tests/multicast_rx.rs +++ b/xde-tests/tests/multicast_rx.rs @@ -5,11 +5,18 @@ // Copyright 2025 Oxide Computer Company //! XDE multicast RX-path tests. +//! +//! These validate that: +//! - Control-plane config (M2P map + forwarding) drives TX encapsulation only. +//! - Same-sled delivery is based purely on subscriptions and is independent of +//! the Replication mode set for TX. +//! - Underlay multicast uses admin-local IPv6 (ff04::/16) and routes via the +//! host underlay interface. +//! - Packets received from the underlay are delivered to subscribed ports and +//! include the expected protocol and payload characteristics. -use anyhow::Context; use anyhow::Result; use opte_ioctl::OpteHdl; -use oxide_vpc::api::Direction; use oxide_vpc::api::IpCidr; use oxide_vpc::api::Ipv4Addr; use oxide_vpc::api::Ipv6Addr; @@ -33,95 +40,99 @@ fn test_xde_multicast_rx_ipv4() -> Result<()> { // M2P mapping: overlay layer needs IPv6 multicast underlay address // Use admin-scoped IPv6 multicast per Omicron's map_external_to_underlay_ip() // Maps IPv4 multicast to ff04::/16 (admin-local scope) + IPv4 address - let mcast_underlay = Ipv6Addr::from([ - 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 224, 0, 0, 251, - ]); - - // Node B's underlay address - this is where we'll forward multicast packets - // From two_node_topology: node B (10.0.0.2) has underlay fd77::1 - let node_b_underlay = Ipv6Addr::from([ - 0xfd, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x01, - ]); + let mcast_underlay: Ipv6Addr = "ff04::e000:fb".parse().unwrap(); // Set up multicast group with automatic cleanup on drop - let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay, vni)?; + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; - // Set up multicast forwarding with External replication for unicast delivery. 
- // Maps overlay IPv4 multicast group -> underlay IPv6 unicast address of node B + // Use node B's underlay address as the switch unicast address for routing. + // OPTE uses this address to determine the underlay port (via DDM routing), + // but the actual packet destination is the multicast underlay address. + // Note: This is a single-sled test; all nodes share one underlay network. + let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); + + // Set up TX forwarding with Underlay replication to test underlay RX path. + // This causes packets to be sent to the underlay multicast address, then + // received back via the underlay RX path for same-sled delivery. mcast.set_forwarding(vec![( - NextHopV6::new(node_b_underlay, vni), - Replication::External, + NextHopV6::new(fake_switch_addr, vni), + Replication::Underlay, )])?; + // Add IPv6 multicast route so underlay packets can be routed + xde_tests::ensure_underlay_admin_scoped_route_v6("xde_test_vnic0")?; + // Allow IPv4 multicast traffic (224.0.0.0/4) via Multicast target. 
let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); - // Allow outbound multicast traffic through the gateway layer - topol.nodes[0].port.allow_cidr(mcast_cidr, Direction::Out)?; - topol.nodes[1].port.allow_cidr(mcast_cidr, Direction::Out)?; - - // Add router entries for multicast + // Add router entries for multicast (allows both In and Out directions) topol.nodes[0].port.add_multicast_router_entry(mcast_cidr)?; topol.nodes[1].port.add_multicast_router_entry(mcast_cidr)?; // Subscribe both ports to the multicast group - topol.nodes[0].port.subscribe_multicast(mcast_group.into())?; - topol.nodes[1].port.subscribe_multicast(mcast_group.into())?; + topol.nodes[0] + .port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe port 0 should succeed"); + topol.nodes[1] + .port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe port 1 should succeed"); - // Debug: dump multicast forwarding table - println!("\n=== Multicast forwarding table ==="); + // Assert subscription state via ioctl dump before sending let hdl = OpteHdl::open()?; - let mfwd = hdl.dump_mcast_fwd()?; - for entry in &mfwd.entries { - println!( - " Group: {:?}, Next hops: {:?}", - entry.group, entry.next_hops - ); - } + let subs = hdl.dump_mcast_subs()?; + let s_entry = subs + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("missing multicast subscription entry for underlay group"); + let p0 = topol.nodes[0].port.name().to_string(); + let p1 = topol.nodes[1].port.name().to_string(); + assert!( + s_entry.ports.contains(&p0) && s_entry.ports.contains(&p1), + "expected both {p0} and {p1} to be subscribed; got {:?}", + s_entry.ports + ); + // Assert forwarding table contains expected next-hop + replication + let mfwd = hdl.dump_mcast_fwd()?; let entry = mfwd .entries .iter() - .find(|e| e.group == mcast_group.into()) - .expect("missing multicast forwarding entry for group"); + .find(|e| e.underlay == mcast_underlay) + .expect("missing multicast 
forwarding entry for underlay group"); assert!( - entry.next_hops.iter().any(|(nh, rep)| { - *rep == Replication::External - && nh.addr == node_b_underlay - && nh.vni == vni + entry.next_hops.iter().any(|(nexthop, rep)| { + *rep == Replication::Underlay + && nexthop.addr == fake_switch_addr + && nexthop.vni == vni }), - "expected External replication to {node_b_underlay:?} in forwarding table; got: {:?}", + "expected Underlay replication to {fake_switch_addr:?} in forwarding table; got: {:?}", entry.next_hops ); - // Start snoop using SnoopGuard to ensure cleanup + // Start snoop on RX side (matches IPv6 test pattern) let dev_name_b = topol.nodes[1].port.name().to_string(); let filter = format!("udp and ip dst {mcast_group} and port {MCAST_PORT}"); - let mut snoop = SnoopGuard::start(&dev_name_b, &filter)?; + let mut snoop_rx = SnoopGuard::start(&dev_name_b, &filter)?; - // Send UDP packet to the multicast address from zone A using netcat - // nc -u: IPv4 UDP mode - // -w1: timeout after 1 second + // Send UDP packet from zone A using helper (pins source for deterministic egress) let payload = "multicast test"; - let send_cmd = - format!("echo '{payload}' | nc -u -w1 {mcast_group} {MCAST_PORT}"); - topol.nodes[0] - .zone - .zone - .zexec(&send_cmd) - .context("Failed to send multicast UDP packet")?; + let sender_v4 = topol.nodes[0].port.ip(); + topol.nodes[0].zone.send_udp_v4( + &sender_v4, + &mcast_group.to_string(), + MCAST_PORT, + payload, + )?; - // Wait for snoop to capture the packet (or timeout) - let snoop_output = snoop - .wait_with_timeout(Duration::from_secs(5)) - .context("Timeout waiting for snoop to capture multicast packet")?; + // Wait for RX snoop to capture the packet (or timeout) + let snoop_rx_output = snoop_rx.wait_with_timeout(Duration::from_secs(5))?; - // Check that snoop successfully captured a packet and validate basics - let stdout = String::from_utf8_lossy(&snoop_output.stdout); + let stdout = 
String::from_utf8_lossy(&snoop_rx_output.stdout); assert!( - snoop_output.status.success() && !stdout.is_empty(), + snoop_rx_output.status.success() && !stdout.is_empty(), "Expected to capture multicast packet on {dev_name_b}, snoop output:\n{stdout}" ); // Protocol summary present @@ -136,10 +147,9 @@ fn test_xde_multicast_rx_ipv4() -> Result<()> { "expected destination 224.0.0.251 in snoop output:\n{stdout}" ); // Payload present - check for substring in ASCII representation - // The full payload may wrap across lines, so just check for a distinctive part assert!( - stdout.contains("ast test"), - "expected payload substring 'ast test' in ASCII portion of snoop output:\n{stdout}" + stdout.contains("test"), + "expected payload substring 'test' in ASCII portion of snoop output:\n{stdout}" ); // L2 dest: with current XDE/gateway pipeline, multicast RX to guests // is delivered with broadcast dest MAC. snoop shows 16-bit grouped hex. @@ -148,26 +158,37 @@ fn test_xde_multicast_rx_ipv4() -> Result<()> { "expected L2 broadcast MAC 'ffff ffff ffff' in snoop output; got:\n{stdout}" ); - // Unsubscribe receiver and verify no further local delivery - topol.nodes[1].port.unsubscribe_multicast(mcast_group.into())?; + // Unsubscribe receiver and verify no further same-sled delivery + topol.nodes[1] + .port + .unsubscribe_multicast(mcast_group.into()) + .expect("unsubscribe should succeed"); + + // Assert subscription table reflects unsubscribe + let subs2 = hdl.dump_mcast_subs()?; + let s_entry2 = subs2 + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("missing multicast subscription entry after unsubscribe"); + assert!( + !s_entry2.ports.contains(&p1), + "expected {p1} to be unsubscribed; got {:?}", + s_entry2.ports + ); let mut snoop2 = SnoopGuard::start(&dev_name_b, &filter)?; - let send_cmd2 = - format!("echo '{payload}' | nc -u -w1 {mcast_group} {MCAST_PORT}"); - topol.nodes[0] - .zone - .zone - .zexec(&send_cmd2) - .context("Failed to send 
multicast UDP packet (post-unsubscribe)")?; - let res = snoop2.wait_with_timeout(Duration::from_millis(800)); - match res { - Ok(out) => { - let stdout = String::from_utf8_lossy(&out.stdout); - panic!( - "expected no local delivery after unsubscribe; snoop output:\n{stdout}" - ); - } - Err(_) => {} + topol.nodes[0].zone.send_udp_v4( + &sender_v4, + &mcast_group.to_string(), + MCAST_PORT, + payload, + )?; + if let Ok(out) = snoop2.wait_with_timeout(Duration::from_millis(800)) { + let stdout = String::from_utf8_lossy(&out.stdout); + panic!( + "expected no same-sled delivery after unsubscribe; snoop output:\n{stdout}" + ); } Ok(()) } @@ -179,46 +200,50 @@ fn test_xde_multicast_rx_ipv6() -> Result<()> { "omicron1", "rx6a", "rx6b", )?; - // IPv6 multicast group: ff05::1:3 (site-local, all-dhcp-agents) - let mcast_group = Ipv6Addr::from([ - 0xff, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x01, 0x00, 0x03, - ]); + // IPv6 multicast group: ff04::1:3 (admin-local scope) + let mcast_group: Ipv6Addr = "ff04::1:3".parse().unwrap(); const MCAST_PORT: u16 = 9999; let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; - // M2P mapping: Map IPv6 multicast to admin-scoped underlay (ff04::/16) - // Per Omicron's map_external_to_underlay_ip(), convert ff05 -> ff04 - let mcast_underlay = Ipv6Addr::from([ - 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x01, 0x00, 0x03, - ]); - - // Node B's underlay address - let node_b_underlay = Ipv6Addr::from([ - 0xfd, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x01, - ]); + // M2P mapping: Use same admin-local address for underlay + let mcast_underlay: Ipv6Addr = "ff04::1:3".parse().unwrap(); // Set up multicast group with automatic cleanup on drop - let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay, vni)?; + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; + + // Use node B's underlay address as 
the switch unicast address for routing. + // OPTE uses this address to determine the underlay port (via DDM routing), + // but the actual packet destination is the multicast underlay address. + // Note: This is a single-sled test; all nodes share one underlay network. + let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); - // Set up multicast forwarding with External replication for local delivery + // Set up TX forwarding with Underlay replication to test underlay RX path. + // This causes packets to be sent to the underlay multicast address, then + // received back via the underlay RX path for same-sled delivery. mcast.set_forwarding(vec![( - NextHopV6::new(node_b_underlay, vni), - Replication::External, + NextHopV6::new(fake_switch_addr, vni), + Replication::Underlay, )])?; - // Allow IPv6 multicast traffic (ff05::/16 site-local) via Multicast target - let mcast_cidr = IpCidr::Ip6("ff05::/16".parse().unwrap()); + // Add IPv6 multicast route so underlay packets can be routed + xde_tests::ensure_underlay_admin_scoped_route_v6("xde_test_vnic0")?; + + // Allow IPv6 multicast traffic (ff04::/16 admin-local) via Multicast target + let mcast_cidr = IpCidr::Ip6("ff04::/16".parse().unwrap()); - // Add router entries for multicast + // Add router entries for multicast (allows both In and Out directions) topol.nodes[0].port.add_multicast_router_entry(mcast_cidr)?; topol.nodes[1].port.add_multicast_router_entry(mcast_cidr)?; // Subscribe both ports to the multicast group - topol.nodes[0].port.subscribe_multicast(mcast_group.into())?; - topol.nodes[1].port.subscribe_multicast(mcast_group.into())?; + topol.nodes[0] + .port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe port 0 should succeed"); + topol.nodes[1] + .port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe port 1 should succeed"); // Get the device names for snoop let dev_name_b = topol.nodes[1].port.name().to_string(); @@ -235,23 +260,16 @@ fn 
test_xde_multicast_rx_ipv6() -> Result<()> { .port .ipv6() .expect("dualstack port must have IPv6 address"); - // illumos netcat selects IPv6 based on the destination; avoid `-6` for compatibility. - let send_cmd = format!( - "echo '{payload}' | nc -u -s {sender_v6} -w1 {mcast_group} {MCAST_PORT}" - ); - topol.nodes[0] - .zone - .zone - .zexec(&send_cmd) - .context("Failed to send IPv6 multicast UDP packet")?; + topol.nodes[0].zone.send_udp_v6( + &sender_v6, + &mcast_group.to_string(), + MCAST_PORT, + payload, + )?; // Wait for snoop to capture the packet (or timeout) - let snoop_output = - snoop.wait_with_timeout(Duration::from_secs(5)).context( - "Timeout waiting for snoop to capture IPv6 multicast packet", - )?; + let snoop_output = snoop.wait_with_timeout(Duration::from_secs(5))?; - // Check that snoop successfully captured a packet let stdout = String::from_utf8_lossy(&snoop_output.stdout); assert!( snoop_output.status.success() && !stdout.is_empty(), @@ -265,16 +283,11 @@ fn test_xde_multicast_rx_ipv6() -> Result<()> { fn test_reject_link_local_underlay_ff02() -> Result<()> { let hdl = OpteHdl::open()?; let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); - let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; - let link_local_underlay = Ipv6Addr::from([ - 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 224, 1, 2, 99, - ]); + let link_local_underlay: Ipv6Addr = "ff02::e001:263".parse().unwrap(); let result = hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { group: mcast_group.into(), underlay: link_local_underlay, - vni, }); assert!( result.is_err(), @@ -288,16 +301,11 @@ fn test_reject_link_local_underlay_ff02() -> Result<()> { fn test_reject_global_underlay_ff0e() -> Result<()> { let hdl = OpteHdl::open()?; let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); - let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; - let global_underlay = Ipv6Addr::from([ - 0xff, 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, - 224, 1, 2, 99, - ]); + let global_underlay: Ipv6Addr = "ff0e::e001:263".parse().unwrap(); let result = hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { group: mcast_group.into(), underlay: global_underlay, - vni, }); assert!( result.is_err(), @@ -309,192 +317,58 @@ fn test_reject_global_underlay_ff0e() -> Result<()> { #[test] fn test_accept_admin_local_underlay_ff04() -> Result<()> { - let hdl = OpteHdl::open()?; let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); - let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + let admin_local: Ipv6Addr = "ff04::e001:263".parse().unwrap(); - let admin_local = Ipv6Addr::from([ - 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 224, 1, 2, 99, - ]); - let result = hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { - group: mcast_group.into(), - underlay: admin_local, - vni, - }); + // MulticastGroup::new calls set_m2p internally and cleans up on drop. + // This test verifies that admin-local (ff04::/16) addresses are accepted, + // in contrast to link-local (ff02::) and global (ff0e::) which are rejected. 
+ let result = MulticastGroup::new(mcast_group.into(), admin_local); assert!( result.is_ok(), - "Expected admin-local underlay (ff04::) to be accepted" + "Expected admin-local (ff04::) underlay to be accepted" ); Ok(()) } #[test] -fn test_accept_site_local_underlay_ff05() -> Result<()> { - let hdl = OpteHdl::open()?; - let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); - let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; - - let site_local = Ipv6Addr::from([ - 0xff, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 224, 1, 2, 99, - ]); - let result = hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { - group: mcast_group.into(), - underlay: site_local, - vni, - }); - assert!( - result.is_ok(), - "Expected site-local underlay (ff05::) to be accepted" - ); - - Ok(()) -} - -#[test] -fn test_accept_org_local_underlay_ff08() -> Result<()> { - let hdl = OpteHdl::open()?; - let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); - let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; - - let org_local = Ipv6Addr::from([ - 0xff, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 224, 1, 2, 99, - ]); - let result = hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { - group: mcast_group.into(), - underlay: org_local, - vni, - }); - assert!( - result.is_ok(), - "Expected org-local underlay (ff08::) to be accepted" - ); - - Ok(()) -} - -#[test] -fn test_reject_wrong_vni() -> Result<()> { - let hdl = OpteHdl::open()?; - let mcast_group = Ipv4Addr::from([224, 1, 2, 100]); - let underlay = Ipv6Addr::from([ - 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 224, 1, 2, 100, - ]); - - let wrong_vni = Vni::new(1701u32)?; - let result = hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { - group: mcast_group.into(), - underlay, - vni: wrong_vni, - }); - assert!( - result.is_err(), - "Expected VNI 1701 to be rejected (must use DEFAULT_MULTICAST_VNI), got: {:?}", - result - ); - - Ok(()) -} - -#[test] -fn 
test_accept_default_multicast_vni() -> Result<()> { - let hdl = OpteHdl::open()?; - let mcast_group = Ipv4Addr::from([224, 1, 2, 100]); - let underlay = Ipv6Addr::from([ - 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 224, 1, 2, 100, - ]); - - let correct_vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; - let result = hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { - group: mcast_group.into(), - underlay, - vni: correct_vni, - }); - assert!( - result.is_ok(), - "Expected DEFAULT_MULTICAST_VNI (77) to be accepted" - ); - - Ok(()) -} - -#[test] -fn test_multicast_rx_no_relay_loop() -> Result<()> { - // Test RX loop-prevention: packets arriving from underlay with - // Replication::Underlay should NOT be re-relayed back to underlay. - // This prevents infinite relay loops. +fn test_multicast_config_no_spurious_traffic() -> Result<()> { + // Test that multicast configuration (subscriptions + forwarding entries) + // doesn't spontaneously generate traffic on the underlay when no packets + // are actually being sent. let topol = xde_tests::two_node_topology_named("omicron1", "lpa", "lpb")?; let mcast_group = Ipv4Addr::from([224, 1, 2, 200]); let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; - let mcast_underlay = Ipv6Addr::from([ - 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 224, 1, 2, 200, - ]); + let mcast_underlay: Ipv6Addr = "ff04::e001:2c8".parse().unwrap(); - let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay, vni)?; + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; - let node_b_underlay = Ipv6Addr::from([ - 0xfd, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x01, - ]); + // Use node B's underlay address as the switch unicast address for routing. 
+ let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); // Set up forwarding with Underlay replication mcast.set_forwarding(vec![( - NextHopV6::new(node_b_underlay, vni), + NextHopV6::new(fake_switch_addr, vni), Replication::Underlay, )])?; let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); for node in &topol.nodes { node.port.add_multicast_router_entry(mcast_cidr)?; - node.port.subscribe_multicast(mcast_group.into())?; + node.port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe should succeed"); } - // Snoop the underlay to verify NO re-relay happens + // Snoop the underlay to verify NO spurious traffic without sending let underlay_dev = "xde_test_sim1"; let mut snoop_underlay = SnoopGuard::start(underlay_dev, "ip6 and udp port 6081")?; - // Simulate receiving a multicast packet FROM the underlay - // with Replication::Underlay already set (indicating it came from another host). - // Build a Geneve packet with the Underlay replication bit set. - let hdl = OpteHdl::open()?; - - // We need to inject a packet on the underlay that looks like it came from - // another host. Unfortunately, we can't easily inject raw packets in the test - // environment without significant plumbing. Instead, we verify the logic - // indirectly by checking that the dtrace probe shows the right behavior. - - // For now, document the expected behavior and add a TODO for full integration - // test once we have packet injection capability. 
- println!("\n=== RX Loop Prevention Test ==="); - println!("Expected behavior: Packets arriving from underlay with"); - println!("Replication::Underlay should NOT be re-relayed."); - println!("\nThis requires packet injection capability to fully test."); - println!( - "Current implementation checks incoming delivery mode in Geneve options" - ); - println!("and only relays if delivery_mode is Underlay or All."); - - // Verify the multicast forwarding table is set up correctly - let mfwd = hdl.dump_mcast_fwd()?; - println!("\n=== Multicast forwarding table ==="); - for entry in &mfwd.entries { - println!( - " Group: {:?}, Next hops: {:?}", - entry.group, entry.next_hops - ); - } - - // Since we can't inject packets easily, verify NO spurious underlay traffic - // by waiting to ensure nothing appears on underlay without us sending anything + // Verify NO spurious underlay traffic (we're not sending any packets) let snoop_result = snoop_underlay.wait_with_timeout(Duration::from_secs(2)); match snoop_result { diff --git a/xde-tests/tests/multicast_validation.rs b/xde-tests/tests/multicast_validation.rs index 5d472281..ae346f12 100644 --- a/xde-tests/tests/multicast_validation.rs +++ b/xde-tests/tests/multicast_validation.rs @@ -5,8 +5,14 @@ // Copyright 2025 Oxide Computer Company //! Validation tests covering multicast operations. +//! +//! These cover control‑plane validation and idempotence: +//! - Subscribing requires an M2P map unless the group is already a ff04::/16 +//! underlay address. +//! - Subscribing with non‑multicast addresses is rejected. +//! - Double subscribe is idempotent and does not duplicate delivery. +//! - Unsubscribe is idempotent and safe when not previously subscribed. 
-use anyhow::Context; use anyhow::Result; use opte_ioctl::OpteHdl; use oxide_vpc::api::ClearMcast2PhysReq; @@ -22,20 +28,70 @@ use std::time::Duration; use xde_tests::MulticastGroup; use xde_tests::SnoopGuard; +#[test] +fn test_subscribe_without_m2p_mapping() -> Result<()> { + let topol = + xde_tests::two_node_topology_named("omicron1", "nm2pa", "nm2pb")?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); + + let res = topol.nodes[0].port.subscribe_multicast(mcast_group.into()); + + assert!( + res.is_err(), + "Expected error when subscribing without M2P mapping, got Ok" + ); + + Ok(()) +} + +#[test] +fn test_subscribe_ff04_direct_without_m2p() -> Result<()> { + let topol = + xde_tests::two_node_topology_named("omicron1", "ff04a", "ff04b")?; + + // IPv6 admin-scoped multicast (ff04::/16) - already an underlay address + let underlay_mcast = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 99, + ]); + + let res = topol.nodes[0].port.subscribe_multicast(underlay_mcast.into()); + + assert!( + res.is_ok(), + "Expected ff04::/16 subscription to succeed without M2P, got error: {res:?}" + ); + + // Assert subscription present + let hdl = OpteHdl::open()?; + let subs = hdl.dump_mcast_subs()?; + let entry = subs + .entries + .iter() + .find(|e| e.underlay == underlay_mcast) + .expect("missing multicast subscription entry for ff04 group"); + let p0 = topol.nodes[0].port.name().to_string(); + assert!( + entry.ports.contains(&p0), + "expected {p0} to be subscribed; got {:?}", + entry.ports + ); + + Ok(()) +} + #[test] fn test_subscribe_nonexistent_port() -> Result<()> { let hdl = OpteHdl::open()?; let mcast_group = Ipv4Addr::from([224, 1, 2, 100]); - // Try to subscribe non-existent port - let result = hdl.mcast_subscribe(&McastSubscribeReq { + let res = hdl.mcast_subscribe(&McastSubscribeReq { port_name: "this_port_does_not_exist_anywhere".to_string(), group: mcast_group.into(), }); - // Should return error, not panic or 
succeed assert!( - result.is_err(), + res.is_err(), "Expected error when subscribing non-existent port, got Ok" ); @@ -47,25 +103,17 @@ fn test_subscribe_unicast_ip_as_group() -> Result<()> { let topol = xde_tests::two_node_topology_named("omicron1", "unia", "unib")?; let hdl = OpteHdl::open()?; - // Try to subscribe to unicast IP (not multicast) - should be rejected let unicast_ip = Ipv4Addr::from([10, 0, 0, 1]); - let result = hdl.mcast_subscribe(&McastSubscribeReq { + let res = hdl.mcast_subscribe(&McastSubscribeReq { port_name: topol.nodes[0].port.name().to_string(), group: unicast_ip.into(), }); - // Should reject non-multicast addresses - match result { - Ok(_) => { - panic!("Expected error when subscribing to unicast IP, got Ok") - } - Err(e) => { - assert!( - format!("{e:?}").contains("not a multicast address"), - "Expected 'not a multicast address' error, got: {e:?}", - ); - } - } + let err = res.expect_err("Expected error when subscribing to unicast IP"); + assert!( + format!("{err:?}").contains("not a multicast address"), + "Expected 'not a multicast address' error, got: {err:?}", + ); Ok(()) } @@ -82,15 +130,13 @@ fn test_double_subscribe() -> Result<()> { 224, 1, 2, 101, ]); - let mcast = MulticastGroup::new(mcast_group.into(), underlay, vni)?; + let mcast = MulticastGroup::new(mcast_group.into(), underlay)?; - let node_b_underlay = Ipv6Addr::from([ - 0xfd, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x01, - ]); + // Use node B's underlay address as the switch unicast address for routing. 
+ let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); mcast.set_forwarding(vec![( - NextHopV6::new(node_b_underlay, vni), + NextHopV6::new(fake_switch_addr, vni), Replication::External, )])?; @@ -99,40 +145,51 @@ fn test_double_subscribe() -> Result<()> { node.port.add_multicast_router_entry(mcast_cidr)?; } - // Subscribe once - topol.nodes[1].port.subscribe_multicast(mcast_group.into())?; + topol.nodes[1] + .port + .subscribe_multicast(mcast_group.into()) + .expect("first subscribe should succeed"); + + let res = topol.nodes[1].port.subscribe_multicast(mcast_group.into()); - // Subscribe again (should be idempotent) - let result = topol.nodes[1].port.subscribe_multicast(mcast_group.into()); + assert!( + res.is_ok(), + "Double subscribe should be idempotent, got error: {res:?}" + ); - // Should succeed (idempotent operation) + let subs = OpteHdl::open()?.dump_mcast_subs()?; + let entry = subs + .entries + .iter() + .find(|e| e.underlay == underlay) + .expect("missing multicast subscription entry for group"); + let p1 = topol.nodes[1].port.name().to_string(); assert!( - result.is_ok(), - "Double subscribe should be idempotent, got error: {:?}", - result + entry.ports.contains(&p1), + "expected {p1} to be subscribed; got {:?}", + entry.ports ); - // Verify delivery works and packet is NOT duplicated let filter = format!("udp and ip dst {mcast_group} and port {MCAST_PORT}"); let mut snoop = SnoopGuard::start(topol.nodes[1].port.name(), &filter)?; - topol.nodes[0].zone.zone.zexec(&format!( - "echo 'test' | nc -u -w1 {mcast_group} {MCAST_PORT}" - ))?; + let sender_v4 = topol.nodes[0].port.ip(); + topol.nodes[0].zone.send_udp_v4( + &sender_v4, + &mcast_group.to_string(), + MCAST_PORT, + "test", + )?; - let output = snoop - .wait_with_timeout(Duration::from_secs(5)) - .context("Timeout waiting for multicast delivery")?; + let output = snoop.wait_with_timeout(Duration::from_secs(5))?; let stdout = String::from_utf8_lossy(&output.stdout); - // Verify packet 
received assert!( output.status.success() && stdout.contains("UDP"), "Should receive multicast after double subscribe:\n{stdout}" ); - // Count occurrences - should be 1, not 2 (no duplication) let count = stdout.matches("UDP").count(); assert!( count == 1, @@ -148,17 +205,12 @@ fn test_unsubscribe_never_subscribed() -> Result<()> { let hdl = OpteHdl::open()?; let mcast_group = Ipv4Addr::from([224, 1, 2, 102]); - // Try to unsubscribe without ever subscribing - let result = hdl.mcast_unsubscribe(&McastUnsubscribeReq { + let res = hdl.mcast_unsubscribe(&McastUnsubscribeReq { port_name: topol.nodes[0].port.name().to_string(), group: mcast_group.into(), }); - // Expected: Ok (no-op). Unsubscribe is idempotent for existing ports. - assert!( - result.is_ok(), - "Unsubscribe should be a no-op (Ok), got: {result:?}" - ); + assert!(res.is_ok(), "Unsubscribe should be a no-op (Ok), got: {res:?}"); Ok(()) } @@ -175,15 +227,13 @@ fn test_subscribe_then_clear_m2p() -> Result<()> { 224, 1, 2, 103, ]); - let mcast = MulticastGroup::new(mcast_group.into(), underlay, vni)?; + let mcast = MulticastGroup::new(mcast_group.into(), underlay)?; - let node_b_underlay = Ipv6Addr::from([ - 0xfd, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x01, - ]); + // Use node B's underlay address as the switch unicast address for routing. 
+ let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); mcast.set_forwarding(vec![( - NextHopV6::new(node_b_underlay, vni), + NextHopV6::new(fake_switch_addr, vni), Replication::External, )])?; @@ -192,17 +242,15 @@ fn test_subscribe_then_clear_m2p() -> Result<()> { node.port.add_multicast_router_entry(mcast_cidr)?; } - topol.nodes[1].port.subscribe_multicast(mcast_group.into())?; + topol.nodes[1] + .port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe should succeed"); - // Clear M2P while subscription active let hdl = OpteHdl::open()?; - hdl.clear_m2p(&ClearMcast2PhysReq { - group: mcast_group.into(), - underlay, - vni, - })?; + hdl.clear_m2p(&ClearMcast2PhysReq { group: mcast_group.into(), underlay }) + .expect("clear_m2p should succeed"); - // Start snoops to verify NO delivery occurs after M2P clear let dev_name_b = topol.nodes[1].port.name().to_string(); let filter_local = format!("udp and ip dst {mcast_group} and port {MCAST_PORT}"); @@ -212,22 +260,21 @@ fn test_subscribe_then_clear_m2p() -> Result<()> { let mut snoop_underlay = SnoopGuard::start(underlay_dev, "ip6 and udp port 6081")?; - // Send packet - command should execute successfully regardless of delivery - let result = topol.nodes[0] - .zone - .zone - .zexec(&format!("echo 'test' | nc -u -w1 {mcast_group} {MCAST_PORT}")); + let sender_v4 = topol.nodes[0].port.ip(); + let res = topol.nodes[0].zone.send_udp_v4( + &sender_v4, + &mcast_group.to_string(), + MCAST_PORT, + "test", + ); - // Expected: Ok (command executed). Delivery should NOT occur. 
- assert!(result.is_ok(), "Send after M2P clear should succeed: {result:?}"); + assert!(res.is_ok(), "Send after M2P clear should succeed: {res:?}"); - // Verify no local delivery if let Ok(out) = snoop_local.wait_with_timeout(Duration::from_secs(2)) { let stdout = String::from_utf8_lossy(&out.stdout); panic!("No local delivery expected; got:\n{stdout}"); } - // Verify no underlay forwarding (encap denied without M2P) if let Ok(out) = snoop_underlay.wait_with_timeout(Duration::from_secs(2)) { let stdout = String::from_utf8_lossy(&out.stdout); panic!( @@ -237,3 +284,215 @@ fn test_subscribe_then_clear_m2p() -> Result<()> { Ok(()) } + +#[test] +fn test_set_mcast_fwd_rejects_non_default_vni() -> Result<()> { + let topol = xde_tests::two_node_topology_named("omicron1", "vnix", "vniy")?; + let hdl = OpteHdl::open()?; + + let mcast_group = Ipv4Addr::from([224, 1, 2, 200]); + let underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 200, + ]); + + let _mcast = MulticastGroup::new(mcast_group.into(), underlay)?; + + // Use a non-default VNI and multicast next-hop address checks separately + let bad_vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI + 1)?; + let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); + + let res = hdl.set_mcast_fwd(&oxide_vpc::api::SetMcastForwardingReq { + underlay, + next_hops: vec![( + NextHopV6::new(fake_switch_addr, bad_vni), + Replication::External, + )], + }); + + assert!(res.is_err(), "set_mcast_fwd should reject non-default VNI"); + Ok(()) +} + +#[test] +fn test_set_mcast_fwd_rejects_multicast_next_hop() -> Result<()> { + let _topol = + xde_tests::two_node_topology_named("omicron1", "mnhx", "mnhy")?; + let hdl = OpteHdl::open()?; + + let mcast_group = Ipv4Addr::from([224, 1, 2, 201]); + let underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 201, + ]); + + let _mcast = 
MulticastGroup::new(mcast_group.into(), underlay)?; + + // Use a multicast address for next-hop (invalid) + let bad_next_hop: Ipv6Addr = "ff04::1".parse().unwrap(); + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + let res = hdl.set_mcast_fwd(&oxide_vpc::api::SetMcastForwardingReq { + underlay, + next_hops: vec![( + NextHopV6::new(bad_next_hop, vni), + Replication::External, + )], + }); + + assert!(res.is_err(), "set_mcast_fwd should reject multicast next-hop"); + Ok(()) +} + +#[test] +fn test_unsubscribe_ipv6_non_underlay_scopes() -> Result<()> { + let topol = xde_tests::two_node_topology_dualstack_named( + "omicron1", "unsv6a", "unsv6b", + )?; + let hdl = OpteHdl::open()?; + + // ff02::/16 (link-local) and ff0e::/16 (global) are rejected by set_m2p, + // so no M2P mapping can exist for these scopes. Unsubscribe should be + // idempotent and return Ok. + + let link_local: Ipv6Addr = "ff02::1:3".parse().unwrap(); + let global: Ipv6Addr = "ff0e::1:3".parse().unwrap(); + + let res_ff02 = hdl.mcast_unsubscribe(&McastUnsubscribeReq { + port_name: topol.nodes[0].port.name().to_string(), + group: link_local.into(), + }); + + assert!( + res_ff02.is_ok(), + "Unsubscribe ff02:: should be idempotent (Ok), got: {res_ff02:?}" + ); + + let res_ff0e = hdl.mcast_unsubscribe(&McastUnsubscribeReq { + port_name: topol.nodes[0].port.name().to_string(), + group: global.into(), + }); + + assert!( + res_ff0e.is_ok(), + "Unsubscribe ff0e:: should be idempotent (Ok), got: {res_ff0e:?}" + ); + + Ok(()) +} + +#[test] +fn test_multiple_nexthops_accumulate() -> Result<()> { + // Test that set_forwarding accumulates next-hops like `swadm route add`: + // - Same underlay + different next-hop → add + // - Same underlay + same next-hop → replace replication mode + let topol = xde_tests::two_node_topology_named("omicron1", "mnha", "mnhb")?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 104]); + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + let underlay = 
Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 104, + ]); + + let mcast = MulticastGroup::new(mcast_group.into(), underlay)?; + + let switch_a = topol.nodes[0].port.underlay_ip().into(); + let switch_b = topol.nodes[1].port.underlay_ip().into(); + + mcast.set_forwarding(vec![( + NextHopV6::new(switch_a, vni), + Replication::External, + )])?; + + let hdl = OpteHdl::open()?; + let fwd = hdl.dump_mcast_fwd()?; + let entry = fwd + .entries + .iter() + .find(|e| e.underlay == underlay) + .expect("missing forwarding entry"); + assert_eq!(entry.next_hops.len(), 1, "Expected 1 next-hop after first set"); + assert_eq!(entry.next_hops[0].0.addr, switch_a); + assert_eq!(entry.next_hops[0].1, Replication::External); + + mcast.set_forwarding(vec![( + NextHopV6::new(switch_b, vni), + Replication::Underlay, + )])?; + + let fwd = hdl.dump_mcast_fwd()?; + let entry = fwd + .entries + .iter() + .find(|e| e.underlay == underlay) + .expect("missing forwarding entry"); + assert_eq!( + entry.next_hops.len(), + 2, + "Expected 2 next-hops after second set" + ); + + let nexthop_a = entry + .next_hops + .iter() + .find(|(nexthop, _)| nexthop.addr == switch_a) + .expect("switch_a not found"); + let nexthop_b = entry + .next_hops + .iter() + .find(|(nexthop, _)| nexthop.addr == switch_b) + .expect("switch_b not found"); + + assert_eq!( + nexthop_a.1, + Replication::External, + "switch_a should have External" + ); + assert_eq!( + nexthop_b.1, + Replication::Underlay, + "switch_b should have Underlay" + ); + + mcast.set_forwarding(vec![( + NextHopV6::new(switch_a, vni), + Replication::Both, + )])?; + + let fwd = hdl.dump_mcast_fwd()?; + let entry = fwd + .entries + .iter() + .find(|e| e.underlay == underlay) + .expect("missing forwarding entry"); + assert_eq!( + entry.next_hops.len(), + 2, + "Expected 2 next-hops after updating switch_a" + ); + + let nexthop_a = entry + .next_hops + .iter() + .find(|(nexthop, _)| nexthop.addr == switch_a) 
+ .expect("switch_a not found"); + let nexthop_b = entry + .next_hops + .iter() + .find(|(nexthop, _)| nexthop.addr == switch_b) + .expect("switch_b not found"); + + assert_eq!( + nexthop_a.1, + Replication::Both, + "switch_a should now have Both (updated)" + ); + assert_eq!( + nexthop_b.1, + Replication::Underlay, + "switch_b should still have Underlay" + ); + + Ok(()) +} diff --git a/xde/src/dev_map.rs b/xde/src/dev_map.rs index 43e95e9e..cabc154d 100644 --- a/xde/src/dev_map.rs +++ b/xde/src/dev_map.rs @@ -11,13 +11,13 @@ use alloc::collections::btree_map::Entry; use alloc::collections::btree_set::BTreeSet; use alloc::string::String; use alloc::sync::Arc; +use alloc::vec::Vec; use opte::api::IpAddr; use opte::api::MacAddr; use opte::api::OpteError; use opte::api::Vni; use opte::ddi::sync::KRwLock; use opte::ddi::sync::KRwLockReadGuard; -use opte::ddi::sync::KRwLockWriteGuard; /// A map/set lookup key for ports indexed on `(Vni, MacAddr)`. /// @@ -39,6 +39,11 @@ impl VniMac { } } +/// Shared ownership of an XDE port. +/// +/// Using `Arc` ensures that ports remain live as long as any +/// `DevMap` snapshot references them, even if the port is removed from +/// the canonical map. This prevents use-after-free in concurrent delivery paths. type Dev = Arc; /// `BTreeMap`-accelerated lookup of XDE ports. @@ -47,6 +52,11 @@ type Dev = Arc; /// pair. The former is used mostly by the control plane, and the latter by the /// data plane -- thus, querying by address provides a direct lookup. Any other /// lookups (e.g., multicast listeners) should return `FastKey`s or `&[FastKey]`s. +/// +/// Multicast subscriptions in `mcast_groups` are port-local and sled-local: +/// ports subscribe to underlay IPv6 multicast groups (ff04::/16) to receive +/// packets for overlay multicast groups. Subscriptions are independent of the +/// forwarding table and are automatically cleaned up when ports are removed. 
#[derive(Clone)] pub struct DevMap { devs: BTreeMap, @@ -93,7 +103,9 @@ impl DevMap { /// Allow a port to receive on a given multicast group. /// - /// This takes the overlay (outer v6) multicast group address. + /// This takes the underlay IPv6 multicast group address (ff04::/16). + /// Callers at the ioctl boundary may pass an overlay group; the handler + /// translates overlay→underlay via the M2P table before calling here. pub fn mcast_subscribe( &mut self, name: &str, @@ -102,8 +114,7 @@ impl DevMap { // Validate that the IP is actually a multicast address if !mcast_ip.is_multicast() { return Err(OpteError::BadState(format!( - "IP address {} is not a multicast address", - mcast_ip + "IP address {mcast_ip} is not a multicast address" ))); } @@ -112,7 +123,6 @@ impl DevMap { .ok_or_else(|| OpteError::PortNotFound(name.into()))?; let key = get_key(port); - // TODO: probably could store Arcs or Weaks here, but want to be safe for now. self.mcast_groups.entry(mcast_ip).or_default().insert(key); Ok(()) @@ -129,7 +139,6 @@ impl DevMap { .ok_or_else(|| OpteError::PortNotFound(name.into()))?; let key = get_key(port); - // TODO: Do we need handling for a special VNI from rack-external traffic? if let Entry::Occupied(set) = self.mcast_groups.entry(mcast_ip) { set.into_mut().remove(&key); } @@ -145,6 +154,12 @@ impl DevMap { self.mcast_groups.get(mcast_ip).map(|v| v.iter()) } + /// Returns true if any multicast subscribers exist on this sled. + #[inline] + pub fn has_mcast_subscribers(&self) -> bool { + !self.mcast_groups.is_empty() + } + /// Return a reference to an `XdeDev` using its address. #[inline] #[must_use] @@ -178,6 +193,12 @@ impl DevMap { /// them to a matching XDE port. /// /// Any chains without a matching port are dropped. + /// + /// Safety: This is safe to call even if ports are being concurrently + /// removed from the canonical `DevMap`, because callers hold an + /// `Arc` which contains `Arc` entries. 
The Arc reference + /// chain ensures all ports in this snapshot remain live for the duration of + /// delivery. #[inline] pub fn deliver_all(&self, postbox: Postbox) { for (k, v) in postbox.drain() { @@ -186,6 +207,20 @@ impl DevMap { } } } + + /// Dump all multicast subscriptions as a vector of (group, ports) pairs. + pub fn dump_mcast_subscriptions(&self) -> Vec<(IpAddr, Vec)> { + let mut out = Vec::new(); + for (group, subs) in self.mcast_groups.iter() { + let ports: Vec = subs + .iter() + .filter_map(|vm| self.devs.get(vm)) + .map(|d| d.devname.clone()) + .collect(); + out.push((group.clone(), ports)); + } + out + } } #[inline(always)] @@ -211,8 +246,4 @@ impl ReadOnlyDevMap { pub fn read(&self) -> KRwLockReadGuard<'_, DevMap> { self.0.read() } - - pub fn write(&self) -> KRwLockWriteGuard<'_, DevMap> { - self.0.write() - } } diff --git a/xde/src/postbox.rs b/xde/src/postbox.rs index fa011d89..ec142a92 100644 --- a/xde/src/postbox.rs +++ b/xde/src/postbox.rs @@ -62,6 +62,12 @@ impl Postbox { pub fn drain(self) -> impl Iterator { self.boxes.into_iter() } + + /// Returns true if there are no queued deliveries. + #[inline] + pub fn is_empty(&self) -> bool { + matches!(self.boxes, Boxes::None) + } } // SAFETY: The only `!Send`/`!Sync` element in here is the `NonNull<...>`. diff --git a/xde/src/stats.rs b/xde/src/stats.rs index 02518ab8..a7a1d498 100644 --- a/xde/src/stats.rs +++ b/xde/src/stats.rs @@ -55,27 +55,31 @@ pub struct XdeStats { out_drop_misc: KStatU64, // NOTE: tun_opt is not relevant to outbound packets -- no encapsulation // is in use. - /// The number of multicast packets delivered to external/customer - /// members (decapsulated packets to local guest instances). - mcast_tx_external: KStatU64, - /// The number of multicast packets forwarded to underlay/infrastructure - /// members (encapsulated Geneve packets to infrastructure destinations). 
+ /// The number of multicast packets delivered to local guest instances + /// on this sled (cloned packets to same-sled OPTE ports via guest_loopback). + mcast_tx_local: KStatU64, + /// The number of multicast packets forwarded to underlay multicast group + /// (encapsulated Geneve packets to other sleds). mcast_tx_underlay: KStatU64, + /// The number of multicast packets forwarded for external replication + /// (unicast to boundary service for front panel egress). + mcast_tx_external: KStatU64, /// The number of times a stale multicast listener was encountered - /// during external delivery. - mcast_tx_stale_external: KStatU64, + /// during local same-sled delivery (TX path). + mcast_tx_stale_local: KStatU64, + /// The number of multicast packets sent with no forwarding entry + /// in the mcast_fwd table (TX path). + mcast_tx_no_fwd_entry: KStatU64, - /// The number of multicast packets received and delivered to external/customer - /// members (decapsulated packets to local guest instances). - mcast_rx_external: KStatU64, - /// The number of multicast packets received and forwarded to underlay/infrastructure - /// members (re-encapsulated Geneve packets to infrastructure destinations). - mcast_rx_underlay: KStatU64, + /// The number of multicast packets received and delivered to local guest + /// instances on this sled (decapsulated packets to same-sled OPTE ports). + mcast_rx_local: KStatU64, /// The number of times a stale multicast listener was encountered - /// during Rx external delivery. - mcast_rx_stale_external: KStatU64, - /// The number of multicast packets received with no forwarding entry. - mcast_rx_no_fwd_entry: KStatU64, + /// during local same-sled delivery (RX path). + mcast_rx_stale_local: KStatU64, + /// The number of multicast packets received with no local subscribers + /// (no matching same-sled listeners for the multicast group). 
+ mcast_rx_no_subscribers: KStatU64, /// The number of times a pullup operation failed during multicast TX /// (packet replication), causing a packet to be dropped. mcast_tx_pullup_fail: KStatU64, @@ -85,32 +89,36 @@ pub struct XdeStats { } impl XdeStats { - pub fn mcast_tx_external(&self) -> &KStatU64 { - &self.mcast_tx_external + pub fn mcast_tx_local(&self) -> &KStatU64 { + &self.mcast_tx_local } pub fn mcast_tx_underlay(&self) -> &KStatU64 { &self.mcast_tx_underlay } - pub fn mcast_tx_stale_external(&self) -> &KStatU64 { - &self.mcast_tx_stale_external + pub fn mcast_tx_external(&self) -> &KStatU64 { + &self.mcast_tx_external + } + + pub fn mcast_tx_stale_local(&self) -> &KStatU64 { + &self.mcast_tx_stale_local } - pub fn mcast_rx_external(&self) -> &KStatU64 { - &self.mcast_rx_external + pub fn mcast_tx_no_fwd_entry(&self) -> &KStatU64 { + &self.mcast_tx_no_fwd_entry } - pub fn mcast_rx_underlay(&self) -> &KStatU64 { - &self.mcast_rx_underlay + pub fn mcast_rx_local(&self) -> &KStatU64 { + &self.mcast_rx_local } - pub fn mcast_rx_stale_external(&self) -> &KStatU64 { - &self.mcast_rx_stale_external + pub fn mcast_rx_stale_local(&self) -> &KStatU64 { + &self.mcast_rx_stale_local } - pub fn mcast_rx_no_fwd_entry(&self) -> &KStatU64 { - &self.mcast_rx_no_fwd_entry + pub fn mcast_rx_no_subscribers(&self) -> &KStatU64 { + &self.mcast_rx_no_subscribers } pub fn mcast_tx_pullup_fail(&self) -> &KStatU64 { diff --git a/xde/src/xde.rs b/xde/src/xde.rs index 7267e581..fab0c957 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -56,66 +56,104 @@ //! `TokenLock` to control write access. //! //! Once we have a port, things become fairly simple. Today, each port has a -//! central RWLock -- reads/writes are only held for the duration of packet +//! central RWLock, as reads/writes are only held for the duration of packet //! processing, or as long as is required to insert new rules. //! -//! ### `DevMap` views +//! ### [`DevMap`] views //! 
Ideally, we want the above interactions to have minimal impact on one another
//! (e.g., insertion of a port should not lock out all use of the datapath).
//! For this reason, we provide the datapath entrypoints with read-only shared
-//! copies of the central `DevMap`.
+//! copies of the central [`DevMap`].
//! * For Rx entrypoints, we allocate a `Vec<KMutex<Arc<DevMap>>>`. Each CPU
//! on the system has its own slot within this `Vec`, such that there should
//! never be lock contention unless a port is being added/removed. The CPU ID
-//! is then used as an index into this table, and the lock is held until all
-//! packets are delivered (as all packet deliveries require a live `XdeDev`).
-//! * For Tx entrypoints, each `XdeDev` holds an RWLock around its copy of the
-//! `DevMap`. When needed for delivery, the Rx pathway acquires the read lock.
-//! We prefer an RwLock here over a Mutex[] given that we can be called from
-//! multiple threads, and our callers are not expected to bound to a given CPU.
-//! Most packet deliveries should go via the underlay.
+//! is then used as an index into this table, the Arc is cloned, and the lock
+//! is dropped immediately. This makes readers lock-free and avoids blocking
+//! management refreshes.
+//! - Safety: The cloned `Arc<DevMap>` keeps all [`XdeDev`]s in that snapshot
+//! alive ([`DevMap`] contains `Arc<XdeDev>` entries), ensuring that delivery
+//! via [`deliver_all()`](DevMap::deliver_all) always operates on live ports.
+//! Physical mutex is held only during Arc clone (single atomic increment),
+//! then dropped.
+//! * For Tx entrypoints, each `XdeDev` holds a per-port `KRwLock<Arc<DevMap>>`.
+//! We prefer an RwLock here over a Mutex given that we can be called from
+//! multiple threads, and our callers are not expected to be bound to a given
+//! CPU.
+//! - Unicast to remote host: No `DevMap` needed, packets go directly to
+//! underlay.
+//! - Hairpin (same-host unicast): Lazily clone per-port `DevMap` Arc for
+//! local delivery.
+//! 
- Multicast: Clone per-CPU `mcast_fwd` Arc once at start. Lazily clone
+//! per-port `DevMap` Arc only if local subscribers exist.
//!
-//! Holding the lock in both cases (rather than cloning out the `Arc`) has an
-//! inherent risk associated, but this is necessary to ensure that no Rx/Tx
-//! contexts will attempt
-//! to send a packet to a port which has been (or is being!)
-//! removed. Holding a read/lock on the `DevMap` in use ensures that any found
-//! port remains alive until any in-progress packet processing is complete.
+//! Cloning the Arc (rather than holding read/lock guards) eliminates re-entrant
+//! read deadlock risk and avoids blocking management operations for the duration
+//! of packet chains. The cloned Arc ensures that no Rx/Tx contexts will attempt
+//! to send a packet to a port which has been (or is being) removed -- holding
+//! the Arc keeps the [`DevMap`] snapshot alive until packet processing is complete.
+//! Since [`DevMap`] contains `Arc<XdeDev>` entries, the Arc reference chain
+//! guarantees all ports in the snapshot remain live throughout delivery (e.g.,
+//! [`deliver_all()`](DevMap::deliver_all)), preventing use-after-free even if
+//! ports are concurrently removed from the canonical mapping.
//!
//! In the Rx case, loopback delivery or MAC->CPU oversubscription present some
//! risk of contention. These are not expected paths in the product, but using
//! them does not impact correctness.
//!
-//! The remaining locking risks are double-locking a given Rx Mutex by the same
-//! thread, and re-entrant reads on a Tx RwLock without readers-starve-writers
-//! configured. The first such case results in a panic, but can only happen if
-//! we transit the NIC's Rx path twice in the same stack (i.e. Rx on NIC ->
-//! mac_rx on the OPTE port -> ... -> loopback delivery to underlay device).
-//! This should be impossible, given that any packet sent upstack by XDE must
-//! have a MAC address belonging to the OPTE port.
+//! 
The remaining locking risk is double-locking a given Rx Mutex by the same +//! thread during the brief Arc clone operation. This results in a panic, but can +//! only happen if we transit the NIC's Rx path twice in the same stack (i.e. +//! Rx on NIC -> mac_rx on the OPTE port -> ... -> loopback delivery to underlay +//! device). This should be impossible, given that any packet sent upstack by XDE +//! must have a MAC address belonging to the OPTE port. //! -//! The second exposes us to a deadlock if the ordering `read[xde_mc_tx] -> -//! write[ioctl] -> read[xde_mc_tx]` occurs on one lock -- the latter read -//! acquisition will block indefinitely. This is a possibility we need to -//! consciously work around. Hairpin exchanges (e.g., ARP -> ICMP ping, DHCP) -//! can lead to fairly deep stacks of the form `(ip) -> xde_mc_tx -> (ip) -> -//! xde_mc_tx -> ...` when used with zones (this is not an issue with viona, -//! which returns once packets are communicated to the guest). Thus, we *must* -//! drop the read before delivering any hairpin packets. +//! The previous re-entrant read deadlock risk (`read[xde_mc_tx] -> write[ioctl] +//! -> read[xde_mc_tx]`) has been eliminated by using Arc clones instead of held +//! read guards. Once the Arc is cloned and the lock is dropped, subsequent +//! re-entries will acquire a fresh lock without conflict. Hairpin exchanges +//! (e.g., ARP -> ICMP ping, DHCP) can safely create deep stacks of the form +//! `(ip) -> xde_mc_tx -> (ip) -> xde_mc_tx -> ...` when used with zones. //! -//! ### `TokenLock` and `DevMap` updates +//! Note: +//! - We cannot afford to take the management lock (`TokenLock`) during any +//! dataplane operation. If a dataplane path ever needs to consult the +//! central source of truth directly, the minimally acceptable pattern is a +//! read of `state.devs.read()` (never the management token itself). In +//! practice, to further reduce contention on readers counters we avoid even +//! 
this by using per-CPU cached `Arc` snapshots for both RX and TX. +//! - Multicast forwarding state (`mcast_fwd`) follows the same model: a copy +//! is kept in each [`PerEntryState`] (per-CPU) and updated by `refresh_maps()` +//! whenever the canonical forwarding table changes. This ensures RX/TX always +//! observe a coherent snapshot without taking the management lock. We do not +//! maintain per-port copies (those were removed to avoid per-port RwLock +//! contention issues). +//! +//! ### `TokenLock` and [`DevMap`] updates //! The `TokenLock` primitive provides us with logical mutual exclusion around -//! the underlay and the ability to modify the canonical `DevMap` -- without +//! the underlay and the ability to modify the canonical [`DevMap`] -- without //! holding a `KMutex`. Management operations made by OPTE *will* upcall -- we //! must resolve link names to IDs, and add/remove link information from DLS. //! Doing so makes an ioctl thread vulnerable to receiving signals, so other //! threads trying to take the management lock must be able to take, e.g., //! a SIGSTOP. //! -//! Whenever the central `DevMap` is modified, we iterate through each reachable -//! `XdeDev` and underlay port, and for every instance of the cloned `DevMap` we -//! write()/lock() that entry, replace it with the new contents, and drop the -//! lock. This ensures that port removal cannot fully proceed until the port is -//! no longer usable from any Tx/Rx context. +//! Whenever the central [`DevMap`] is modified, we iterate through each reachable +//! [`XdeDev`] and underlay port, and for every instance of the cloned [`DevMap`] +//! and `mcast_fwd` we write()/lock() that entry, replace it with the new +//! contents, and drop the lock. This ensures that port removal cannot fully +//! proceed until the port is no longer usable from any Tx/Rx context and that +//! multicast delivery and forwarding use the matching snapshot. +//! +//! ### Teardown and reference cycles +//! 
The Arc-cloning strategy creates a reference cycle during normal operation: +//! underlay port → stream → ports_map (per-CPU) → [`DevMap`] → [`XdeDev`] → underlay port. +//! This is benign during operation but must be broken during teardown. +//! +//! When `clear_xde_underlay()` is called (after all ports have been removed), +//! we explicitly clear per-CPU cached `DevMap`s by replacing them with empty +//! snapshots. This breaks the cycle and allows underlay port Arcs to be unwrapped. +//! If brief in-flight TX chains still hold `DevMap` references, the unwrap returns +//! EBUSY and the caller can retry. Refs drain quickly once caches are cleared. use crate::dev_map::DevMap; use crate::dev_map::ReadOnlyDevMap; @@ -166,6 +204,8 @@ use core::ptr; use core::ptr::NonNull; use core::ptr::addr_of; use core::ptr::addr_of_mut; +use core::sync::atomic::AtomicBool; +use core::sync::atomic::Ordering; use core::time::Duration; use illumos_sys_hdrs::mac::MacEtherOffloadFlags; use illumos_sys_hdrs::mac::MblkOffloadFlags; @@ -203,18 +243,22 @@ use opte::ddi::mblk::MsgBlk; use opte::ddi::mblk::MsgBlkChain; use opte::ddi::sync::KMutex; use opte::ddi::sync::KRwLock; -use opte::ddi::sync::KRwLockReadGuard; use opte::ddi::sync::KRwLockWriteGuard; use opte::ddi::sync::TokenGuard; use opte::ddi::sync::TokenLock; +use opte::ddi::sync::clone_from_mutex; +use opte::ddi::sync::clone_from_rwlock; use opte::ddi::time::Interval; use opte::ddi::time::Periodic; use opte::engine::NetworkImpl; +use opte::engine::ether::EtherAddr; use opte::engine::ether::Ethernet; use opte::engine::ether::EthernetRef; use opte::engine::geneve::Vni; use opte::engine::geneve::WalkOptions; use opte::engine::headers::IpAddr; +use opte::engine::ip::ValidL3; +use opte::engine::ip::v4::Ipv4Ref; use opte::engine::ip::v6::Ipv6Addr; use opte::engine::ip::v6::Ipv6Ref; use opte::engine::packet::InnerFlowId; @@ -224,6 +268,7 @@ use opte::engine::parse::ValidUlp; use opte::engine::port::Port; use 
opte::engine::port::PortBuilder; use opte::engine::port::ProcessResult; +use opte::engine::rule::MappingResource; use oxide_vpc::api::AddFwRuleReq; use oxide_vpc::api::AddRouterEntryReq; use oxide_vpc::api::ClearMcast2PhysReq; @@ -237,11 +282,13 @@ use oxide_vpc::api::DelRouterEntryResp; use oxide_vpc::api::DeleteXdeReq; use oxide_vpc::api::DhcpCfg; use oxide_vpc::api::DumpMcastForwardingResp; +use oxide_vpc::api::DumpMcastSubscriptionsResp; use oxide_vpc::api::DumpVirt2BoundaryResp; use oxide_vpc::api::DumpVirt2PhysResp; use oxide_vpc::api::ListPortsResp; use oxide_vpc::api::McastForwardingEntry; use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastSubscriptionEntry; use oxide_vpc::api::McastUnsubscribeReq; use oxide_vpc::api::NextHopV6; use oxide_vpc::api::PhysNet; @@ -268,6 +315,11 @@ use oxide_vpc::engine::router; const ETHERNET_MTU: u16 = 1500; +// Type alias for multicast forwarding table: +// Maps IPv6 destination addresses to their next-hop replication entries. +type McastForwardingTable = + BTreeMap>; + // Entry limits for the various flow tables. 
const FW_FT_LIMIT: NonZeroU32 = NonZeroU32::new(8096).unwrap(); const FT_LIMIT_ONE: NonZeroU32 = NonZeroU32::new(1).unwrap(); @@ -305,13 +357,11 @@ unsafe extern "C" { af: uintptr_t, // AF_INET or AF_INET6 inner_dst: uintptr_t, // *const Ipv4Addr or *const Ipv6Addr vni: uintptr_t, - replication: uintptr_t, ); pub safe fn __dtrace_probe_mcast__rx( af: uintptr_t, inner_dst: uintptr_t, vni: uintptr_t, - replication: uintptr_t, ); pub safe fn __dtrace_probe_mcast__local__delivery( af: uintptr_t, @@ -325,6 +375,55 @@ unsafe extern "C" { vni: uintptr_t, next_hop: *const oxide_vpc::api::Ipv6Addr, ); + pub safe fn __dtrace_probe_mcast__external__fwd( + af: uintptr_t, + inner_dst: uintptr_t, + vni: uintptr_t, + next_hop: *const oxide_vpc::api::Ipv6Addr, + ); + + // Multicast control-plane probes + pub safe fn __dtrace_probe_mcast__map__set( + af: uintptr_t, + group: uintptr_t, + underlay: *const oxide_vpc::api::Ipv6Addr, + vni: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__map__clear( + af: uintptr_t, + group: uintptr_t, + underlay: *const oxide_vpc::api::Ipv6Addr, + vni: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__fwd__set( + underlay: *const oxide_vpc::api::Ipv6Addr, + count: uintptr_t, + vni: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__fwd__clear( + underlay: *const oxide_vpc::api::Ipv6Addr, + vni: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__subscribe( + port: uintptr_t, + af: uintptr_t, + group: uintptr_t, + vni: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__unsubscribe( + port: uintptr_t, + af: uintptr_t, + group: uintptr_t, + vni: uintptr_t, + ); + + // Multicast dataplane problem probes + pub safe fn __dtrace_probe_mcast__tx__pullup__fail(len: uintptr_t); + pub safe fn __dtrace_probe_mcast__rx__pullup__fail(len: uintptr_t); + pub safe fn __dtrace_probe_mcast__no__fwd__entry( + underlay: *const oxide_vpc::api::Ipv6Addr, + vni: uintptr_t, + ); } fn bad_packet_parse_probe( @@ -401,6 +500,7 @@ struct XdeState { management_lock: 
TokenLock, ectx: Arc, vpc_map: Arc, + m2p: Arc, v2b: Arc, devs: ReadOnlyDevMap, stats: KStatNamed, @@ -417,10 +517,10 @@ struct XdeMgmt { devs: Arc>, underlay: Option, - /// XDE-wide multicast forwarding table mapping multicast group addresses + /// XDE-wide multicast forwarding table mapping underlay multicast addresses /// to their physical next hops with replication information. - /// Maps: IpAddr (overlay multicast group) -> BTreeMap - mcast_fwd: Arc>>>, + /// Maps: Ipv6Addr (underlay multicast address) -> BTreeMap + mcast_fwd: Arc>, } #[derive(Clone)] @@ -444,7 +544,6 @@ fn get_xde_state() -> &'static XdeState { impl XdeState { fn new() -> Self { - #[allow(clippy::arc_with_non_send_sync)] let ectx = Arc::new(ExecCtx { log: Box::new(opte::KernelLog {}) }); let dev_map = Arc::new(KRwLock::new(DevMap::default())); let devs = ReadOnlyDevMap::new(dev_map.clone()); @@ -458,6 +557,7 @@ impl XdeState { devs, ectx, vpc_map: Arc::new(overlay::VpcMappings::new()), + m2p: Arc::new(overlay::Mcast2Phys::new()), v2b: Arc::new(overlay::Virt2Boundary::new()), stats: KStatNamed::new("xde", "xde", XdeStats::new()) .expect("Name is well-constructed (len, no NUL bytes)"), @@ -490,7 +590,7 @@ pub struct XdeDev { // However, that's not where things are today. pub port: Arc>, vpc_cfg: VpcCfg, - port_vni_state: Arc, + port_v2p: Arc, // Pass the packets through to the underlay devices, skipping // opte-core processing. @@ -547,7 +647,12 @@ pub enum UnderlayIndex { #[repr(C)] struct PerEntryState { devs: KMutex>, - _pad: [u8; 48], + mcast_fwd: KRwLock>, + /// Fast-path check: `true` if any multicast subscribers exist on this sled. + /// Allows skipping DevMap lock entirely for multicast when no local listeners exist. + /// Updated by refresh_maps() on port add/remove. 
+ has_mcast_subscribers: AtomicBool, + _pad: [u8; 31], } const _: () = assert!( @@ -557,7 +662,12 @@ const _: () = assert!( impl Default for PerEntryState { fn default() -> Self { - Self { devs: KMutex::new(Arc::new(DevMap::new())), _pad: [0u8; 48] } + Self { + devs: KMutex::new(Arc::new(DevMap::new())), + mcast_fwd: KRwLock::new(Arc::new(BTreeMap::new())), + has_mcast_subscribers: AtomicBool::new(false), + _pad: [0u8; 31], + } } } @@ -930,6 +1040,11 @@ unsafe extern "C" fn xde_ioc_opte_cmd(karg: *mut c_void, mode: c_int) -> c_int { hdlr_resp(&mut env, resp) } + OpteCmd::DumpMcastSubscriptions => { + let resp = dump_mcast_subscriptions_hdlr(); + hdlr_resp(&mut env, resp) + } + OpteCmd::McastSubscribe => { let resp = mcast_subscribe_hdlr(&mut env); hdlr_resp(&mut env, resp) @@ -1037,12 +1152,13 @@ fn create_xde(req: &CreateXdeReq) -> Result { req.xde_devname.clone(), &cfg, state.vpc_map.clone(), + state.m2p.clone(), port_v2p.clone(), state.v2b.clone(), state.ectx.clone(), &req.dhcp, )?, - port_vni_state: port_v2p, + port_v2p, vni: cfg.vni, vpc_cfg: cfg, passthrough: req.passthrough, @@ -1132,6 +1248,7 @@ fn create_xde(req: &CreateXdeReq) -> Result { token.underlay.as_ref().expect( "bailed out above if no underlay, and protected by token", ), + &token.mcast_fwd, ); } @@ -1158,15 +1275,20 @@ fn delete_xde(req: &DeleteXdeReq) -> Result { .underlay .as_ref() .expect("underlay must exist while ports exist"), + &token.mcast_fwd, ); xde }; - // Clear the port's devmap to break any cycles. + // Break potential self-reference cycles before dropping this `XdeDev` by + // resetting its per-port `DevMap` snapshot to an empty map. Otherwise, the + // `Arc` inside `port_map` may still contain an Arc back to this + // same XdeDev, keeping it (and its underlay Arc clones) alive beyond + // deletion. 
{ - let mut pmap = xde.port_map.write(); - *pmap = Default::default(); + let mut port_map = xde.port_map.write(); + *port_map = Arc::new(DevMap::new()); } let return_port = |token: &TokenGuard<'_, XdeMgmt>, port| { @@ -1178,6 +1300,7 @@ fn delete_xde(req: &DeleteXdeReq) -> Result { .underlay .as_ref() .expect("underlay must exist while ports exist"), + &token.mcast_fwd, ); }; @@ -1240,24 +1363,46 @@ fn delete_xde(req: &DeleteXdeReq) -> Result { Ok(NoResp::default()) } -/// Rebuild each entrypoint's view of the central `DevMap`. -fn refresh_maps(devs: KRwLockWriteGuard, underlay: &UnderlayState) { +/// Rebuild each entrypoint's view of the central [`DevMap`] and multicast +/// forwarding table `McastForwardingTable`. +fn refresh_maps( + devs: KRwLockWriteGuard, + underlay: &UnderlayState, + mcast_fwd: &Arc>, +) { let new_map = Arc::new(devs.clone()); + let new_mcast_fwd = Arc::new(mcast_fwd.read().clone()); + let has_subscribers = new_map.has_mcast_subscribers(); + + // Update both underlay ports' per-CPU caches (u1 and u2). + // Each underlay port has a Vec with one entry per CPU. + let underlay_ports = + [&underlay.u1.stream.ports_map, &underlay.u2.stream.ports_map]; + for per_cpu_map in underlay_ports { + for entry in per_cpu_map { + { + let mut map = entry.devs.lock(); + *map = Arc::clone(&new_map); + } + { + let mut mcast = entry.mcast_fwd.write(); + *mcast = Arc::clone(&new_mcast_fwd); + } + // Update fast-path flag for multicast optimization. + // Relaxed ordering is fine: stale reads are safe. If a CPU sees + // stale `false`, it skips obtaining DevMap entirely (no subscribers). + // If it sees stale `true`, it clones DevMap Arc and checks. + entry + .has_mcast_subscribers + .store(has_subscribers, Ordering::Relaxed); + } + } - // Update all ports' maps. - for port in devs.iter() { + // Update all ports' per-port maps. + for port in new_map.iter() { let mut map = port.port_map.write(); *map = Arc::clone(&new_map); } - - // Update all underlays' maps. 
- let ports = [&underlay.u1.stream.ports_map, &underlay.u2.stream.ports_map]; - for port in ports { - for map in port { - let mut map = map.devs.lock(); - *map = Arc::clone(&new_map); - } - } } struct ResolvedLink<'a>(&'a str, LinkId); @@ -1320,50 +1465,107 @@ fn clear_xde_underlay() -> Result { // Clear multicast forwarding table to release any references token.mcast_fwd.write().clear(); - if let Some(underlay) = token.underlay.take() { - // If the underlay references have leaked/spread beyond `XdeDev`s and not - // been cleaned up, we committed have a fatal programming error. - // We aren't using `Weak` references to these types either, so no strong - // references could be created. - // - // We know these must succeed given that the only holders of an - // `Arc` are `XdeState` (whose ref we have exclusively locked) - // and `XdeDev` (of which none remain). - let name = underlay.u1.name.clone(); - let u1 = Arc::into_inner(underlay.u1).unwrap_or_else(|| { - panic!("underlay u1 ({name}) must have one ref during teardown",) - }); + // Before taking ownership of the underlay Arcs, clear per-CPU cached + // `DevMap`s and multicast forwarding tables (in underlay ports' `PerEntryState`). + // This breaks snapshot cycles: underlay → per-CPU cache → `DevMap` → + // `XdeDev` → underlay. + // + // Note: Per-port `DevMap` caches (`XdeDev.port_map`) were already cleared + // when ports were deleted. This function only runs after all ports are + // removed. 
+ if let Some(ul_ref) = token.underlay.as_ref() { + let empty_map = Arc::new(DevMap::new()); + let empty_mcast: Arc = Arc::new(BTreeMap::new()); + let underlay_ports = + [&ul_ref.u1.stream.ports_map, &ul_ref.u2.stream.ports_map]; + for per_cpu_map in underlay_ports { + for entry in per_cpu_map { + { + let mut map = entry.devs.lock(); + *map = Arc::clone(&empty_map); + } + { + let mut mcast = entry.mcast_fwd.write(); + *mcast = Arc::clone(&empty_mcast); + } + entry.has_mcast_subscribers.store(false, Ordering::Relaxed); + } + } + } - let name = underlay.u2.name.clone(); - let u2 = Arc::into_inner(underlay.u2).unwrap_or_else(|| { - panic!("underlay u2 ({name}) must have one ref during teardown",) - }); + // Early-check: ensure the underlay port Arcs are uniquely owned by + // XDE before we move them out. In-flight dataplane work may still hold + // references to these Arcs briefly after cache clearing. If so, return + // `EBUSY` so the caller can retry. + if let Some(ul_ref) = token.underlay.as_ref() { + if Arc::strong_count(&ul_ref.u1) != 1 + || Arc::strong_count(&ul_ref.u2) != 1 + { + return Err(OpteError::System { + errno: EBUSY, + msg: "underlay ports still have active references; retry teardown".into(), + }); + } + } - for u in [u1, u2] { - // We have a chain of refs here: `MacSiphon` holds a ref to - // `DlsStream`. We explicitly drop them in order here to ensure - // there are no outstanding refs. - - // 1. Remove packet rx callback. - drop(u.siphon); - - // Although `xde_rx` can be called into without any running ports - // via the siphon handle, illumos guarantees that this callback won't - // be running here. `mac_siphon_clear` performs the moral equivalent of - // `mac_rx_barrier` -- the client's SRS is quiesced, and then restarted - // after the callback is removed. - // Because there are no ports and we hold the write/management lock, no - // one else will have or try to clone the Stream handle. - - // 2. Close the open stream handle. 
- // The only other hold on this `DlsStream` is via `u.siphon`, which - // we just dropped. The `expect` asserts that we have consumed them - // in the correct order. - Arc::into_inner(u.stream).unwrap_or_else(|| { - panic!( - "underlay ({}) must have no external refs to its DlsStream", - u.name - ) + // Take ownership of the underlay state now that caches are cleared and + // the Arcs appear uniquely owned. + let underlay = token.underlay.take().ok_or_else(|| OpteError::System { + errno: ENOENT, + msg: "underlay not initialized (already checked above)".into(), + })?; + + // Unwrap underlay port Arcs; if any references remain (e.g., in-flight + // dataplane), return EBUSY so caller can retry. + let XdeUnderlayPort { + name: u1_name, + siphon: u1_siphon, + stream: u1_stream, + .. + } = Arc::into_inner(underlay.u1).ok_or_else(|| { + warn!( + "clear_xde_underlay: u1 Arc has outstanding refs after cache clear" + ); + OpteError::System { + errno: EBUSY, + msg: "underlay u1 still has active references during teardown" + .into(), + } + })?; + + let XdeUnderlayPort { + name: u2_name, + siphon: u2_siphon, + stream: u2_stream, + .. + } = Arc::into_inner(underlay.u2).ok_or_else(|| { + warn!( + "clear_xde_underlay: u2 Arc has outstanding refs after cache clear" + ); + OpteError::System { + errno: EBUSY, + msg: "underlay u2 still has active references during teardown" + .into(), + } + })?; + + // Quiesce RX by dropping siphons; this removes the MAC callbacks and + // releases the siphon's Arc reference to the streams' parent. + drop(u1_siphon); + drop(u2_siphon); + + // Verify and close the DLS stream handles. After dropping siphons, the + // only remaining strong reference should be `u*_stream` itself. 
+ for (name, stream) in [(u1_name, u1_stream), (u2_name, u2_stream)] { + if Arc::into_inner(stream).is_none() { + warn!( + "clear_xde_underlay: {name} DlsStream Arc has outstanding refs after siphon drop" + ); + return Err(OpteError::System { + errno: EBUSY, + msg: format!( + "underlay ({name}) DlsStream still has active references; retry teardown" + ), }); } } @@ -1856,7 +2058,7 @@ fn guest_loopback_probe( fn guest_loopback( src_dev: &XdeDev, - dest_dev: &XdeDev, + dst_dev: &XdeDev, port_key: VniMac, mut pkt: MsgBlk, postbox: &mut TxPostbox, @@ -1867,7 +2069,7 @@ fn guest_loopback( // Loopback requires a reparse to account for UFT fastpath. // We might be able to do better, but the logistics in passing around - // the emitspec in lieu of 'full' metadata might be a little troublesome. + // the emitspec in lieu of "full" metadata might be a little troublesome. let parsed_pkt = match Packet::parse_inbound(pkt.iter_mut(), VpcParser {}) { Ok(pkt) => pkt, Err(e) => { @@ -1892,9 +2094,9 @@ fn guest_loopback( let flow = parsed_pkt.flow(); - guest_loopback_probe(mblk_addr, &flow, src_dev, dest_dev); + guest_loopback_probe(mblk_addr, &flow, src_dev, dst_dev); - match dest_dev.port.process(In, parsed_pkt) { + match dst_dev.port.process(In, parsed_pkt) { Ok(ProcessResult::Modified(emit_spec)) => { let mut pkt = emit_spec.apply(pkt); if let Err(e) = pkt.fill_parse_info(&ulp_meoi, None) { @@ -1940,7 +2142,7 @@ fn guest_loopback( opte::engine::dbg!( "loopback port process error: {} -> {} {:?}", src_dev.port.name(), - dest_dev.port.name(), + dst_dev.port.name(), e ); } @@ -1953,9 +2155,9 @@ fn guest_loopback( /// Oxide Multicast option (class=0x0129, type=0x01). Returns the offset to the /// option body (after the 4-byte option header) if found. /// -/// # Safety -/// This function validates option headers as it walks to avoid reading beyond -/// packet boundaries. Returns None if the option is not found or if validation fails. 
+/// Safety: This function validates option headers as it walks to avoid reading +/// beyond packet boundaries. Returns `None` if the option is not found or if +/// validation fails. /// /// # Geneve Option Format /// Each option consists of: @@ -2010,14 +2212,14 @@ fn find_mcast_option_offset( None } -/// Update the Oxide Multicast Geneve option's replication field. +/// Update the Oxide Multicast Geneve option's TX-only replication field. /// -/// Locates the multicast option and rewrites the replication strategy in the +/// Locates the multicast option and rewrites the TX-only replication instruction in the /// first byte of the option body (top 2 bits encode the replication mode). /// /// Returns `true` if the option was found and updated, `false` otherwise. /// -/// # Replication Encoding +/// # Replication Encoding (TX-only) /// The replication field uses the top 2 bits of the first byte: /// - `External` (0): 0x00 /// - `Underlay` (1): 0x40 @@ -2044,27 +2246,9 @@ fn update_mcast_replication( true } -/// Compute the combined replication strategy from a set of next hops. -/// -/// Starts from the first hop's replication and folds the rest using -/// `Replication::merge()` to avoid biasing toward `External`. -/// Returns `None` if `next_hops` is empty. 
-#[inline] -fn compute_replication_strategy( - next_hops: &BTreeMap, -) -> Option { - let mut acc: Option = None; - for repl in next_hops.values().copied() { - acc = Some(match acc { - None => repl, - Some(cur) => cur.merge(repl), - }); - } - acc -} - struct MulticastTxContext<'a> { - inner_dst: oxide_vpc::api::IpAddr, // Inner/overlay destination IP + inner_dst: oxide_vpc::api::IpAddr, // Inner/overlay destination IP (for subscriptions) + underlay_dst: Ipv6Addr, // Outer/underlay destination IP (for forwarding lookup) vni: Vni, out_pkt: &'a MsgBlk, encap_len: u32, @@ -2075,52 +2259,37 @@ struct MulticastTxContext<'a> { } struct MulticastRxContext<'a> { - inner_dst: oxide_vpc::api::IpAddr, // Inner/overlay destination IP + inner_dst: oxide_vpc::api::IpAddr, // Inner/overlay destination IP (for subscriptions) + underlay_dst: Ipv6Addr, // Outer/underlay destination IP (for forwarding lookup) vni: Vni, pkt: &'a MsgBlk, pullup_len: usize, - geneve_offset: usize, - incoming_delivery_mode: Option, + // Reserved for future use: may be needed for relay detection or debugging + _geneve_offset: usize, + _incoming_delivery_mode: Option, } -/// Handle multicast packet forwarding for both external/customer and -/// underlay/infrastructure delivery based on the XDE-wide multicast -/// forwarding table. +/// Handle multicast packet forwarding for same-sled delivery and underlay +/// replication based on the XDE-wide multicast forwarding table. /// -/// - External: Customer-facing members, local guest instances (decapsulated) -/// - Underlay: Infrastructure members, underlay destinations (encapsulated Geneve) +/// Always delivers to local same-sled subscribers regardless of replication mode. +/// Routes to next hop unicast addresses for ALL replication modes to determine +/// reachability and underlay port/MAC. Packet destination is always the multicast +/// address with multicast MAC. 
The [`Replication`] type is a TX-only instruction +/// telling the switch which port groups to replicate to: External (front panel), +/// Underlay (other sleds), or Both. +/// +/// `cpu_devs` may be None if the fast-path check indicated no local subscribers exist. +/// +/// [`Replication`]: oxide_vpc::api::Replication fn handle_mcast_tx<'a>( ctx: MulticastTxContext, src_dev: &'a XdeDev, postbox: &mut TxPostbox, - entry_state: &mut Option>>, + cpu_devs: Option<&'a DevMap>, + cpu_mcast_fwd: &'a Arc, ) { - // DTrace probe: capture TX entry - let (af, inner_addr_ptr) = match &ctx.inner_dst { - oxide_vpc::api::IpAddr::Ip4(v4) => { - (2usize, AsRef::<[u8]>::as_ref(v4).as_ptr() as uintptr_t) - } - oxide_vpc::api::IpAddr::Ip6(v6) => { - (26usize, AsRef::<[u8]>::as_ref(v6).as_ptr() as uintptr_t) - } - }; - - // Determine replication strategy from XDE-wide multicast forwarding table - let xde = get_xde_state(); - let mgmt = xde.management_lock.lock(); - let mcast_fwd = mgmt.mcast_fwd.read(); - - // Compute combined replication strategy from all next hops to govern local delivery. 
- let delivery_mode = mcast_fwd - .get(&ctx.inner_dst) - .and_then(compute_replication_strategy) - .unwrap_or(Replication::External); - - // Drop locks before potentially expensive operations - drop(mcast_fwd); - drop(mgmt); - - // DTrace probe: multicast TX entry with delivery mode + // DTrace probe: multicast TX entry let (af, addr_ptr) = match &ctx.inner_dst { oxide_vpc::api::IpAddr::Ip4(v4) => { (2usize, AsRef::<[u8]>::as_ref(v4).as_ptr() as uintptr_t) @@ -2129,44 +2298,33 @@ fn handle_mcast_tx<'a>( (26usize, AsRef::<[u8]>::as_ref(v6).as_ptr() as uintptr_t) } }; - __dtrace_probe_mcast__tx( - af, - addr_ptr, - ctx.vni.as_u32() as uintptr_t, - delivery_mode as uintptr_t, - ); - - // External/customer delivery if delivery mode is External or All - // Delivers decapsulated packets to customer-facing members in the same VNI - let do_external = matches!( - delivery_mode, - oxide_vpc::api::Replication::External - | oxide_vpc::api::Replication::All - ); - - if do_external { - let entry_state = - entry_state.get_or_insert_with(|| src_dev.port_map.read()); - if let Some(others) = entry_state.mcast_listeners(&ctx.inner_dst) { + __dtrace_probe_mcast__tx(af, addr_ptr, ctx.vni.as_u32() as uintptr_t); + + // Compute packet offsets once (used for both local delivery and next hop forwarding) + let pullup_len = (ctx.encap_len as usize) + + (ctx.non_eth_payl_bytes as usize) + + ctx.inner_eth_len; + let geneve_offset = usize::from(ctx.tun_meoi.meoi_l2hlen) + + usize::from(ctx.tun_meoi.meoi_l3hlen) + + usize::from(ctx.tun_meoi.meoi_l4hlen); + + // Local same-sled delivery: always deliver to subscribers on this sled, + // independent of the TX-only Replication instruction (not an access control mechanism). + // The Replication type only affects how switches handle the packet on TX. + // Subscription is keyed by underlay (outer) IPv6 multicast address. + // If cpu_devs is None, we know from the fast-path check that no subscribers exist. 
+ if let Some(devs) = cpu_devs { + let group_key = { + let ip6 = oxide_vpc::api::Ipv6Addr::from(ctx.underlay_dst.bytes()); + oxide_vpc::api::IpAddr::from(ip6) + }; + if let Some(others) = devs.mcast_listeners(&group_key) { let my_key = VniMac::new(ctx.vni, src_dev.port.mac_addr()); for el in others { - // Filter by VNI - only deliver to listeners in the same VNI - if el.vni() != ctx.vni { - continue; - } + // Skip delivering to self if my_key == *el { continue; } - - // This is a more lightweight clone in illumos, and - // gives us an owned form of the headers but a ref - // counted clone of the packet body. - // - // If there are any body transforms internally, OPTE - // will fully clone out the contents if required. - let pullup_len = (ctx.encap_len as usize) - + (ctx.non_eth_payl_bytes as usize) - + ctx.inner_eth_len; let Ok(my_pkt) = ctx.out_pkt.pullup(NonZeroUsize::new(pullup_len)) else { @@ -2176,9 +2334,12 @@ fn handle_mcast_tx<'a>( ); let xde = get_xde_state(); xde.stats.vals.mcast_tx_pullup_fail().incr(1); + __dtrace_probe_mcast__tx__pullup__fail( + pullup_len as uintptr_t, + ); continue; }; - match entry_state.get_by_key(*el) { + match devs.get_by_key(*el) { Some(dev) => { // DTrace probe: local delivery let (af, addr_ptr) = match &ctx.inner_dst { @@ -2199,176 +2360,223 @@ fn handle_mcast_tx<'a>( ); guest_loopback(src_dev, dev, *el, my_pkt, postbox); let xde = get_xde_state(); - xde.stats.vals.mcast_tx_external().incr(1); + xde.stats.vals.mcast_tx_local().incr(1); } None => { let xde = get_xde_state(); - xde.stats.vals.mcast_tx_stale_external().incr(1); + xde.stats.vals.mcast_tx_stale_local().incr(1); } } } } } - // Underlay/infrastructure forwarding only if the merged delivery mode - // calls for it. External-only means local delivery only, no underlay fanout. 
- let do_underlay = matches!( - delivery_mode, - oxide_vpc::api::Replication::Underlay - | oxide_vpc::api::Replication::All - ); - - if do_underlay { - // Re-acquire locks for underlay forwarding + // Next hop forwarding: send packets to configured next hops. + // + // At the leaf level, we process all next hops in the forwarding table. + // Each next hop's `Replication` is a TX-only instruction telling the switch + // which ports to replicate to: + // - External: ports set for external multicast traffic (egress to external networks) + // - Underlay: replicate to other sleds (using multicast outer dst) + // - Both: both external and underlay replication + // + // We already have the Arc from the per-CPU cache, no need to clone. + if cpu_mcast_fwd.get(&ctx.underlay_dst).is_none() { + __dtrace_probe_mcast__no__fwd__entry( + &ctx.underlay_dst, + ctx.vni.as_u32() as uintptr_t, + ); let xde = get_xde_state(); - let mgmt = xde.management_lock.lock(); - let mcast_fwd = mgmt.mcast_fwd.read(); - - if let Some(next_hops) = mcast_fwd.get(&ctx.inner_dst) { - // We found forwarding entries, replicate to each next hop - for (next_hop, replication) in next_hops.iter() { - // Clone packet with headers using pullup - let pullup_len = (ctx.encap_len as usize) - + (ctx.non_eth_payl_bytes as usize) - + ctx.inner_eth_len; - let Ok(mut fwd_pkt) = - ctx.out_pkt.pullup(NonZeroUsize::new(pullup_len)) - else { - opte::engine::dbg!( - "mcast TX underlay pullup failed: requested {} bytes", - pullup_len - ); - let xde = get_xde_state(); - xde.stats.vals.mcast_tx_pullup_fail().incr(1); - continue; // Skip this destination on allocation failure - }; - - // Modify VNI in Geneve header to next_hop.vni - // Geneve header follows outer Ethernet + IPv6 + UDP - let geneve_offset = usize::from(ctx.tun_meoi.meoi_l2hlen) - + usize::from(ctx.tun_meoi.meoi_l3hlen) - + usize::from(ctx.tun_meoi.meoi_l4hlen); - - // Determine the actual outer IPv6 destination and whether to modify it - // - External: Override 
with unicast next_hop.addr for delivery to specific host - // - Underlay/All: Keep the multicast underlay address from OPTE (already set via M2P) - let ipv6_offset = usize::from(ctx.tun_meoi.meoi_l2hlen); - let actual_outer_dst = match replication { - oxide_vpc::api::Replication::External => { - // External replication: override with unicast destination - let ipv6_dst_offset = ipv6_offset + 24; - if let Some(dst_bytes) = fwd_pkt - .get_mut(ipv6_dst_offset..ipv6_dst_offset + 16) - { - dst_bytes.copy_from_slice(AsRef::<[u8]>::as_ref( - &next_hop.addr, - )); - } - next_hop.addr // Use unicast address for routing - } - oxide_vpc::api::Replication::Underlay - | oxide_vpc::api::Replication::All => { - // Underlay/All replication: The packet already has the correct - // multicast underlay address from OPTE's M2P mapping. - // Do NOT override it - just get it for route lookup - let xde = get_xde_state(); - match xde - .vpc_map - .get_mcast_underlay(ctx.vni, ctx.inner_dst) - { - Some(mcast_ul) => mcast_ul.addr(), // Use multicast address for routing - None => { - // No M2P mapping - skip this destination - continue; - } - } - } - _ => { - // Reserved or unknown replication type - skip - continue; - } - }; + xde.stats.vals.mcast_tx_no_fwd_entry().incr(1); + } - // VNI is at offset 4 in Geneve header (3 bytes) - if let Some(vni_bytes) = - fwd_pkt.get_mut(geneve_offset + 4..geneve_offset + 7) - { - let vni_be = next_hop.vni.as_u32().to_be_bytes(); - vni_bytes.copy_from_slice(&vni_be[1..4]); // VNI is 24 bits - } - // Update Geneve multicast option to reflect underlay replication to prevent re-relay loops. 
- update_mcast_replication( - &mut fwd_pkt, - geneve_offset, - *replication, + if let Some(next_hops) = cpu_mcast_fwd.get(&ctx.underlay_dst) { + // We found forwarding entries, replicate to each next hop + for (next_hop, replication) in next_hops.iter() { + // Clone packet with headers using pullup + let Ok(mut fwd_pkt) = + ctx.out_pkt.pullup(NonZeroUsize::new(pullup_len)) + else { + opte::engine::dbg!( + "mcast TX next hop pullup failed: requested {} bytes", + pullup_len ); + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_pullup_fail().incr(1); + __dtrace_probe_mcast__tx__pullup__fail(pullup_len as uintptr_t); + continue; // Skip this destination on allocation failure + }; - // Route lookup for next hop to get outer MAC addresses - // Use the actual_outer_dst we determined above - let route_key = RouteKey { - dst: actual_outer_dst, - l4_hash: Some(ctx.l4_hash), - }; - let Route { src: mac_src, dst: mac_dst, underlay_idx } = - src_dev.routes.next_hop(route_key, src_dev); + // Route to next hop unicast address to determine which underlay + // port/MAC to use. Packet destination is the multicast address with + // multicast MAC (RFC 2464). + // + // NextHopV6.addr = unicast switch address (for routing) + // Outer dst IP = ctx.underlay_dst (multicast address from M2P) + // Geneve Replication is a TX-only instruction telling the switch + // which port groups to use. + let routing_dst = next_hop.addr; + let actual_outer_dst = ctx.underlay_dst; + + // VNI is at offset 4 in Geneve header (3 bytes) + if let Some(vni_bytes) = + fwd_pkt.get_mut(geneve_offset + 4..geneve_offset + 7) + { + let vni_be = next_hop.vni.as_u32().to_be_bytes(); + vni_bytes.copy_from_slice(&vni_be[1..4]); // VNI is 24 bits + } + // Update Geneve multicast option to reflect underlay replication to + // prevent re-relay loops. + update_mcast_replication(&mut fwd_pkt, geneve_offset, *replication); + + // Route to switch unicast address to determine which underlay + // port/MAC to use. 
Packet destination is multicast address with + // multicast MAC. + let route_key = + RouteKey { dst: routing_dst, l4_hash: Some(ctx.l4_hash) }; + let Route { src: mac_src, dst: _mac_dst, underlay_idx } = + src_dev.routes.next_hop(route_key, src_dev); + + // Derive destination MAC from IPv6 multicast address per RFC 2464: + // IPv6 multicast MAC = 33:33 + last 4 bytes of IPv6 address + let ipv6_bytes = actual_outer_dst.bytes(); + let dst_mac = EtherAddr::from([ + 0x33, + 0x33, + ipv6_bytes[12], + ipv6_bytes[13], + ipv6_bytes[14], + ipv6_bytes[15], + ]); + + // Fill in outer MAC addresses + let final_pkt = unsafe { + let mblk = fwd_pkt.unwrap_mblk().as_ptr(); + let rptr = (*mblk).b_rptr; + ptr::copy(dst_mac.as_ptr(), rptr, 6); + ptr::copy(mac_src.as_ptr(), rptr.add(6), 6); + + MsgBlk::wrap_mblk(mblk).unwrap() + }; + + // Replication is a TX-only instruction telling the switch which + // port groups to replicate to: + // + // Local same-sled delivery always occurs regardless of this + // TX-only setting. + // + // Note: Packet is sent once to the underlay. The switch reads the + // Geneve Replication field and performs the actual bifurcation. + match replication { + oxide_vpc::api::Replication::Underlay => { + // DTrace probe: underlay forwarding + // Report on-wire multicast group as GROUP (underlay), + // and configured next-hop leaf address as NEXTHOP. 
+ let outer_ip6 = oxide_vpc::api::Ipv6Addr::from( + actual_outer_dst.bytes(), + ); + let (af, addr_ptr) = + (26usize, &outer_ip6 as *const _ as uintptr_t); + __dtrace_probe_mcast__underlay__fwd( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + &next_hop.addr, + ); - // Fill in outer MAC addresses - let final_pkt = unsafe { - let mblk = fwd_pkt.unwrap_mblk().as_ptr(); - let rptr = (*mblk).b_rptr; - ptr::copy(mac_dst.as_ptr(), rptr, 6); - ptr::copy(mac_src.as_ptr(), rptr.add(6), 6); + // Send to underlay + postbox.post_underlay( + underlay_idx, + TxHint::from_crc32(ctx.l4_hash), + final_pkt, + ); - // Note: The outer IPv6 destination was already set correctly in fwd_pkt - // based on the replication type, and we used the correct address for - // route lookup, so no need to modify it here. + // Increment underlay forwarding stat + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_underlay().incr(1); + } + oxide_vpc::api::Replication::Both => { + // Both mode: packet is sent to switch with "Both" + // replication flag. + // Switch will bifurcate to both underlay and external port + // groups. Fire both DTrace probes and increment both stats + // for observability. 
+ let outer_ip6 = oxide_vpc::api::Ipv6Addr::from( + actual_outer_dst.bytes(), + ); + let (af, addr_ptr) = + (26usize, &outer_ip6 as *const _ as uintptr_t); - MsgBlk::wrap_mblk(mblk).unwrap() - }; + __dtrace_probe_mcast__underlay__fwd( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + &next_hop.addr, + ); + __dtrace_probe_mcast__external__fwd( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + &next_hop.addr, + ); - // DTrace probe: underlay forwarding - let (af, addr_ptr) = match &ctx.inner_dst { - oxide_vpc::api::IpAddr::Ip4(v4) => { - (2usize, v4 as *const _ as uintptr_t) - } - oxide_vpc::api::IpAddr::Ip6(v6) => { - (26usize, v6 as *const _ as uintptr_t) - } - }; - __dtrace_probe_mcast__underlay__fwd( - af, - addr_ptr, - ctx.vni.as_u32() as uintptr_t, - &next_hop.addr, - ); + // Send to underlay (switch does bifurcation) + postbox.post_underlay( + underlay_idx, + TxHint::from_crc32(ctx.l4_hash), + final_pkt, + ); - // Send to underlay - postbox.post_underlay( - underlay_idx, - TxHint::from_crc32(ctx.l4_hash), - final_pkt, - ); + // Increment both stats since both replication paths are active + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_underlay().incr(1); + xde.stats.vals.mcast_tx_external().incr(1); + } + oxide_vpc::api::Replication::External => { + // DTrace probe: external forwarding + // Report on-wire multicast group as GROUP (underlay), + // and configured next-hop leaf address as NEXTHOP. 
+ let outer_ip6 = oxide_vpc::api::Ipv6Addr::from( + actual_outer_dst.bytes(), + ); + let (af, addr_ptr) = + (26usize, &outer_ip6 as *const _ as uintptr_t); + __dtrace_probe_mcast__external__fwd( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + &next_hop.addr, + ); - // Increment underlay forwarding stat - let xde = get_xde_state(); - xde.stats.vals.mcast_tx_underlay().incr(1); + // Increment external forwarding stat + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_external().incr(1); + + // External mode: Unicast Geneve to switch (boundary service) via underlay. + // Switch decaps and replicates to ports set for external multicast traffic + // (egress to external networks, leaving the underlay). + postbox.post_underlay( + underlay_idx, + TxHint::from_crc32(ctx.l4_hash), + final_pkt, + ); + } + _ => { + // Reserved: should not reach here + } } - - // Release locks - drop(mcast_fwd); - drop(mgmt); } } } /// Handle multicast packet reception from the underlay. /// -/// This function processes incoming multicast packets and: -/// - Delivers to external/customer members in the same VNI (local listeners) -/// - Optionally forwards to underlay/infrastructure members (if acting as relay) +/// OPTE is always a leaf node in the multicast replication tree. +/// This function only delivers packets to local subscribers. /// -/// Unlike Tx path which originates from a port, Rx path receives from underlay -/// and needs to determine all appropriate destinations. +/// The Replication type is TX-only (instructions to the switch), so the +/// replication field is ignored on RX. Local delivery is based purely on +/// subscriptions. 
fn handle_mcast_rx( ctx: MulticastRxContext, stream: &DlsStream, @@ -2384,125 +2592,35 @@ fn handle_mcast_rx( (26usize, v6 as *const _ as uintptr_t) } }; - __dtrace_probe_mcast__rx( - af, - addr_ptr, - ctx.vni.as_u32() as uintptr_t, - ctx.incoming_delivery_mode.map(|r| r as uintptr_t).unwrap_or(0), - ); - - // Determine replication strategy from XDE-wide multicast forwarding table - let xde = get_xde_state(); - let mgmt = xde.management_lock.lock(); - let mcast_fwd = mgmt.mcast_fwd.read(); - - // Compute combined replication strategy from all next hops - let has_fwd_entry = mcast_fwd.get(&ctx.inner_dst).is_some(); - let delivery_mode = mcast_fwd - .get(&ctx.inner_dst) - .and_then(compute_replication_strategy) - .unwrap_or(Replication::External); - - // Drop locks before potentially expensive operations - drop(mcast_fwd); - drop(mgmt); - - // If no forwarding entry exists, check for local listeners only - if !has_fwd_entry { - if let Some(ports) = devs.mcast_listeners(&ctx.inner_dst) { - // Deliver to local listeners in the same VNI only - for el in ports { - // Filter by VNI - only deliver to listeners in the incoming packet's VNI - if el.vni() != ctx.vni { - continue; - } + __dtrace_probe_mcast__rx(af, addr_ptr, ctx.vni.as_u32() as uintptr_t); - let Ok(my_pkt) = - ctx.pkt.pullup(NonZeroUsize::new(ctx.pullup_len)) - else { - opte::engine::dbg!( - "mcast RX external pullup failed: requested {} bytes", - ctx.pullup_len - ); - let xde = get_xde_state(); - xde.stats.vals.mcast_rx_pullup_fail().incr(1); - continue; - }; - match devs.get_by_key(*el) { - Some(dev) => { - // DTrace probe: RX local delivery - let (af, addr_ptr) = match &ctx.inner_dst { - oxide_vpc::api::IpAddr::Ip4(v4) => ( - 2usize, - AsRef::<[u8]>::as_ref(v4).as_ptr() as uintptr_t, - ), - oxide_vpc::api::IpAddr::Ip6(v6) => ( - 26usize, - AsRef::<[u8]>::as_ref(v6).as_ptr() as uintptr_t, - ), - }; - __dtrace_probe_mcast__local__delivery( - af, - addr_ptr, - ctx.vni.as_u32() as uintptr_t, - 
dev.port.name_cstr().as_ptr() as uintptr_t, - ); - xde_rx_one_direct(stream, dev, *el, my_pkt, postbox); - let xde = get_xde_state(); - xde.stats.vals.mcast_rx_external().incr(1); - } - None => { - let xde = get_xde_state(); - xde.stats.vals.mcast_rx_stale_external().incr(1); - } - } - } - } else { - // No forwarding entry and no local listeners - let xde = get_xde_state(); - xde.stats.vals.mcast_rx_no_fwd_entry().incr(1); - } - return; - } - - // External/customer delivery if delivery mode is External or All. - // - // Loop Prevention: If the incoming packet has Underlay or All replication set, - // it means this packet has already been relayed by another host and we should - // NOT deliver it locally. This prevents: - // - Duplicate delivery to local listeners - // - Infinite forwarding loops in the underlay network - let do_external = matches!( - delivery_mode, - oxide_vpc::api::Replication::External - | oxide_vpc::api::Replication::All - ) && !matches!( - ctx.incoming_delivery_mode, - Some(oxide_vpc::api::Replication::Underlay) - | Some(oxide_vpc::api::Replication::All) - ); + // Subscription is keyed by underlay (outer) IPv6 multicast address. + // This uniquely identifies the multicast group across the fleet. + let group_key = { + let ip6 = oxide_vpc::api::Ipv6Addr::from(ctx.underlay_dst.bytes()); + oxide_vpc::api::IpAddr::from(ip6) + }; - if do_external && let Some(ports) = devs.mcast_listeners(&ctx.inner_dst) { - // Deliver to local listeners in the same VNI only + // Deliver to all local subscribers. VNI validation and VPC isolation + // are handled by OPTE's inbound overlay layer. 
+ if let Some(ports) = devs.mcast_listeners(&group_key) { for el in ports { - // Filter by VNI - only deliver to listeners in the incoming packet's VNI - if el.vni() != ctx.vni { - continue; - } - let Ok(my_pkt) = ctx.pkt.pullup(NonZeroUsize::new(ctx.pullup_len)) else { opte::engine::dbg!( - "mcast RX external pullup failed: requested {} bytes", + "mcast RX pullup failed: requested {} bytes", ctx.pullup_len ); let xde = get_xde_state(); xde.stats.vals.mcast_rx_pullup_fail().incr(1); + __dtrace_probe_mcast__rx__pullup__fail( + ctx.pullup_len as uintptr_t, + ); continue; }; match devs.get_by_key(*el) { Some(dev) => { - // DTrace probe: RX local delivery (with forwarding entry) + // DTrace probe: RX local delivery let (af, addr_ptr) = match &ctx.inner_dst { oxide_vpc::api::IpAddr::Ip4(v4) => { (2usize, v4 as *const _ as uintptr_t) @@ -2519,128 +2637,24 @@ fn handle_mcast_rx( ); xde_rx_one_direct(stream, dev, *el, my_pkt, postbox); let xde = get_xde_state(); - xde.stats.vals.mcast_rx_external().incr(1); + xde.stats.vals.mcast_rx_local().incr(1); } None => { let xde = get_xde_state(); - xde.stats.vals.mcast_rx_stale_external().incr(1); + xde.stats.vals.mcast_rx_stale_local().incr(1); } } } - } - - // Underlay/infrastructure forwarding if delivery mode is Underlay or All - // For Rx path, this would mean we're acting as a multicast relay/router - // - // Loop prevention: Don't relay if incoming packet already has Underlay or All - // replication set in its Geneve option, as this indicates it has already been - // relayed by another host. 
- let should_relay = matches!( - delivery_mode, - oxide_vpc::api::Replication::Underlay - | oxide_vpc::api::Replication::All - ) && !matches!( - ctx.incoming_delivery_mode, - Some(oxide_vpc::api::Replication::Underlay) - | Some(oxide_vpc::api::Replication::All) - ); - - if should_relay { - // Re-acquire locks for underlay forwarding + } else { + // No subscription entry found for this multicast group + let underlay_ip6 = + oxide_vpc::api::Ipv6Addr::from(ctx.underlay_dst.bytes()); + __dtrace_probe_mcast__no__fwd__entry( + &underlay_ip6, + ctx.vni.as_u32() as uintptr_t, + ); let xde = get_xde_state(); - let mgmt = xde.management_lock.lock(); - let mcast_fwd = mgmt.mcast_fwd.read(); - - if let Some(next_hops) = mcast_fwd.get(&ctx.inner_dst) { - // Get routing info from any local device (all share same underlay) - let routing_dev = devs.iter().next(); - - for (next_hop, repl) in next_hops.iter() { - // Only forward to underlay destinations - if !matches!( - repl, - oxide_vpc::api::Replication::Underlay - | oxide_vpc::api::Replication::All - ) { - continue; - } - - // Clone the packet for this destination - let Ok(mut fwd_pkt) = - ctx.pkt.pullup(NonZeroUsize::new(ctx.pullup_len)) - else { - opte::engine::dbg!( - "mcast RX underlay relay pullup failed: requested {} bytes", - ctx.pullup_len - ); - let xde = get_xde_state(); - xde.stats.vals.mcast_rx_pullup_fail().incr(1); - continue; - }; - - // NOTE: For multicast underlay relaying, we do NOT modify the outer - // IPv6 destination. It's already set to the multicast underlay address - // (e.g., ff04::...224.1.2.4) by OPTE's encapsulation layer. - // The next_hop.addr is only used for routing/MAC lookup, which returns - // MAC addresses without modifying the packet. 
- - // Modify VNI in Geneve header to next_hop.vni - // Use the Geneve offset calculated from parsed headers to handle VLANs and IPv6 extensions - let geneve_offset = ctx.geneve_offset; - - // VNI is at offset 4 in Geneve header (3 bytes) - if let Some(vni_bytes) = - fwd_pkt.get_mut(geneve_offset + 4..geneve_offset + 7) - { - let vni_be = next_hop.vni.as_u32().to_be_bytes(); - vni_bytes.copy_from_slice(&vni_be[1..4]); // VNI is 24 bits - } - // Mark multicast replication as Underlay/All to avoid re-relay by downstream receivers. - update_mcast_replication(&mut fwd_pkt, geneve_offset, *repl); - - // Compute hash once for both routing and flow distribution - let l4_hash = { - use core::hash::Hash; - let mut hasher = crc32fast::Hasher::new(); - next_hop.addr.hash(&mut hasher); - hasher.finalize() - }; - - // Get routing information if we have a device - let (mac_src, mac_dst) = if let Some(dev) = routing_dev { - let route_key = - RouteKey { dst: next_hop.addr, l4_hash: Some(l4_hash) }; - let Route { src, dst, .. 
} = - dev.routes.next_hop(route_key, dev); - (src, dst) - } else { - // No devices available for routing - use zero MACs - use opte::engine::ether::EtherAddr; - (EtherAddr::zero(), EtherAddr::zero()) - }; - - // Fill in outer MAC addresses - let final_pkt = unsafe { - let mblk = fwd_pkt.unwrap_mblk().as_ptr(); - let rptr = (*mblk).b_rptr; - ptr::copy(mac_dst.as_ptr(), rptr, 6); - ptr::copy(mac_src.as_ptr(), rptr.add(6), 6); - MsgBlk::wrap_mblk(mblk).unwrap() - }; - - // Send to underlay via stream (same underlay we received from) - stream.tx_drop_on_no_desc( - final_pkt, - TxHint::from_crc32(l4_hash), - MacTxFlags::empty(), - ); - - xde.stats.vals.mcast_rx_underlay().incr(1); - } - } - - drop(mcast_fwd); - drop(mgmt); + xde.stats.vals.mcast_rx_no_subscribers().incr(1); } } @@ -2680,34 +2694,39 @@ unsafe extern "C" fn xde_mc_tx( let mut hairpin_chain = MsgBlkChain::empty(); let mut tx_postbox = TxPostbox::new(); - // We don't need to read-lock the port map unless we have local - // delivery to perform. - // - // TODO: really think this one through. This might expose us to the - // risk of double read-locking at the same time as the tokenlock - // wants to make some globally mutable operation happen. - // - // Maybe we should clone out the `DevMap` at this instant. - let mut entry_state = None; + // Clone per-CPU mcast forwarding table Arc and drop lock immediately. + // This makes the reader lock-free and avoids blocking management refreshes. + let cpu_index = current_cpu().seq_id; + let cpu_entry = &src_dev.u1.stream.ports_map[cpu_index]; + let mcast_fwd = clone_from_rwlock(&cpu_entry.mcast_fwd); + + // Lazily clone per-port DevMap Arc for hairpin/local delivery. + // Cloning the Arc (not holding a read guard) eliminates re-entrant + // read deadlock risk and avoids blocking management operations. 
+ let mut cached_devmap: Option<Arc<DevMap>> = None; while let Some(pkt) = chain.pop_front() { xde_mc_tx_one( src_dev, pkt, &mut tx_postbox, - &mut entry_state, + cpu_entry, + &mut cached_devmap, + &mcast_fwd, &mut hairpin_chain, ); } let (local_pkts, [u1_pkts, u2_pkts]) = tx_postbox.deconstruct(); - if let Some(entry_state) = entry_state { - entry_state.deliver_all(local_pkts); + // Local same-sled delivery (via mac_rx to guest ports) is safe. + // Lazily clone DevMap if we have anything to deliver. + if !local_pkts.is_empty() { + let devs = cached_devmap + .get_or_insert_with(|| clone_from_rwlock(&src_dev.port_map)); + devs.deliver_all(local_pkts); } - // `entry_state` has been moved, making it safe to deliver hairpin - // packets (which may cause us to re-enter XDE in the same stack). // All deliver/tx calls will NO-OP if the sent chain is empty. src_dev.deliver(hairpin_chain); @@ -2731,7 +2750,9 @@ fn xde_mc_tx_one<'a>( src_dev: &'a XdeDev, mut pkt: MsgBlk, postbox: &mut TxPostbox, - entry_state: &mut Option>>, + cpu_entry: &'a PerEntryState, + cached_devmap: &mut Option<Arc<DevMap>>, + mcast_fwd: &'a Arc, hairpin_chain: &mut MsgBlkChain, ) { let parser = src_dev.port.network().parser(); @@ -2760,9 +2781,6 @@ fn xde_mc_tx_one<'a>( let meta = parsed_pkt.meta(); // Extract inner destination IP for potential multicast processing - use opte::engine::ip::ValidL3; - use opte::engine::ip::v4::Ipv4Ref; - use opte::engine::ip::v6::Ipv6Ref; let inner_dst_ip = match &meta.inner_l3 { Some(ValidL3::Ipv4(v4)) => { Some(oxide_vpc::api::IpAddr::from(v4.destination())) } @@ -2803,14 +2821,10 @@ fn xde_mc_tx_one<'a>( return; } - // Multicast packets go through normal port.process() which will use M2P - // for encapsulation. After that, we intercept them for unicast replication. - let port = &src_dev.port; // The port processing code will fire a probe that describes what - // action was taken -- there should be no need to add probes or - // prints here. + // action was taken.
let res = port.process(Direction::Out, parsed_pkt); match res { @@ -2861,15 +2875,16 @@ fn xde_mc_tx_one<'a>( let new_len = out_pkt.byte_len(); if ip6_src == ip6_dst { - let entry_state = - entry_state.get_or_insert_with(|| src_dev.port_map.read()); - + // Hairpin loopback: same-host delivery let key = VniMac::new(vni, ether_dst); - if let Some(dest_dev) = entry_state.get_by_key(key) { + let devs = cached_devmap.get_or_insert_with(|| { + clone_from_rwlock(&src_dev.port_map) + }); + if let Some(dst_dev) = devs.get_by_key(key) { // We have found a matching Port on this host; "loop back" // the packet into the inbound processing path of the // destination Port. - guest_loopback(src_dev, dest_dev, key, out_pkt, postbox); + guest_loopback(src_dev, dst_dev, key, out_pkt, postbox); } else { opte::engine::dbg!( "underlay dest is same as src but the Port was not found \ @@ -2889,19 +2904,24 @@ fn xde_mc_tx_one<'a>( return; }; - // For a multicast outbound frame, deliver to external/customer members - // (local guest instances) and/or underlay/infrastructure members - // based on the replication configuration. - // Check if this is a multicast packet by examining the outer IPv6 destination - // For multicast, OPTE should have set it to an ff0x:: address + // Multicast interception: All packets (unicast and multicast) go + // through normal `port.process()` which applies router/firewall + // rules and uses M2P for multicast encapsulation. Here, we + // intercept multicast packets for replication to multiple next-hops + // and local delivery to subscribers. + // + // Check if this is a multicast packet by examining the outer IPv6 + // destination. For multicast, OPTE should have set it to an + // ff0x:: address (via M2P table). 
let is_mcast_packet = ip6_dst.is_multicast(); if is_mcast_packet { - // This is a multicast packet - determine the inner destination - // from the packet contents or use a fallback + // This is a multicast packet, so we determine the inner + // destination from the packet contents or use a fallback let inner_dst = inner_dst_ip.unwrap_or_else(|| { // Fallback: derive from outer IPv6 multicast address - // For IPv4 multicast mapped to IPv6, the last 4 bytes contain the IPv4 address + // For IPv4 multicast mapped to IPv6, the last 4 bytes + // contain the IPv4 address if ip6_dst.bytes()[0] == 0xff && ip6_dst.bytes()[1] == 0x04 { // Admin-scoped IPv6 multicast, likely mapped from IPv4 @@ -2917,9 +2937,24 @@ fn xde_mc_tx_one<'a>( } }); + // Lazily obtain per-port DevMap for local delivery. + // Use fast-path check to avoid locking when no local subscribers exist. + let devs = if cached_devmap.is_none() + && !cpu_entry.has_mcast_subscribers.load(Ordering::Relaxed) + { + // Fast path: no subscribers, skip `DevMap` entirely + None + } else { + // Either we already have `DevMap`, or we need to get it + Some(cached_devmap.get_or_insert_with(|| { + clone_from_rwlock(&src_dev.port_map) + })) + }; + handle_mcast_tx( MulticastTxContext { inner_dst, + underlay_dst: ip6_dst, vni, out_pkt: &out_pkt, encap_len, @@ -2930,7 +2965,8 @@ fn xde_mc_tx_one<'a>( }, src_dev, postbox, - entry_state, + devs.as_deref().map(|v| &**v), + mcast_fwd, ); return; } @@ -3034,10 +3070,10 @@ fn xde_mc_tx_one<'a>( Ok(ProcessResult::Drop { .. }) => {} Ok(ProcessResult::Hairpin(hpkt)) => { - // From the theory statement, if we have a packet chain - // from above which contains a mixture of hairpin and local - // deliveries (`guest_loopback`) we can only deliver hairpin - // packets once `entry_state` is explicitly dropped. + // Hairpin packets are queued for later delivery. 
If we have a + packet chain containing both hairpin and local deliveries + (via `guest_loopback`), we defer hairpin delivery until after + local delivery completes to avoid potential re-entrancy issues. hairpin_chain.append(hpkt); } @@ -3182,7 +3218,8 @@ fn new_port( name: String, cfg: &VpcCfg, vpc_map: Arc, - vni_state: Arc, + m2p: Arc, + v2p: Arc, v2b: Arc, ectx: Arc, dhcp_cfg: &DhcpCfg, @@ -3205,7 +3242,7 @@ fn new_port( gateway::setup(&pb, &cfg, vpc_map.clone(), FT_LIMIT_ONE, dhcp_cfg)?; router::setup(&pb, &cfg, FT_LIMIT_ONE)?; nat::setup(&mut pb, &cfg, nat_ft_limit)?; - overlay::setup(&pb, &cfg, vni_state, vpc_map, v2b, FT_LIMIT_ONE)?; + overlay::setup(&pb, &cfg, v2p, m2p, v2b, FT_LIMIT_ONE)?; // Set the overall unified flow and TCP flow table limits based on the total // configuration above, by taking the maximum of size of the individual @@ -3217,7 +3254,6 @@ fn new_port( let limit = NonZeroU32::new(FW_FT_LIMIT.get().max(nat_ft_limit.get())).unwrap(); let net = VpcNetwork { cfg }; - #[allow(clippy::arc_with_non_send_sync)] let port = Arc::new(pb.create(net, limit, limit)?); Ok(port) } @@ -3259,17 +3295,20 @@ unsafe extern "C" fn xde_rx( let mut count = 0; let mut len = 0; - // Acquire our own dev map -- this gives us access to prebuilt postboxes - // for all active ports. We don't worry about this changing for rx -- caller - // threads here (interrupt contexts, poll threads, fanout, worker threads) - // are all bound to a given CPU each by MAC. + // Clone per-CPU DevMap Arc and drop lock immediately. + // This makes RX readers lock-free and avoids blocking management refreshes. + // + // Safety: `devmap` holds `Arc<DevMap>`, which contains `Arc<XdeDev>` entries. + // This reference chain keeps all ports in this snapshot alive throughout + // packet processing, ensuring `deliver_all()` operates on live `XdeDev` + // instances even if ports are concurrently removed from the canonical map.
let cpu_index = current_cpu().seq_id; - let cpu_state = stream.ports_map[cpu_index].devs.lock(); + let devmap = clone_from_mutex(&stream.ports_map[cpu_index].devs); let mut postbox = Postbox::new(); while let Some(pkt) = chain.pop_front() { if let Some(pkt) = - xde_rx_one(&stream.stream, pkt, &cpu_state, &mut postbox) + xde_rx_one(&stream.stream, pkt, &devmap, &mut postbox) { count += 1; len += pkt.byte_len(); @@ -3277,7 +3316,7 @@ unsafe extern "C" fn xde_rx( } } - cpu_state.deliver_all(postbox); + devmap.deliver_all(postbox); let (head, tail) = out_chain .unwrap_head_and_tail() @@ -3350,6 +3389,11 @@ fn xde_rx_one( let ip6_dst = meta.outer_v6.destination(); if ip6_dst.is_multicast() { + // Fast path: if no multicast subscribers exist, drop immediately + if !devs.has_mcast_subscribers() { + return None; + } + let pullup_len = ( &meta.outer_eth, &meta.outer_v6, @@ -3366,16 +3410,27 @@ fn xde_rx_one( ); let vni = meta.outer_encap.vni(); + // Validate Geneve options per RFC 8926 + if let Err(e) = + oxide_vpc::engine::geneve::validate_options(&meta.outer_encap) + { + stat_parse_error(Direction::In, &e); + opte::engine::dbg!( + "Invalid Geneve options in multicast packet: {:?}", + e + ); + bad_packet_parse_probe(None, Direction::In, mblk_addr, &e); + return Some(pkt); + } + // Extract inner destination IP for multicast processing - use opte::engine::ip::ValidL3; - use opte::engine::ip::v4::Ipv4Ref; - use opte::engine::ip::v6::Ipv6Ref; let inner_dst = match &meta.inner_l3 { ValidL3::Ipv4(v4) => oxide_vpc::api::IpAddr::from(v4.destination()), ValidL3::Ipv6(v6) => oxide_vpc::api::IpAddr::from(v6.destination()), }; // Extract multicast delivery mode from Geneve options + // (Safe to be lenient for non-critical parse errors after validation above) let incoming_delivery_mode = oxide_vpc::engine::geneve::extract_multicast_replication( &meta.outer_encap, @@ -3389,15 +3444,17 @@ fn xde_rx_one( // Drop the parsed packet before calling handle_mcast_rx drop(parsed_pkt); - // 
Handle multicast packets using the XDE-wide forwarding table + // Handle multicast packets, delivering to local subscribers only + // (leaf node) handle_mcast_rx( MulticastRxContext { inner_dst, + underlay_dst: ip6_dst, vni, pkt: &pkt, pullup_len, - geneve_offset, - incoming_delivery_mode, + _geneve_offset: geneve_offset, + _incoming_delivery_mode: incoming_delivery_mode, }, stream, devs, @@ -3713,35 +3770,36 @@ fn dump_v2p_hdlr() -> Result { fn set_m2p_hdlr(env: &mut IoctlEnvelope) -> Result { let req: SetMcast2PhysReq = env.copy_in_req()?; - // Validate VNI is DEFAULT_MULTICAST_VNI for fleet-level multicast - if req.vni.as_u32() != DEFAULT_MULTICAST_VNI { - return Err(OpteError::System { - // Propagate an actionable errno so userspace sees an error - errno: EINVAL, - msg: format!( - "multicast VNI must be DEFAULT_MULTICAST_VNI ({}), got: {}", - DEFAULT_MULTICAST_VNI, - req.vni.as_u32() - ), - }); - } - - // Validate underlay multicast address is admin-scoped IPv6 (ff04, ff05, or ff08) - // Per Omicron constraints: underlay must be admin-scoped for rack-internal routing - let first_byte = req.underlay.bytes()[0]; - let second_byte = req.underlay.bytes()[1]; - // Check if it's multicast (ff00::/8) and admin-scoped (ff04, ff05, ff08) - if first_byte != 0xff - || (second_byte != 0x04 && second_byte != 0x05 && second_byte != 0x08) - { + // Validate underlay multicast address is admin-local IPv6 (ff04::/16 only) + // Per Omicron constraints: underlay must be admin-local for rack-internal routing + if !req.underlay.is_admin_scoped_multicast() { return Err(OpteError::InvalidUnderlayMulticast(format!( - "underlay multicast address must be admin-scoped IPv6 (ff04::/16, ff05::/16, or ff08::/16), got: {}", + "underlay multicast address must be admin-local IPv6 (ff04::/16), got: {}", req.underlay ))); } + // All multicast uses fleet-wide DEFAULT_MULTICAST_VNI (77) + let vni = Vni::new(DEFAULT_MULTICAST_VNI).unwrap(); let state = get_xde_state(); - 
state.vpc_map.add_mcast(req.group, req.underlay, req.vni)?; + // Underlay address validated above as admin-local (ff04::/16) + state.m2p.set(req.group, overlay::MulticastUnderlay(req.underlay)); + + // DTrace: multicast map set + let (af, group_ptr): (usize, uintptr_t) = match req.group { + oxide_vpc::api::IpAddr::Ip4(v4) => { + (2usize, AsRef::<[u8]>::as_ref(&v4).as_ptr() as uintptr_t) + } + oxide_vpc::api::IpAddr::Ip6(v6) => { + (26usize, AsRef::<[u8]>::as_ref(&v6).as_ptr() as uintptr_t) + } + }; + __dtrace_probe_mcast__map__set( + af as uintptr_t, + group_ptr, + &req.underlay, + vni.as_u32() as uintptr_t, + ); Ok(NoResp::default()) } @@ -3749,20 +3807,26 @@ fn set_m2p_hdlr(env: &mut IoctlEnvelope) -> Result { fn clear_m2p_hdlr(env: &mut IoctlEnvelope) -> Result { let req: ClearMcast2PhysReq = env.copy_in_req()?; - // Validate VNI is DEFAULT_MULTICAST_VNI (77) for fleet-level multicast - if req.vni.as_u32() != DEFAULT_MULTICAST_VNI { - return Err(OpteError::System { - errno: EINVAL, - msg: format!( - "multicast VNI must be DEFAULT_MULTICAST_VNI ({}), got: {}", - DEFAULT_MULTICAST_VNI, - req.vni.as_u32() - ), - }); - } - + // All multicast uses fleet-wide DEFAULT_MULTICAST_VNI (77) + let vni = Vni::new(DEFAULT_MULTICAST_VNI).unwrap(); let state = get_xde_state(); - state.vpc_map.del_mcast(req.group, req.underlay, req.vni); + state.m2p.remove(&req.group); + + // DTrace: multicast map clear + let (af, group_ptr): (usize, uintptr_t) = match req.group { + oxide_vpc::api::IpAddr::Ip4(v4) => { + (2usize, AsRef::<[u8]>::as_ref(&v4).as_ptr() as uintptr_t) + } + oxide_vpc::api::IpAddr::Ip6(v6) => { + (26usize, AsRef::<[u8]>::as_ref(&v6).as_ptr() as uintptr_t) + } + }; + __dtrace_probe_mcast__map__clear( + af as uintptr_t, + group_ptr, + &req.underlay, + vni.as_u32() as uintptr_t, + ); Ok(NoResp::default()) } @@ -3795,31 +3859,76 @@ fn set_mcast_forwarding_hdlr( let req: SetMcastForwardingReq = env.copy_in_req()?; let state = get_xde_state(); - // Fleet-level multicast: 
enforce DEFAULT_MULTICAST_VNI for any next hop - // that will result in underlay forwarding (Underlay/All). - for (nh, rep) in &req.next_hops { - if matches!(rep, Replication::Underlay | Replication::All) - && nh.vni.as_u32() != DEFAULT_MULTICAST_VNI - { + // Validate underlay address is admin-local IPv6 multicast (ff04::/16 only) + if !req.underlay.is_admin_scoped_multicast() { + return Err(OpteError::InvalidUnderlayMulticast(format!( + "underlay multicast address must be admin-local IPv6 (ff04::/16), got: {}", + req.underlay + ))); + } + + // Fleet-level multicast: enforce DEFAULT_MULTICAST_VNI for all replication modes. + // NextHopV6.addr must be unicast (switch address for routing). + // The packet will be sent to the multicast address (req.underlay). + for (nh, _rep) in &req.next_hops { + if nh.vni.as_u32() != DEFAULT_MULTICAST_VNI { return Err(OpteError::System { errno: EINVAL, msg: format!( - "multicast next-hop VNI must be DEFAULT_MULTICAST_VNI ({}), got: {}", - DEFAULT_MULTICAST_VNI, + "multicast next-hop VNI must be DEFAULT_MULTICAST_VNI ({DEFAULT_MULTICAST_VNI}), got: {}", nh.vni.as_u32() ), }); } + + // NextHopV6.addr must be unicast (the switch endpoint for routing). + // The actual packet destination is the multicast address (req.underlay). 
+ if nh.addr.is_multicast() { + return Err(OpteError::System { + errno: EINVAL, + msg: format!( + "NextHopV6.addr must be unicast (switch address), got multicast: {}", + nh.addr + ), + }); + } } + // Record next-hop count and copy underlay before consuming the vector + let next_hop_count = req.next_hops.len(); + let underlay = req.underlay; + let token = state.management_lock.lock(); - let mut mcast_fwd = token.mcast_fwd.write(); + { + let mut mcast_fwd = token.mcast_fwd.write(); - // Convert Vec into BTreeMap - let next_hop_map: BTreeMap = - req.next_hops.into_iter().collect(); + // Get or create the next-hop map for this underlay address + let next_hop_map = + mcast_fwd.entry(underlay).or_insert_with(BTreeMap::new); - mcast_fwd.insert(req.group, next_hop_map); + // Insert/update next-hops: same next-hop addr → replace replication mode, + // different next-hop addr → add new entry (like swadm route add) + for (nh, rep) in req.next_hops { + next_hop_map.insert(nh, rep); + } + + drop(mcast_fwd); + } + + // Refresh cached copies in all ports and underlay devices + { + let devs = token.devs.write(); + if let Some(underlay) = token.underlay.as_ref() { + refresh_maps(devs, underlay, &token.mcast_fwd); + } + } + + // DTrace: forwarding set + __dtrace_probe_mcast__fwd__set( + &underlay, + next_hop_count as uintptr_t, + DEFAULT_MULTICAST_VNI as uintptr_t, + ); Ok(NoResp::default()) } @@ -3832,9 +3941,25 @@ fn clear_mcast_forwarding_hdlr( let state = get_xde_state(); let token = state.management_lock.lock(); - let mut mcast_fwd = token.mcast_fwd.write(); + { + let mut mcast_fwd = token.mcast_fwd.write(); + mcast_fwd.remove(&req.underlay); + drop(mcast_fwd); + } + + // Refresh cached copies in all ports and underlay devices + { + let devs = token.devs.write(); + if let Some(underlay) = token.underlay.as_ref() { + refresh_maps(devs, underlay, &token.mcast_fwd); + } + } - mcast_fwd.remove(&req.group); + // DTrace: forwarding clear + __dtrace_probe_mcast__fwd__clear( + 
&req.underlay, + DEFAULT_MULTICAST_VNI as uintptr_t, + ); Ok(NoResp::default()) } @@ -3848,8 +3973,8 @@ fn dump_mcast_forwarding_hdlr() -> Result { let entries: Vec = mcast_fwd .iter() - .map(|(group, next_hops)| McastForwardingEntry { - group: *group, + .map(|(underlay, next_hops)| McastForwardingEntry { + underlay: *underlay, next_hops: next_hops.iter().map(|(nh, rep)| (*nh, *rep)).collect(), }) .collect(); @@ -3857,6 +3982,23 @@ fn dump_mcast_forwarding_hdlr() -> Result { Ok(DumpMcastForwardingResp { entries }) } +fn dump_mcast_subscriptions_hdlr() +-> Result { + let state = get_xde_state(); + let token = state.management_lock.lock(); + let devs = token.devs.read(); + + let mut entries: alloc::vec::Vec = + alloc::vec::Vec::new(); + for (group, ports) in devs.dump_mcast_subscriptions().into_iter() { + if let opte::api::IpAddr::Ip6(underlay) = group { + entries.push(McastSubscriptionEntry { underlay, ports }); + } + } + + Ok(DumpMcastSubscriptionsResp { entries }) +} + #[unsafe(no_mangle)] fn mcast_subscribe_hdlr(env: &mut IoctlEnvelope) -> Result { let req: McastSubscribeReq = env.copy_in_req()?; @@ -3866,13 +4008,72 @@ fn mcast_subscribe_hdlr(env: &mut IoctlEnvelope) -> Result { let token = state.management_lock.lock(); { let mut devs = token.devs.write(); - devs.mcast_subscribe(&req.port_name, req.group)?; + // Subscriptions are keyed on the underlay (outer) IPv6 multicast address. + // If the caller supplied an overlay group, translate it via the M2P table. + // First, reject non-multicast inputs to preserve DevMap error semantics. + if !req.group.is_multicast() { + return Err(OpteError::BadState(format!( + "IP address {} is not a multicast address", + req.group + ))); + } + let group_key = match req.group { + oxide_vpc::api::IpAddr::Ip6(ip6) => { + // If an overlay->underlay mapping exists, use it; otherwise, if the + // provided address is already an admin-scoped multicast (ff04::/16), + // accept it as-is. Otherwise, reject. 
+ if let Some(mu) = + state.m2p.get(&oxide_vpc::api::IpAddr::Ip6(ip6)) + { + oxide_vpc::api::IpAddr::Ip6(mu.0) + } else if ip6.is_admin_scoped_multicast() { + oxide_vpc::api::IpAddr::Ip6(ip6) + } else { + return Err(OpteError::BadState( + "no underlay mapping for IPv6 multicast group".into(), + )); + } + } + oxide_vpc::api::IpAddr::Ip4(_v4) => { + // IPv4 overlay groups must have an M2P mapping; the subscription key + // is the underlay IPv6 multicast. Without a mapping, reject with + // a clear message (callers may rely on this distinction). + if let Some(mu) = state.m2p.get(&req.group) { + oxide_vpc::api::IpAddr::Ip6(mu.0) + } else { + return Err(OpteError::BadState( + "no underlay mapping for IPv4 multicast group".into(), + )); + } + } + }; + + devs.mcast_subscribe(&req.port_name, group_key)?; + + // DTrace: subscribe + let (af, group_ptr): (usize, uintptr_t) = match group_key { + oxide_vpc::api::IpAddr::Ip4(v4) => { + (2usize, AsRef::<[u8]>::as_ref(&v4).as_ptr() as uintptr_t) + } + oxide_vpc::api::IpAddr::Ip6(v6) => { + (26usize, AsRef::<[u8]>::as_ref(&v6).as_ptr() as uintptr_t) + } + }; + if let Ok(port_cstr) = CString::new(req.port_name.clone()) { + __dtrace_probe_mcast__subscribe( + port_cstr.as_ptr() as uintptr_t, + af as uintptr_t, + group_ptr, + DEFAULT_MULTICAST_VNI as uintptr_t, + ); + } refresh_maps( devs, token .underlay .as_ref() .expect("underlay must exist while ports exist"), + &token.mcast_fwd, ); } @@ -3890,13 +4091,102 @@ fn mcast_unsubscribe_hdlr( let token = state.management_lock.lock(); { let mut devs = token.devs.write(); - devs.mcast_unsubscribe(&req.port_name, req.group)?; + + // Verify the port exists, maintaining consistency with other operations + // and ensures we're not silently accepting operations on non-existent + // ports. This check happens before M2P translation to provide clear + // error semantics. 
+ if devs.get_by_name(&req.port_name).is_none() { + return Err(OpteError::PortNotFound(req.port_name.clone())); + } + + // Reject non-multicast input to preserve API use and match subscribe + // semantics. + if !req.group.is_multicast() { + return Err(OpteError::BadState(format!( + "IP address {} is not a multicast address", + req.group + ))); + } + + // Translate overlay group to underlay IPv6 if M2P mapping exists. + // For unsubscribe, if no M2P mapping exists, we return success (no-op). + // This makes unsubscribe idempotent and handles cleanup race conditions + // where M2P mappings may be removed before unsubscribe is called. + let group_key = + match req.group { + oxide_vpc::api::IpAddr::Ip6(ip6) => { + if let Some(mu) = + state.m2p.get(&oxide_vpc::api::IpAddr::Ip6(ip6)) + { + oxide_vpc::api::IpAddr::Ip6(mu.0) + } else { + // For IPv6 without M2P mapping, we can't determine the + // exact underlay address due to Omicron's XOR folding. + // `External` IPv6 addresses are mapped to different + // underlay IPv6 addresses (both in ff04::/16 but + // different values). Without the mapping, we return + // success. The subscription was either never created + // (because subscribe would have failed without M2P) + // or was already cleaned up when the M2P was removed. + refresh_maps( + devs, + token.underlay.as_ref().expect( + "underlay must exist while ports exist", + ), + &token.mcast_fwd, + ); + return Ok(NoResp::default()); + } + } + oxide_vpc::api::IpAddr::Ip4(_v4) => { + if let Some(mu) = state.m2p.get(&req.group) { + oxide_vpc::api::IpAddr::Ip6(mu.0) + } else { + // For IPv4 without M2P mapping, we can't determine the underlay + // group, but we should still succeed (idempotent cleanup). + // Since subscriptions use underlay IPv6 addresses as keys, + // and we don't know what that would have been, we simply + // return success. 
The subscription was either never created + // (because subscribe would have failed without M2P) or was + // already cleaned up when the M2P was removed. + refresh_maps( + devs, + token.underlay.as_ref().expect( + "underlay must exist while ports exist", + ), + &token.mcast_fwd, + ); + return Ok(NoResp::default()); + } + } + }; + + devs.mcast_unsubscribe(&req.port_name, group_key)?; + // DTrace: unsubscribe + let (af, group_ptr): (usize, uintptr_t) = match group_key { + oxide_vpc::api::IpAddr::Ip4(v4) => { + (2usize, AsRef::<[u8]>::as_ref(&v4).as_ptr() as uintptr_t) + } + oxide_vpc::api::IpAddr::Ip6(v6) => { + (26usize, AsRef::<[u8]>::as_ref(&v6).as_ptr() as uintptr_t) + } + }; + if let Ok(port_cstr) = CString::new(req.port_name.clone()) { + __dtrace_probe_mcast__unsubscribe( + port_cstr.as_ptr() as uintptr_t, + af as uintptr_t, + group_ptr, + DEFAULT_MULTICAST_VNI as uintptr_t, + ); + } refresh_maps( devs, token .underlay .as_ref() .expect("underlay must exist while ports exist"), + &token.mcast_fwd, ); } From c9f795711ebe6b5782c31f63723166e315fa3889 Mon Sep 17 00:00:00 2001 From: Zeeshan Lakhani Date: Mon, 17 Nov 2025 03:50:59 +0000 Subject: [PATCH 7/7] [review] Address comments Updates: - Removes unnecessary gateway outbound rules - Moves VNI validation to `DecapAction` - Refactors similar code around router predicates, removing unnecessary checks, etc - `mcast_fwd` (`KRwLock>`) now lives in `XdeDev` (Tx path) only - Reverted from Arc-cloning pattern to holding read locks: * Tx: Acquire and hold per-port `DevMap` and `mcast_fwd` read locks for duration of packet processing (lazy acquisition on first multicast packet) * Rx: Hold per-CPU `DevMap mutex` for duration of packet processing * `refresh_maps()` acquires write locks to update all per-port and per-CPU snapshots, blocking until no Tx/Rx context holds read locks, ensuring safe port teardown * Removed per-CPU cache clearing in `clear_xde_underlay()` * Removed EBUSY retry logic from test teardown (`Xde::drop`) 
- Use `MulticastUnderlay` newtype across the codebase for mcast underlay address types - Refactored `find_mcast_option_offset()` to use ingot parsing: * Uses `ValidGeneve::parse()`, `OxideOptions::from_raw()` instead of manual byte parsing * Implemented `HeaderLen` for `GeneveOptionParse` to enable `opt.packet_length()` - Use AF_INET and AF_INET6 constants in DTrace probes instead of hardcoded values (2usize, 26usize) - opteadm subscription management commands (mcast-subscribe, mcast-unsubscribe, mcast-unsubscribe-all) with clap integration - Added `McastUnsubscribeAll` API command and ioctl - Documentation updates --- README.adoc | 1 + bin/opteadm/src/bin/opteadm.rs | 88 ++- crates/opte-api/src/cmd.rs | 2 + crates/opte-api/src/ip.rs | 125 +++- dtrace/README.adoc | 2 +- dtrace/opte-mcast-delivery.d | 41 +- lib/opte-ioctl/src/lib.rs | 10 + lib/opte-test-utils/src/lib.rs | 27 +- lib/opte/README.adoc | 12 +- lib/opte/src/api.rs | 6 + lib/opte/src/ddi/sync.rs | 26 - lib/opte/src/engine/geneve.rs | 9 + lib/oxide-vpc/src/api.rs | 51 +- lib/oxide-vpc/src/engine/gateway/mod.rs | 81 +- lib/oxide-vpc/src/engine/geneve.rs | 16 +- lib/oxide-vpc/src/engine/overlay.rs | 39 +- lib/oxide-vpc/src/engine/router.rs | 41 +- lib/oxide-vpc/tests/integration_tests.rs | 8 +- rustfmt.toml | 1 + xde-tests/src/lib.rs | 36 +- xde-tests/tests/multicast_multi_sub.rs | 72 +- xde-tests/tests/multicast_rx.rs | 41 +- xde-tests/tests/multicast_validation.rs | 152 +++- xde/src/dev_map.rs | 67 +- xde/src/stats.rs | 10 +- xde/src/xde.rs | 897 +++++++++-------------- 26 files changed, 987 insertions(+), 874 deletions(-) diff --git a/README.adoc b/README.adoc index 47865d64..c6418e08 100644 --- a/README.adoc +++ b/README.adoc @@ -42,6 +42,7 @@ More detail on our benchmarks can be found in xref:bench/README.adoc[bench/READM * https://rfd.shared.oxide.computer/rfd/0009[RFD 9: Networking Considerations] * https://rfd.shared.oxide.computer/rfd/0021[RFD 21: User Networking API] * 
https://rfd.shared.oxide.computer/rfd/0063[RFD 63: Network Architecture] +* https://rfd.shared.oxide.computer/rfd/488[RFD 488: Multicast] * https://www.microsoft.com/en-us/research/wp-content/uploads/2017/03/vfp-nsdi-2017-final.pdf[Microsoft's VFP] == Directory Index diff --git a/bin/opteadm/src/bin/opteadm.rs b/bin/opteadm/src/bin/opteadm.rs index c8a87727..67cd2dc7 100644 --- a/bin/opteadm/src/bin/opteadm.rs +++ b/bin/opteadm/src/bin/opteadm.rs @@ -16,6 +16,7 @@ use opte::api::Ipv4Addr; use opte::api::Ipv6Addr; use opte::api::MAJOR_VERSION; use opte::api::MacAddr; +use opte::api::MulticastUnderlay; use opte::api::Vni; use opte::print::print_layer; use opte::print::print_list_layers; @@ -41,6 +42,9 @@ use oxide_vpc::api::FirewallRule; use oxide_vpc::api::IpCfg; use oxide_vpc::api::Ipv4Cfg; use oxide_vpc::api::Ipv6Cfg; +use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastUnsubscribeAllReq; +use oxide_vpc::api::McastUnsubscribeReq; use oxide_vpc::api::NextHopV6; use oxide_vpc::api::PhysNet; use oxide_vpc::api::PortInfo; @@ -234,25 +238,25 @@ enum Command { /// Set a multicast forwarding entry /// - /// Adds or updates a next-hop for the specified underlay multicast address. - /// Multiple next-hops can be configured for the same underlay address by + /// Adds or updates a next hop for the specified underlay multicast address. + /// Multiple next hops can be configured for the same underlay address by /// running this command multiple times (like `swadm route add`). If the - /// same next-hop is specified again, its replication mode is updated. + /// same next hop is specified again, its replication mode is updated. /// /// OPTE routes to `next_hop` (unicast switch address) to determine which /// underlay port to use, then sends the packet to underlay (multicast) with - /// multicast MAC. The switch matches the on outer dst IP (multicast) and + /// multicast MAC. The switch matches the outer dst IP (multicast) and /// Geneve replication tag. 
SetMcastFwd { /// The underlay multicast IPv6 address (admin-local scope ff04::/16). /// This is the outer IPv6 destination in transmitted packets. - underlay: Ipv6Addr, + underlay: MulticastUnderlay, /// The unicast IPv6 address of the switch for routing (e.g., fd00::1). /// OPTE uses this to determine which underlay port to use via the - /// illumos routing table. Multiple next-hops can be added by + /// illumos routing table. Multiple next hops can be added by /// running this command multiple times with the same underlay address. next_hop: Ipv6Addr, - /// TX-only replication instruction (tells the switch which port groups to use): + /// Tx-only replication instruction (tells the switch which port groups to use): /// - External: front panel ports (decapped, egress to external networks) /// - Underlay: sled-to-sled ports (underlay multicast replication) /// - Both: both external and underlay (bifurcated) @@ -265,7 +269,7 @@ enum Command { /// Clear a multicast forwarding entry ClearMcastFwd { /// The underlay multicast IPv6 address (admin-local scope ff04::/16) - underlay: Ipv6Addr, + underlay: MulticastUnderlay, }, /// Dump the multicast forwarding table @@ -274,6 +278,51 @@ enum Command { /// Dump multicast subscriptions (group -> ports on this sled) DumpMcastSubs, + /// Subscribe a port to a multicast group + /// + /// Allows a port to receive multicast traffic for the specified group. + /// The group address is an overlay multicast address which is translated + /// to an underlay IPv6 multicast address via the M2P (Multicast-to-Physical) + /// mapping table. + /// + /// Subscriptions are local to this sled and control Rx (receive). For Tx + /// (transmit), configure multicast forwarding via set-mcast-fwd. 
+ McastSubscribe { + /// The OPTE port name (e.g., opte0) + #[arg(short)] + port: String, + /// The overlay multicast group address (IPv4 or IPv6) + group: IpAddr, + }, + + /// Unsubscribe a port from a multicast group + /// + /// Removes a port's subscription to a multicast group, preventing it from + /// receiving traffic for that group. This is the inverse of mcast-subscribe. + /// + /// If the M2P mapping for the group has already been removed, this operation + /// succeeds as a no-op. + McastUnsubscribe { + /// The OPTE port name (e.g., opte0) + #[arg(short)] + port: String, + /// The overlay multicast group address (IPv4 or IPv6) + group: IpAddr, + }, + + /// Unsubscribe all ports from a multicast group + /// + /// Removes all port subscriptions for a given multicast group on this sled + /// in a single operation. This comes in handy for decommissioning a + /// multicast group entirely on this sled. + /// + /// If the M2P mapping for the group has already been removed, this + /// operation succeeds as a no-op. + McastUnsubscribeAll { + /// The overlay multicast group address (IPv4 or IPv6) + group: IpAddr, + }, + /// Add a new router entry, either IPv4 or IPv6. AddRouterEntry { #[command(flatten)] @@ -814,7 +863,7 @@ fn main() -> anyhow::Result<()> { } Command::SetMcastFwd { underlay, next_hop, replication } => { - // OPTE routes to the next-hop's unicast address to determine which + // OPTE routes to the next hop's unicast address to determine which // underlay port to use via the illumos routing table and DDM. // // The packet is then sent to the multicast address with a multicast @@ -826,12 +875,12 @@ fn main() -> anyhow::Result<()> { // - Underlay: underlay ports (sleds) // - Both: both (bifurcated) // - // The Replication type is TX-only, RX ignores it and delivers + // The Replication type is Tx-only, Rx ignores it and delivers // locally based on subscriptions. 
// // Like `swadm route add`, this command can be run multiple times - // with the same underlay address to add multiple next-hops. If the - // same next-hop is specified again, its replication mode is updated. + // with the same underlay address to add multiple next hops. If the + // same next hop is specified again, its replication mode is updated. // Always use fleet-wide DEFAULT_MULTICAST_VNI let next_hop_vni = Vni::new(DEFAULT_MULTICAST_VNI).unwrap(); @@ -856,6 +905,21 @@ fn main() -> anyhow::Result<()> { print_mcast_subs(&hdl.dump_mcast_subs()?)?; } + Command::McastSubscribe { port, group } => { + let req = McastSubscribeReq { port_name: port, group }; + hdl.mcast_subscribe(&req)?; + } + + Command::McastUnsubscribe { port, group } => { + let req = McastUnsubscribeReq { port_name: port, group }; + hdl.mcast_unsubscribe(&req)?; + } + + Command::McastUnsubscribeAll { group } => { + let req = McastUnsubscribeAllReq { group }; + hdl.mcast_unsubscribe_all(&req)?; + } + Command::AddRouterEntry { route: RouterRule { port, dest, target, class }, } => { diff --git a/crates/opte-api/src/cmd.rs b/crates/opte-api/src/cmd.rs index 5f8969f7..d69a0a8a 100644 --- a/crates/opte-api/src/cmd.rs +++ b/crates/opte-api/src/cmd.rs @@ -58,6 +58,7 @@ pub enum OpteCmd { SetMcast2Phys = 105, // set M2P mapping (group -> underlay mcast) ClearMcast2Phys = 106, // clear M2P mapping DumpMcastSubscriptions = 107, // dump multicast subscription table + McastUnsubscribeAll = 108, // unsubscribe all ports from a multicast group } impl TryFrom for OpteCmd { @@ -98,6 +99,7 @@ impl TryFrom for OpteCmd { 105 => Ok(Self::SetMcast2Phys), 106 => Ok(Self::ClearMcast2Phys), 107 => Ok(Self::DumpMcastSubscriptions), + 108 => Ok(Self::McastUnsubscribeAll), _ => Err(()), } } diff --git a/crates/opte-api/src/ip.rs b/crates/opte-api/src/ip.rs index 41e93551..28480f23 100644 --- a/crates/opte-api/src/ip.rs +++ b/crates/opte-api/src/ip.rs @@ -141,7 +141,7 @@ impl Display for DhcpReplyType { } } -/// Map a 
subnet to its next-hop. +/// Map a subnet to its next hop. #[derive(Clone, Copy, Debug)] pub struct SubnetRouterPair { pub subnet: Ipv4Cidr, @@ -660,8 +660,11 @@ impl Ipv6Addr { /// - First byte: 0xFF (all multicast addresses) /// - Second byte: 0x04 (flags=0, scope=4 admin-local) /// - /// See [RFC 7346] for details on IPv6 multicast address scopes and - /// how Omicron uses this specific address scope. + /// See [RFC 7346] for details on IPv6 multicast address scopes. + /// + /// Omicron allocates multicast addresses from a /64 subnet within ff04::/16 + /// for underlay multicast traffic. Specific underlay IPv6 addresses are sent + /// from Omicron, with uniqueness guaranteed within the allocated /64 subnet. /// /// [RFC 7346]: https://www.rfc-editor.org/rfc/rfc7346.html pub const fn is_admin_scoped_multicast(&self) -> bool { @@ -834,6 +837,92 @@ impl Deref for Ipv6Addr { } } +/// Newtype for underlay IPv6 multicast addresses. +/// +/// This newtype wraps admin-scoped (ff04::/16) IPv6 multicast addresses +/// used for underlay multicast delivery. +#[derive( + Copy, + Clone, + Debug, + Eq, + PartialEq, + Ord, + PartialOrd, + Hash, + Serialize, + Deserialize, +)] +#[serde(try_from = "Ipv6Addr", into = "Ipv6Addr")] +pub struct MulticastUnderlay(Ipv6Addr); + +impl MulticastUnderlay { + /// Create a new `MulticastUnderlay` from an IPv6 address. + /// + /// Returns an error if the address is not an admin-scoped multicast address + /// (ff04::/16 prefix). + pub fn new(addr: Ipv6Addr) -> Result { + if !addr.is_admin_scoped_multicast() { + return Err(format!( + "address must be admin-scoped IPv6 multicast (ff04::/16), got: {addr}" + )); + } + Ok(Self(addr)) + } + + /// Create a new `MulticastUnderlay` without validation. + /// + /// Safety: The caller must ensure that `addr` is an admin-scoped IPv6 + /// multicast address (ff04::/16). Using this with an invalid address + /// violates the type's invariant and may lead to undefined behavior. 
+ /// + /// This is intended for cases where validation has already been performed + /// (e.g., after an explicit `is_admin_scoped_multicast()` check) to avoid + /// redundant validation overhead. + #[inline] + pub const fn new_unchecked(addr: Ipv6Addr) -> Self { + Self(addr) + } + + /// Get the inner IPv6 address. + pub fn addr(&self) -> Ipv6Addr { + self.0 + } +} + +impl FromStr for MulticastUnderlay { + type Err = String; + + /// Parse an IPv6 address string and validate it's admin-scoped multicast. + /// + /// Returns an error if the address is not a valid IPv6 address or if it's + /// not an admin-scoped multicast address (ff04::/16). + fn from_str(val: &str) -> result::Result { + let addr = val.parse::()?; + Self::new(addr) + } +} + +impl Display for MulticastUnderlay { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.0) + } +} + +impl TryFrom for MulticastUnderlay { + type Error = String; + + fn try_from(addr: Ipv6Addr) -> result::Result { + Self::new(addr) + } +} + +impl From for Ipv6Addr { + fn from(underlay: MulticastUnderlay) -> Self { + underlay.0 + } +} + /// An IPv4 or IPv6 CIDR. 
#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)] pub enum IpCidr { @@ -1570,4 +1659,34 @@ mod test { domain_no_host.push_fqdn(&mut space); assert!(space.is_empty()); } + + #[test] + fn test_multicast_underlay_serde() { + // Test valid admin-scoped address (ff04::/16) + let valid_addr = to_ipv6("ff04::1"); + let underlay = MulticastUnderlay::new(valid_addr).unwrap(); + + // Serialize with postcard (the serialization format used in opte-api) + let serialized = postcard::to_allocvec(&underlay).unwrap(); + + // Deserialize - should succeed + let deserialized: MulticastUnderlay = + postcard::from_bytes(&serialized).unwrap(); + assert_eq!(deserialized.addr(), valid_addr); + + // Test invalid address (not admin-scoped) - should fail deserialization + let invalid_addr = to_ipv6("ff05::1"); // site-local, not admin-scoped + let serialized_invalid = postcard::to_allocvec(&invalid_addr).unwrap(); + let result: Result = + postcard::from_bytes(&serialized_invalid); + assert!(result.is_err()); + + // Test non-multicast address - should fail deserialization + let non_mcast_addr = to_ipv6("fd00::1"); + let serialized_non_mcast = + postcard::to_allocvec(&non_mcast_addr).unwrap(); + let result: Result = + postcard::from_bytes(&serialized_non_mcast); + assert!(result.is_err()); + } } diff --git a/dtrace/README.adoc b/dtrace/README.adoc index 237e67c1..276672bf 100644 --- a/dtrace/README.adoc +++ b/dtrace/README.adoc @@ -67,7 +67,7 @@ a|`opte-tcp-flow-state.d` transition as well as the flow ID. a|`opte-mcast-delivery.d` -|Track multicast TX/RX, local same-sled delivery, underlay forwarding, and +|Track multicast Tx/Rx, local same-sled delivery, underlay forwarding, and external forwarding. Also tracks multicast control-plane operations (map set/clear, fwd set/clear, subscribe/unsubscribe, and dumps) to help correlate config changes with dataplane events. 
Optional toggles are in the script's diff --git a/dtrace/opte-mcast-delivery.d b/dtrace/opte-mcast-delivery.d index 8c6b8d83..7ed9d3c6 100644 --- a/dtrace/opte-mcast-delivery.d +++ b/dtrace/opte-mcast-delivery.d @@ -6,7 +6,7 @@ * * Configuration (set in BEGIN block): * suppress_output = 1 - Suppress per-event output, show only aggregations - * flow_debug = 1 - Enable multicast TX/RX function entry/exit tracing + * flow_debug = 1 - Enable multicast Tx/Rx function entry/exit tracing * show_summary = 1 - Show aggregated summary at END (default: enabled) */ #include "common.h" @@ -29,14 +29,15 @@ /* * OPTE command numbers for multicast-related ioctls (see crates/opte-api/src/cmd.rs). */ -#define CMD_SET_MCAST_FWD 100 -#define CMD_CLEAR_MCAST_FWD 101 -#define CMD_DUMP_MCAST_FWD 102 -#define CMD_MCAST_SUBSCRIBE 103 -#define CMD_MCAST_UNSUBSCRIBE 104 -#define CMD_SET_M2P 105 -#define CMD_CLEAR_M2P 106 -#define CMD_DUMP_MCAST_SUBS 107 +#define CMD_SET_MCAST_FWD 100 +#define CMD_CLEAR_MCAST_FWD 101 +#define CMD_DUMP_MCAST_FWD 102 +#define CMD_MCAST_SUBSCRIBE 103 +#define CMD_MCAST_UNSUBSCRIBE 104 +#define CMD_SET_M2P 105 +#define CMD_CLEAR_M2P 106 +#define CMD_DUMP_MCAST_SUBS 107 +#define CMD_MCAST_UNSUBSCRIBE_ALL 108 BEGIN { flow_debug = 0; /* Set to 1 to enable detailed flow debugging */ @@ -59,7 +60,7 @@ BEGIN printf(M_HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP"); } -/* Multicast TX function entry/exit (optional detailed debugging) */ +/* Multicast Tx function entry/exit (optional detailed debugging) */ xde_mc_tx:entry /flow_debug/ { @@ -220,6 +221,7 @@ xde_ioc_opte_cmd:entry this->cmd == CMD_DUMP_MCAST_SUBS ? "CFG DUMP_SUBS" : this->cmd == CMD_MCAST_SUBSCRIBE ? "CFG SUBSCRIBE" : this->cmd == CMD_MCAST_UNSUBSCRIBE ? "CFG UNSUBSCRIBE" : + this->cmd == CMD_MCAST_UNSUBSCRIBE_ALL ? 
"CFG UNSUB_ALL" : NULL; /* Always track aggregations for multicast ops */ @@ -342,6 +344,23 @@ mcast-unsubscribe printf(M_LINE_FMT, "UNSUBSCR", this->vni, this->group, this->port); } +mcast-unsubscribe-all { + /* arg0=af, arg1=group_ptr, arg2=vni */ + this->af = arg0; + this->group_ptr = arg1; + this->vni = arg2; + + /* Always track aggregations */ + @cfg_counts["UNSUB_ALL"] = count(); +} + +mcast-unsubscribe-all +/!suppress_output/ +{ + this->group = MCAST_GROUP_STR(this->af, this->group_ptr); + printf(M_LINE_FMT, "UNSUB_ALL", this->vni, this->group, "ALL"); +} + /* Dataplane failure probes */ mcast-tx-pullup-fail { /* arg0=len */ @@ -401,7 +420,7 @@ END printa(@by_underlay); printf("\nLocal delivery by port:\n"); printa(@by_port); - printf("\nForwarding by unicast next-hop (routing address):\n"); + printf("\nForwarding by unicast next hop (routing address):\n"); printa(@by_nexthop_unicast); printf("\nConfig ops:\n"); printa(@cfg_counts); diff --git a/lib/opte-ioctl/src/lib.rs b/lib/opte-ioctl/src/lib.rs index 0adb9935..510fc9a1 100644 --- a/lib/opte-ioctl/src/lib.rs +++ b/lib/opte-ioctl/src/lib.rs @@ -43,6 +43,7 @@ use oxide_vpc::api::DumpVirt2PhysResp; use oxide_vpc::api::IpCidr; use oxide_vpc::api::ListPortsResp; use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastUnsubscribeAllReq; use oxide_vpc::api::McastUnsubscribeReq; use oxide_vpc::api::RemFwRuleReq; use oxide_vpc::api::RemoveCidrReq; @@ -290,6 +291,15 @@ impl OpteHdl { run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) } + /// Unsubscribe all ports from a multicast group. + pub fn mcast_unsubscribe_all( + &self, + req: &McastUnsubscribeAllReq, + ) -> Result { + let cmd = OpteCmd::McastUnsubscribeAll; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + /// Set xde underlay devices. 
pub fn set_xde_underlay( &self, diff --git a/lib/opte-test-utils/src/lib.rs b/lib/opte-test-utils/src/lib.rs index 3fc8cd2d..bb128b44 100644 --- a/lib/opte-test-utils/src/lib.rs +++ b/lib/opte-test-utils/src/lib.rs @@ -405,23 +405,21 @@ pub fn oxide_net_setup2( // // * ARP Gateway MAC addr // * ICMP Echo Reply for Gateway - // * DHCP Offer - // * DHCP Ack - // * Outbound unicast traffic from Guest IP + MAC address - // * Outbound multicast traffic from Guest IP + MAC address + // * DHCP Discover → Offer hairpin + // * DHCP Request → Ack hairpin + // * Outbound no-spoof from Guest IP + MAC (allows unicast and multicast) // // IPv6 // ---- // - // * NDP NA for Gateway - // * NDP RA for Gateway - // * Deny all other NDP - // * ICMPv6 Echo Reply for Gateway from Guest Link-Local // * ICMPv6 Echo Reply for Gateway from Guest VPC ULA + // * ICMPv6 Echo Reply for Gateway from Guest Link-Local + // * NDP RA for Gateway + // * NDP NA for Gateway // * DHCPv6 - // * Outbound unicast traffic from Guest IPv6 + MAC Address - // * Outbound multicast traffic from Guest IPv6 + MAC Address - "set:gateway.rules.out=14", + // * Deny all other NDP + // * Outbound no-spoof from Guest IPv6 + MAC (allows unicast and multicast) + "set:gateway.rules.out=12", // * Allow all outbound traffic "set:firewall.rules.out=0", // * Outbound IPv4 SNAT @@ -444,13 +442,12 @@ pub fn oxide_net_setup2( }); updates.extend_from_slice(&[ - // * IPv4 multicast passthrough - // * IPv6 multicast passthrough + // * Multicast passthrough (handles both IPv4 and IPv6) // * Allow guest to route to own subnet - "set:router.rules.out=3", + "set:router.rules.out=2", // * Outbound encap // * Inbound decap - // * Inbound mcast-vni-validator + // * Inbound VNI validator (multicast) "set:overlay.rules.in=2, overlay.rules.out=1", ]); diff --git a/lib/opte/README.adoc b/lib/opte/README.adoc index 02854ef6..c309a2f0 100644 --- a/lib/opte/README.adoc +++ b/lib/opte/README.adoc @@ -230,9 +230,9 @@ fleet-level VNI.) 
==== Delivery Modes and Replication -The `Replication` type is a TX‑only instruction to switches encoded in the Oxide Geneve +The `Replication` type is a Tx‑only instruction to switches encoded in the Oxide Geneve multicast option as a 2‑bit field in the top two bits of the option body's first byte. -It tells the switch which ports to replicate the frame to on transmission. On RX, OPTE +It tells the switch which ports to replicate the frame to on transmission. On Rx, OPTE ignores the replication field and performs local same‑sled delivery based purely on subscriptions. The replication mode is not an access control mechanism. @@ -253,9 +253,9 @@ address from M2P with multicast MAC (RFC 2464). All multicast uses fleet VNI 77. ==== Encapsulation Path -On TX, the overlay layer encapsulates packets destined for multicast groups +On Tx, the overlay layer encapsulates packets destined for multicast groups with a Geneve multicast option initially set to `External` replication mode. -XDE's multicast TX path (`xde_mc_tx`) first delivers the packet locally to +XDE's multicast Tx path (`xde_mc_tx`) first delivers the packet locally to all other ports on the same sled that have subscribed to the multicast group (within the same VNI), then consults the multicast forwarding table. @@ -264,9 +264,9 @@ Geneve multicast option to match that next hop's configured replication mode. XDE routes to the next hop's unicast address (for all replication modes) to determine reachability and which underlay port/MAC to use. The packet destination (outer IPv6) is the multicast address from M2P with multicast MAC (RFC 2464). The Geneve replication -option serves as a TX-only instruction telling switches which port groups to replicate to. +option serves as a Tx-only instruction telling switches which port groups to replicate to. -==== RX Behavior +==== Rx Behavior OPTE acts as a leaf node and does not relay multicast traffic received from the underlay. 
diff --git a/lib/opte/src/api.rs b/lib/opte/src/api.rs index d5d9431f..199fa8b5 100644 --- a/lib/opte/src/api.rs +++ b/lib/opte/src/api.rs @@ -278,3 +278,9 @@ pub type DumpLayerResp = opte_api::DumpLayerResp; pub type DumpUftResp = opte_api::DumpUftResp; pub type DumpTcpFlowsResp = opte_api::DumpTcpFlowsResp; pub type TcpFlowEntryDump = opte_api::TcpFlowEntryDump; + +// Implement ResourceEntry for MulticastUnderlay when the engine feature is enabled. +// This allows MulticastUnderlay to be used as a MappingResource::Entry in the +// Mcast2Phys table (see oxide-vpc/engine/overlay.rs). +#[cfg(feature = "engine")] +impl crate::engine::rule::ResourceEntry for MulticastUnderlay {} diff --git a/lib/opte/src/ddi/sync.rs b/lib/opte/src/ddi/sync.rs index 817f8857..70397094 100644 --- a/lib/opte/src/ddi/sync.rs +++ b/lib/opte/src/ddi/sync.rs @@ -780,29 +780,3 @@ impl Drop for TokenGuard<'_, T> { assert_eq!(Some(curthread), lock_thread); } } - -/// Clone an Arc from behind a RwLock, dropping the read lock immediately. -/// -/// This pattern is used throughout the datapath to make readers lock-free -/// while keeping snapshots alive via Arc refcounting. The brief lock hold -/// (just the Arc clone) minimizes contention and avoids blocking management -/// operations like `refresh_maps()`. -#[inline(always)] -pub fn clone_from_rwlock( - lock: &KRwLock>, -) -> alloc::sync::Arc { - alloc::sync::Arc::clone(&*lock.read()) -} - -/// Clone an Arc from behind a Mutex, dropping the lock immediately. -/// -/// This pattern is used throughout the datapath to make readers lock-free -/// while keeping snapshots alive via Arc refcounting. The brief lock hold -/// (just the Arc clone) minimizes contention and avoids blocking management -/// operations like `refresh_maps()`. 
-#[inline(always)] -pub fn clone_from_mutex( - lock: &KMutex>, -) -> alloc::sync::Arc { - alloc::sync::Arc::clone(&*lock.lock()) -} diff --git a/lib/opte/src/engine/geneve.rs b/lib/opte/src/engine/geneve.rs index 7f5e958e..382f8927 100644 --- a/lib/opte/src/engine/geneve.rs +++ b/lib/opte/src/engine/geneve.rs @@ -383,6 +383,15 @@ impl<'a, T: OptionCast<'a>> GeneveOptionParse<'a, T> { } } +impl<'a, T: OptionCast<'a>> HeaderLen for GeneveOptionParse<'a, T> { + const MINIMUM_LENGTH: usize = GeneveOpt::MINIMUM_LENGTH; + + fn packet_length(&self) -> usize { + // Option header (4 bytes) + body length (already padded to 4-byte boundary) + GeneveOpt::MINIMUM_LENGTH + self.body_remainder.len() + } +} + /// Marks whather a Geneve option has been successfuly interpreted as a known /// variant. pub enum Known { diff --git a/lib/oxide-vpc/src/api.rs b/lib/oxide-vpc/src/api.rs index ba1563ac..8c67ec25 100644 --- a/lib/oxide-vpc/src/api.rs +++ b/lib/oxide-vpc/src/api.rs @@ -20,10 +20,10 @@ use serde::Deserialize; use serde::Serialize; use uuid::Uuid; -/// TX-only instruction to switches for multicast packet replication. +/// Tx-only instruction to switches for multicast packet replication. /// /// Tells the switch which port groups to replicate outbound multicast packets -/// to. It is a transmit-only setting - on RX, OPTE ignores the replication +/// to. It is a transmit-only setting - on Rx, OPTE ignores the replication /// field and performs local same-sled delivery based purely on subscriptions. /// The replication mode is not an access control mechanism. /// @@ -38,8 +38,9 @@ use uuid::Uuid; /// - `Underlay`: Switch replicates to underlay ports (other sleds) only /// - `Both`: Switch replicates to both external and underlay ports (bifurcated) /// -/// Encoding: The Geneve Oxide multicast option encodes the replication strategy in the -/// top 2 bits of the option body's first byte (u2). The remaining 30 bits are reserved. 
+/// Encoding: The Geneve Oxide multicast option encodes the replication strategy +/// in the top 2 bits of the option body's first byte (u2). The remaining 30 +/// bits are reserved. /// /// Current implementation uses a single fleet VNI (DEFAULT_MULTICAST_VNI = 77) /// for all multicast traffic rack-wide (RFD 488 "Multicast across VPCs"). @@ -384,7 +385,7 @@ pub struct PhysNet { /// OPTE routes to [`NextHopV6::addr`] (the switch's unicast address) for all /// replication modes to determine reachability and which underlay port/MAC to /// use. The packet destination (outer IPv6) is always the multicast address -/// from M2P. The associated [`Replication`] mode is a TX-only instruction +/// from M2P. The associated [`Replication`] mode is a Tx-only instruction /// telling the switch which port groups to replicate to on transmission. /// Routing is always to the unicast next hop. #[derive( @@ -536,7 +537,7 @@ impl Display for RouterTarget { pub enum RouterClass { /// The rule belongs to the shared VPC-wide router. System, - /// The rule belongs to the subnet-specific router, and has precendence + /// The rule belongs to the subnet-specific router, and has precedence /// over a `System` rule of equal priority. Custom, } @@ -686,8 +687,8 @@ pub struct ClearVirt2PhysReq { pub struct SetMcast2PhysReq { /// Overlay multicast group address pub group: IpAddr, - /// Underlay IPv6 multicast address - pub underlay: Ipv6Addr, + /// Underlay IPv6 multicast address (must be admin-scoped ff04::/16) + pub underlay: MulticastUnderlay, } /// Clear a mapping from multicast group to underlay multicast address. @@ -697,8 +698,8 @@ pub struct SetMcast2PhysReq { pub struct ClearMcast2PhysReq { /// Overlay multicast group address pub group: IpAddr, - /// Underlay IPv6 multicast address - pub underlay: Ipv6Addr, + /// Underlay IPv6 multicast address (must be admin-scoped ff04::/16) + pub underlay: MulticastUnderlay, } /// Set a mapping from a VPC IP to boundary tunnel endpoint destination. 
@@ -745,7 +746,7 @@ pub enum DelRouterEntryResp { /// /// Configures how OPTE forwards multicast packets for a specific underlay group. /// The forwarding table maps underlay multicast addresses to switch endpoints -/// and TX-only replication instructions. +/// and Tx-only replication instructions. /// /// Routing vs destination: OPTE routes to [`NextHopV6::addr`] (switch's unicast /// address) to determine reachability and which underlay port/MAC to use. The @@ -758,10 +759,11 @@ pub enum DelRouterEntryResp { #[derive(Clone, Debug, Deserialize, Serialize)] pub struct SetMcastForwardingReq { /// The underlay IPv6 multicast address (outer IPv6 dst in transmitted packets) - pub underlay: Ipv6Addr, - /// Switch endpoints and TX-only replication instructions. + /// Must be admin-scoped ff04::/16 + pub underlay: MulticastUnderlay, + /// Switch endpoints and Tx-only replication instructions. /// Each NextHopV6.addr is the unicast IPv6 of a switch (for routing). - /// The Replication is a TX-only instruction indicating which port groups + /// The Replication is a Tx-only instruction indicating which port groups /// the switch should use. pub next_hops: Vec<(NextHopV6, Replication)>, } @@ -769,8 +771,8 @@ pub struct SetMcastForwardingReq { /// Clear multicast forwarding entries for an underlay multicast group. #[derive(Clone, Debug, Deserialize, Serialize)] pub struct ClearMcastForwardingReq { - /// The underlay IPv6 multicast address - pub underlay: Ipv6Addr, + /// The underlay IPv6 multicast address (must be admin-scoped ff04::/16) + pub underlay: MulticastUnderlay, } /// Response for dumping the multicast forwarding table. @@ -785,9 +787,9 @@ impl CmdOk for DumpMcastForwardingResp {} /// A single multicast forwarding table entry. 
#[derive(Clone, Debug, Deserialize, Serialize)] pub struct McastForwardingEntry { - /// The underlay IPv6 multicast address - pub underlay: Ipv6Addr, - /// The next hops (underlay IPv6 addresses) with TX-only replication instructions + /// The underlay IPv6 multicast address (admin-scoped ff04::/16) + pub underlay: MulticastUnderlay, + /// The next hops (underlay IPv6 addresses) with Tx-only replication instructions pub next_hops: Vec<(NextHopV6, Replication)>, } @@ -804,8 +806,8 @@ impl CmdOk for DumpMcastSubscriptionsResp {} /// A single multicast subscription entry. #[derive(Clone, Debug, Deserialize, Serialize)] pub struct McastSubscriptionEntry { - /// The underlay IPv6 multicast address (subscription key) - pub underlay: Ipv6Addr, + /// The underlay IPv6 multicast address (admin-scoped ff04::/16, subscription key) + pub underlay: MulticastUnderlay, /// Port names subscribed to this group on this sled pub ports: Vec, } @@ -828,6 +830,13 @@ pub struct McastUnsubscribeReq { pub group: IpAddr, } +/// Unsubscribe all ports from a multicast group. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct McastUnsubscribeAllReq { + /// The multicast group address + pub group: IpAddr, +} + #[derive(Clone, Debug, Deserialize, Serialize)] pub struct SetExternalIpsReq { pub port_name: String, diff --git a/lib/oxide-vpc/src/engine/gateway/mod.rs b/lib/oxide-vpc/src/engine/gateway/mod.rs index b6f8504a..b3ad7d4a 100644 --- a/lib/oxide-vpc/src/engine/gateway/mod.rs +++ b/lib/oxide-vpc/src/engine/gateway/mod.rs @@ -39,6 +39,21 @@ //! # Link-Local IPv6 //! //! No IPv6 link-local traffic should ever make it past this layer. +//! +//! # Multicast Traffic +//! +//! The gateway layer allows both unicast and multicast traffic through +//! the no-spoof rules (outbound) and separate inbound rules: +//! +//! - Outbound: The no-spoof rule matches on source IP/MAC but has no +//! destination IP predicate, so it permits multicast destinations. This +//! 
allows guests to send to any multicast group address at the gateway +//! layer. However, the overlay layer enforces M2P (Multicast-to-Physical) +//! mappings, denying packets for unconfigured multicast groups. +//! +//! - Inbound: Separate rules (IPv4 224.0.0.0/4 and IPv6 ff00::/8) +//! allow multicast packets to reach guests and rewrite the source MAC +//! to the gateway MAC, similar to unicast traffic. use crate::api::DhcpCfg; use crate::api::MacAddr; @@ -175,7 +190,17 @@ fn setup_ipv4( let vpc_meta = Arc::new(VpcMeta::new(vpc_mappings)); - let mut nospoof_out = Rule::new(1000, Action::Meta(vpc_meta.clone())); + // Outbound no-spoof rule: only allow traffic from the guest's IP and MAC. + // This rule has no destination IP predicate, so it matches both unicast + // and multicast destinations, enforcing no-spoof for all outbound traffic. + // + // NOTE: Because this gateway rule is unconditional on destination IP, guests + // can send to any multicast group address. The overlay layer enforces M2P + // mappings and underlay address validation, so guests cannot send multicast + // unless the group is configured. In the future, we may want to explicitly + // filter outbound multicast to only the groups configured via M2P to further + // tighten spoof prevention at the gateway layer. + let mut nospoof_out = Rule::new(1000, Action::Meta(vpc_meta)); nospoof_out.add_predicate(Predicate::InnerSrcIp4(vec![ Ipv4AddrMatch::Exact(ip_cfg.private_ip), ])); @@ -198,25 +223,8 @@ fn setup_ipv4( ])); layer.add_rule(Direction::In, unicast_in.finalize()); - // IPv4 multicast prefixes (224.0.0.0/4) - let ipv4_mcast = vec![Ipv4AddrMatch::Prefix(Ipv4Cidr::MCAST)]; - - // Outbound IPv4 multicast - allow from guest's MAC to multicast destinations. - // - // NOTE: This unconditionally allows any dst IP in 224.0.0.0/4 (all IPv4 multicast). - // The overlay layer enforces M2P mappings and underlay address validation. 
- // - // Because these gateway rules are unconditional (no destination filtering), - // custom firewall routes can target ANY IP range to a multicast group, - // enabling intra-VPC use cases. - let mut mcast_out_v4 = Rule::new(1001, Action::Meta(vpc_meta.clone())); - mcast_out_v4.add_predicate(Predicate::InnerDstIp4(ipv4_mcast.clone())); - mcast_out_v4.add_predicate(Predicate::InnerEtherSrc(vec![ - EtherAddrMatch::Exact(cfg.guest_mac), - ])); - layer.add_rule(Direction::Out, mcast_out_v4.finalize()); - // Inbound IPv4 multicast - rewrite source MAC to gateway and allow + let ipv4_mcast = vec![Ipv4AddrMatch::Prefix(Ipv4Cidr::MCAST)]; // This mirrors the IPv6 multicast inbound rule to ensure multicast // delivery to guests is permitted by the gateway layer. let mut mcast_in_v4 = Rule::new( @@ -244,7 +252,18 @@ fn setup_ipv6( icmpv6::setup(layer, cfg, ip_cfg)?; dhcpv6::setup(layer, cfg, dhcp_cfg)?; let vpc_meta = Arc::new(VpcMeta::new(vpc_mappings)); - let mut nospoof_out = Rule::new(1000, Action::Meta(vpc_meta.clone())); + + // Outbound no-spoof rule: only allow traffic from the guest's IP and MAC. + // This rule has no destination IP predicate, so it matches both unicast + // and multicast destinations, enforcing no-spoof for all outbound traffic. + // + // NOTE: Because this gateway rule is unconditional on destination IP, guests + // can send to any multicast group address. The overlay layer enforces M2P + // mappings and underlay address validation, so guests cannot send multicast + // unless the group is configured. In the future, we may want to explicitly + // filter outbound multicast to only the groups configured via M2P to further + // tighten spoof prevention at the gateway layer. 
+ let mut nospoof_out = Rule::new(1000, Action::Meta(vpc_meta)); nospoof_out.add_predicate(Predicate::InnerSrcIp6(vec![ Ipv6AddrMatch::Exact(ip_cfg.private_ip), ])); @@ -267,28 +286,8 @@ fn setup_ipv6( ])); layer.add_rule(Direction::In, unicast_in.finalize()); - // IPv6 multicast prefix (ff00::/8) - // Allow any overlay multicast address - the underlay (ff04::/16) restriction - // is enforced by M2P mappings and multicast forwarding validation, not here. + // Inbound IPv6 multicast - rewrite source MAC to gateway and allow let ipv6_mcast = vec![Ipv6AddrMatch::Prefix(Ipv6Cidr::MCAST)]; - - // Outbound multicast - allow from guest's MAC to multicast destinations. - // - // NOTE: This unconditionally allows any dst IP in ff00::/8 (all IPv6 multicast). - // The overlay layer enforces M2P mappings, and only ff04::/16 underlay addresses - // are permitted by the M2P validation. - // - // Because these gateway rules are unconditional (no destination filtering), - // custom firewall routes can target ANY IP range to a multicast group, - // enabling intra-VPC use cases. - let mut mcast_out = Rule::new(1001, Action::Meta(vpc_meta.clone())); - mcast_out.add_predicate(Predicate::InnerDstIp6(ipv6_mcast.clone())); - mcast_out.add_predicate(Predicate::InnerEtherSrc(vec![ - EtherAddrMatch::Exact(cfg.guest_mac), - ])); - layer.add_rule(Direction::Out, mcast_out.finalize()); - - // Inbound multicast - rewrite source MAC to gateway let mut mcast_in = Rule::new( 1001, Action::Static(Arc::new(RewriteSrcMac { diff --git a/lib/oxide-vpc/src/engine/geneve.rs b/lib/oxide-vpc/src/engine/geneve.rs index 08f90812..1136095a 100644 --- a/lib/oxide-vpc/src/engine/geneve.rs +++ b/lib/oxide-vpc/src/engine/geneve.rs @@ -40,19 +40,21 @@ //! 11 = Reserved //! ``` //! -//! ### Replication Semantics (TX-only instruction) +//! ### Replication Semantics (Tx-only instruction) //! -//! The [`Replication`] type is a TX-only instruction telling the switch which port groups -//! 
to replicate outbound multicast packets to. On RX, OPTE ignores the replication field -//! and performs local same-sled delivery based purely on subscriptions. +//! The [`Replication`] type is a Tx-only instruction telling the switch which +//! port groups to replicate outbound multicast packets to. On Rx, OPTE ignores +//! the replication field and performs local same-sled delivery based purely on +//! subscriptions. //! -//! OPTE routes to next hop unicast address (for ALL modes) to determine reachability -//! and underlay port/MAC. Packet destination is multicast ff04::/16 with multicast MAC. +//! OPTE routes to next hop unicast address (for ALL modes) to determine +//! reachability and underlay port/MAC. Packet destination is multicast +//! ff04::/16 with multicast MAC. //! //! - **External**: Switch decaps and replicates to external-facing ports (front panel) //! - **Underlay**: Switch replicates to underlay ports (other sleds) //! - **Both**: Switch replicates to both external and underlay port groups (bifurcated) -//! - **Local same-sled delivery**: Always happens regardless of the TX-only replication setting. +//! - **Local same-sled delivery**: Always happens regardless of the replication setting. //! Not an access control mechanism - local delivery is independent of replication mode. //! //! 
All multicast packets are encapsulated with fleet VNI 77 (`DEFAULT_MULTICAST_VNI`) diff --git a/lib/oxide-vpc/src/engine/overlay.rs b/lib/oxide-vpc/src/engine/overlay.rs index ea81d9e9..a2c8175d 100644 --- a/lib/oxide-vpc/src/engine/overlay.rs +++ b/lib/oxide-vpc/src/engine/overlay.rs @@ -32,6 +32,7 @@ use opte::api::Direction; use opte::api::Ipv4Addr; use opte::api::Ipv4Cidr; use opte::api::MacAddr; +use opte::api::MulticastUnderlay; use opte::api::OpteError; use opte::ddi::sync::KMutex; use opte::ddi::sync::KMutexGuard; @@ -249,8 +250,9 @@ impl StaticAction for EncapAction { Some(underlay) => ( true, PhysNet { - ether: underlay.0.unchecked_multicast_mac(), - ip: underlay.0, + // Outer MAC filled in by XDE + ether: MacAddr::ZERO, + ip: underlay.addr(), vni: Vni::new(DEFAULT_MULTICAST_VNI).unwrap(), }, true, @@ -386,7 +388,7 @@ impl StaticAction for EncapAction { // For multicast originated from this host, we seed the multicast Geneve // option with `External` replication. XDE will then select the actual - // replication per next-hop based on the rack-wide forwarding table + // replication per next hop based on the rack-wide forwarding table // (mcast_fwd), which tells the switch which ports to replicate to // (external, underlay, or bifurcated). // @@ -410,8 +412,13 @@ impl StaticAction for EncapAction { data: Cow::Borrowed(GENEVE_MCAST_OPT_BODY), }; - let outer_mac = - if is_mcast { phys_target.ether } else { MacAddr::ZERO }; + // For multicast, derive the outer MAC from the IPv6 address per RFC 2464. + // For unicast, XDE fills in the MAC via routing table lookup. + let outer_mac = if is_mcast { + phys_target.ip.unchecked_multicast_mac() + } else { + MacAddr::ZERO + }; let tfrm = HdrTransform { name: ENCAP_NAME.to_string(), @@ -589,7 +596,7 @@ impl StaticAction for DecapAction { /// All outbound multicast packets are currently encapsulated with VNI 77 /// (DEFAULT_MULTICAST_VNI) for fleet-wide delivery. See [`EncapAction::gen_ht`]. 
/// -/// ## Validation Policy on RX Path +/// ## Validation Policy on Rx Path /// This validator accepts multicast packets with either of two VNI values: /// - **VNI 77 (DEFAULT_MULTICAST_VNI)**: Fleet-wide multicast, accepted by all /// ports regardless of VPC. This enables rack-wide multicast delivery. @@ -1010,12 +1017,20 @@ impl Mcast2Phys { /// Dump all IPv4 overlay multicast group to underlay IPv6 multicast mappings. pub fn dump_ip4(&self) -> Vec<(Ipv4Addr, Ipv6Addr)> { - self.ip4.lock().iter().map(|(vip, mcast)| (*vip, mcast.0)).collect() + self.ip4 + .lock() + .iter() + .map(|(vip, mcast)| (*vip, mcast.addr())) + .collect() } /// Dump all IPv6 overlay multicast group to underlay IPv6 multicast mappings. pub fn dump_ip6(&self) -> Vec<(Ipv6Addr, Ipv6Addr)> { - self.ip6.lock().iter().map(|(vip, mcast)| (*vip, mcast.0)).collect() + self.ip6 + .lock() + .iter() + .map(|(vip, mcast)| (*vip, mcast.addr())) + .collect() } } @@ -1025,15 +1040,7 @@ impl Default for Mcast2Phys { } } -/// Transparent wrapper for underlay IPv6 multicast addresses. -/// -/// This newtype exists only to satisfy the orphan rule for implementing -/// `ResourceEntry`. Validation is performed at the API boundary (xde.rs). -#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)] -pub struct MulticastUnderlay(pub Ipv6Addr); - impl Resource for Mcast2Phys {} -impl ResourceEntry for MulticastUnderlay {} impl MappingResource for Mcast2Phys { type Key = IpAddr; diff --git a/lib/oxide-vpc/src/engine/router.rs b/lib/oxide-vpc/src/engine/router.rs index a42b8120..6f03f892 100644 --- a/lib/oxide-vpc/src/engine/router.rs +++ b/lib/oxide-vpc/src/engine/router.rs @@ -269,25 +269,17 @@ pub fn setup( let mut layer = Layer::new(ROUTER_LAYER_NAME, pb.name(), actions, ft_limit); - // Allow IPv6 multicast (ff00::/8) to bypass route lookup. + // Allow multicast traffic (IPv4 224.0.0.0/4 and IPv6 ff00::/8) to bypass route lookup. 
// Multicast operates fleet-wide via M2P mappings, not through VPC routing. // The overlay addresses use any valid multicast prefix; underlay restriction // to ff04::/16 is enforced by M2P mapping validation. - let mut mcast_out = - Rule::new(0, Action::Meta(Arc::new(MulticastPassthrough))); - mcast_out.add_predicate(Predicate::InnerDstIp6(vec![ - Ipv6AddrMatch::Prefix(Ipv6Cidr::MCAST), + let mut mcast_out = Rule::new(0, Action::Allow); + mcast_out.add_predicate(Predicate::Any(vec![ + Predicate::InnerDstIp4(vec![Ipv4AddrMatch::Prefix(Ipv4Cidr::MCAST)]), + Predicate::InnerDstIp6(vec![Ipv6AddrMatch::Prefix(Ipv6Cidr::MCAST)]), ])); layer.add_rule(Direction::Out, mcast_out.finalize()); - // Allow IPv4 multicast (224.0.0.0/4) to bypass route lookup. - let mut mcast_out_v4 = - Rule::new(0, Action::Meta(Arc::new(MulticastPassthrough))); - mcast_out_v4.add_predicate(Predicate::InnerDstIp4(vec![ - Ipv4AddrMatch::Prefix(Ipv4Cidr::MCAST), - ])); - layer.add_rule(Direction::Out, mcast_out_v4.finalize()); - pb.add_layer(layer, Pos::After(fw::FW_LAYER_NAME)) } @@ -457,29 +449,6 @@ pub fn replace( Ok(NoResp::default()) } -/// Passthrough action for multicast traffic that bypasses route lookup. -struct MulticastPassthrough; - -impl fmt::Display for MulticastPassthrough { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "multicast-passthrough") - } -} - -impl MetaAction for MulticastPassthrough { - fn implicit_preds(&self) -> (Vec, Vec) { - (vec![], vec![]) - } - - fn mod_meta( - &self, - _flow_id: &InnerFlowId, - _meta: &mut ActionMeta, - ) -> ModMetaResult { - Ok(AllowOrDeny::Allow(())) - } -} - // TODO I may want to have different types of rule/flow tables a layer // can have. Up to this point the tables consist of `Rule` entires; // matching arbitrary header predicates to a `RuleAction`. 
I may want diff --git a/lib/oxide-vpc/tests/integration_tests.rs b/lib/oxide-vpc/tests/integration_tests.rs index 8b8a45e5..4c1a8e66 100644 --- a/lib/oxide-vpc/tests/integration_tests.rs +++ b/lib/oxide-vpc/tests/integration_tests.rs @@ -507,7 +507,7 @@ fn guest_to_guest_no_route() { RouterClass::System, ) .unwrap(); - update!(g1, ["incr:epoch", "set:router.rules.out=2"]); + update!(g1, ["incr:epoch", "set:router.rules.out=1"]); let mut pkt1_m = http_syn(&g1_cfg, &g2_cfg); let pkt1 = parse_outbound(&mut pkt1_m, VpcParser {}).unwrap(); let res = g1.port.process(Out, pkt1); @@ -4840,7 +4840,8 @@ fn test_ipv6_multicast_encapsulation() { // Add multicast forwarding entry BEFORE starting the port g1.m2p.set( mcast_dst.into(), - oxide_vpc::engine::overlay::MulticastUnderlay(mcast_underlay), + opte::api::MulticastUnderlay::new(mcast_underlay) + .expect("ff04::/16 is admin-scoped multicast"), ); g1.port.start(); @@ -4945,7 +4946,8 @@ fn test_tcp_multicast_denied() { g1.m2p.set( mcast_dst.into(), - oxide_vpc::engine::overlay::MulticastUnderlay(mcast_underlay), + opte::api::MulticastUnderlay::new(mcast_underlay) + .expect("ff04::/16 is admin-scoped multicast"), ); g1.port.start(); diff --git a/rustfmt.toml b/rustfmt.toml index f1d3d2fc..d5d9e9ef 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -4,3 +4,4 @@ max_width = 80 use_small_heuristics = "max" imports_granularity = "Item" style_edition = "2024" +edition = "2024" diff --git a/xde-tests/src/lib.rs b/xde-tests/src/lib.rs index 0b4a25bd..3c9307a3 100644 --- a/xde-tests/src/lib.rs +++ b/xde-tests/src/lib.rs @@ -30,6 +30,7 @@ use oxide_vpc::api::Ipv6Cfg; use oxide_vpc::api::MacAddr; use oxide_vpc::api::McastSubscribeReq; use oxide_vpc::api::McastUnsubscribeReq; +use oxide_vpc::api::MulticastUnderlay; use oxide_vpc::api::PhysNet; use oxide_vpc::api::Ports; use oxide_vpc::api::RouterClass; @@ -476,33 +477,10 @@ impl Drop for Xde { fn drop(&mut self) { // Clear underlay to release references to simnet/vnic devices, // allowing 
their cleanup to proceed. Driver remains loaded. - // - // Retry with backoff if EBUSY (in-flight TX may briefly hold refs). - // After cache clearing + siphon quiesce, refs should drain quickly. if let Ok(adm) = OpteHdl::open() { - for attempt in 1..=10 { - match adm.clear_xde_underlay() { - Ok(_) => { - if attempt > 1 { - eprintln!( - "clear_xde_underlay succeeded on attempt {attempt}" - ); - } - return; - } - Err(e) if e.to_string().contains("EBUSY") => { - eprintln!( - "clear_xde_underlay returned EBUSY on attempt {attempt}/10; retrying after 10ms" - ); - std::thread::sleep(Duration::from_millis(10)); - } - Err(e) => { - eprintln!("failed to clear xde underlay: {e}"); - return; - } - } + if let Err(e) = adm.clear_xde_underlay() { + eprintln!("failed to clear xde underlay: {e}"); } - eprintln!("failed to clear xde underlay after 10 retries (EBUSY)"); } } } @@ -604,11 +582,11 @@ pub fn ensure_underlay_admin_scoped_route_v6(interface: &str) -> Result<()> { /// All multicast groups use DEFAULT_MULTICAST_VNI (77) for fleet-wide multicast. pub struct MulticastGroup { pub group: IpAddr, - pub underlay: Ipv6Addr, + pub underlay: MulticastUnderlay, } impl MulticastGroup { - pub fn new(group: IpAddr, underlay: Ipv6Addr) -> Result { + pub fn new(group: IpAddr, underlay: MulticastUnderlay) -> Result { let hdl = OpteHdl::open()?; hdl.set_m2p(&SetMcast2PhysReq { group, underlay })?; Ok(Self { group, underlay }) @@ -762,11 +740,11 @@ pub fn two_node_topology_named( opte0.fw_allow_all()?; // Add a host route to the underlay address of opte0, through the link local - // address of sim0 as a nexthop through sim1. This is facilitating the flow + // address of sim0 as a next hop through sim1. This is facilitating the flow // of traffic from opte1 to opte0. When a packet enters opte1 (from vopte1) // destined for 10.0.0.1, opte will look up the v2p mapping which points to // fd44::1. That is the underlay address of opte0. 
The route below says: - // that address is reachable through the sim1 interface, with a nexthop of + // that address is reachable through the sim1 interface, with a next hop of // the sim0 interface. In the diagram above, that is the "upward" direction // of our simnet underlay loopback. The xde device uses the kernel's routing // tables to determine which underlay device to use. With this route in diff --git a/xde-tests/tests/multicast_multi_sub.rs b/xde-tests/tests/multicast_multi_sub.rs index 978b86c3..ab3a0086 100644 --- a/xde-tests/tests/multicast_multi_sub.rs +++ b/xde-tests/tests/multicast_multi_sub.rs @@ -6,15 +6,15 @@ //! XDE multicast multiple subscriber tests. //! -//! These validate TX fanout and forwarding semantics across replication modes: +//! These validate Tx fanout and forwarding semantics across replication modes: //! - Same-sled delivery (DELIVER action) is based purely on subscriptions and -//! independent of Replication mode set for TX. +//! independent of Replication mode set for Tx. //! - External replication sends Geneve to the multicast underlay address for //! delivery to the boundary switch, which then replicates to front-panel ports. //! - Underlay replication sends Geneve to ff04::/16 multicast address for //! sled-to-sled delivery; receiving sleds perform same-sled delivery based on //! local subscriptions. -//! - "Both" replication instructs TX to set bifurcated replication flags +//! - "Both" replication instructs Tx to set bifurcated replication flags //! (External + Underlay) in the Geneve header for switch-side handling, while //! same-sled delivery still occurs independently based on subscriptions. 
@@ -25,6 +25,7 @@ use oxide_vpc::api::DEFAULT_MULTICAST_VNI; use oxide_vpc::api::IpCidr; use oxide_vpc::api::Ipv4Addr; use oxide_vpc::api::Ipv6Addr; +use oxide_vpc::api::MulticastUnderlay; use oxide_vpc::api::NextHopV6; use oxide_vpc::api::Replication; use oxide_vpc::api::Vni; @@ -45,10 +46,11 @@ fn test_multicast_multiple_local_subscribers() -> Result<()> { let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; // M2P mapping - use admin-scoped IPv6 multicast per Omicron constraints - let mcast_underlay = Ipv6Addr::from([ + let mcast_underlay = MulticastUnderlay::new(Ipv6Addr::from([ 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 224, 1, 2, 3, - ]); + ])) + .unwrap(); // Set up multicast state with automatic cleanup on drop let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; @@ -61,10 +63,10 @@ fn test_multicast_multiple_local_subscribers() -> Result<()> { // This test validates packet formatting, not actual multi-sled routing. let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); - // Set up TX forwarding with External replication mode. - // TX behavior: packet sent to underlay with Replication::External flag. + // Set up Tx forwarding with External replication mode. + // Tx behavior: packet sent to underlay with Replication::External flag. // In production, switch receives this flag and replicates to front-panel ports. - // RX behavior: same-sled delivery is controlled by subscriptions, independent + // Rx behavior: same-sled delivery is controlled by subscriptions, independent // of the Replication mode. 
mcast.set_forwarding(vec![( NextHopV6::new(fake_switch_addr, vni), @@ -107,7 +109,7 @@ fn test_multicast_multiple_local_subscribers() -> Result<()> { let mut snoop_b = SnoopGuard::start(&dev_name_b, &filter)?; let mut snoop_c = SnoopGuard::start(&dev_name_c, &filter)?; - // Also snoop underlay to verify unicast Geneve TX to boundary + // Also snoop underlay to verify unicast Geneve Tx to boundary let underlay_dev = "xde_test_sim1"; let mut snoop_underlay = SnoopGuard::start(underlay_dev, "ip6 and udp port 6081")?; @@ -166,7 +168,8 @@ fn test_multicast_multiple_local_subscribers() -> Result<()> { DEFAULT_MULTICAST_VNI ); assert_eq!( - geneve_info.outer_ipv6_dst, mcast_underlay, + geneve_info.outer_ipv6_dst, + Ipv6Addr::from(mcast_underlay), "External replication should use multicast address (outer IPv6 dst)" ); assert_eq!( @@ -189,10 +192,11 @@ fn test_multicast_underlay_replication() -> Result<()> { let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; // M2P mapping - use admin-scoped IPv6 multicast per Omicron constraints - let mcast_underlay = Ipv6Addr::from([ + let mcast_underlay = MulticastUnderlay::new(Ipv6Addr::from([ 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 224, 1, 2, 4, - ]); + ])) + .unwrap(); let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; @@ -201,9 +205,9 @@ fn test_multicast_underlay_replication() -> Result<()> { // Use node B's underlay address as the switch unicast address for routing. let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); - // Set up TX forwarding with Underlay replication mode. - // TX behavior: forward to underlay with multicast encapsulation. - // RX behavior: same-sled delivery to subscribers (none in this test). + // Set up Tx forwarding with Underlay replication mode. + // Tx behavior: forward to underlay with multicast encapsulation. + // Rx behavior: same-sled delivery to subscribers (none in this test). 
mcast.set_forwarding(vec![( NextHopV6::new(fake_switch_addr, vni), Replication::Underlay, @@ -211,8 +215,8 @@ fn test_multicast_underlay_replication() -> Result<()> { // Allow IPv4 multicast traffic via Multicast target // - // Note: We deliberately do NOT subscribe any nodes. This tests TX forwarding - // with zero local subscribers (RX delivery is based on subscriptions, not + // Note: We deliberately do NOT subscribe any nodes. This tests Tx forwarding + // with zero local subscribers (Rx delivery is based on subscriptions, not // Replication) let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); for node in &topol.nodes { @@ -285,7 +289,8 @@ fn test_multicast_underlay_replication() -> Result<()> { DEFAULT_MULTICAST_VNI ); assert_eq!( - geneve_info.outer_ipv6_dst, mcast_underlay, + geneve_info.outer_ipv6_dst, + Ipv6Addr::from(mcast_underlay), "Outer IPv6 dst should be multicast underlay address" ); assert_eq!( @@ -295,7 +300,7 @@ fn test_multicast_underlay_replication() -> Result<()> { ); // Verify NO same-sled delivery (no subscribers = no delivery) - // Note: RX delivery is independent of Replication mode - it's based on subscriptions + // Note: Rx delivery is independent of Replication mode - it's based on subscriptions if let Ok(output) = snoop_local.wait_with_timeout(Duration::from_secs(2)) { let stdout = String::from_utf8_lossy(&output.stdout); panic!( @@ -303,16 +308,16 @@ fn test_multicast_underlay_replication() -> Result<()> { ); } - // Leaf-only RX assertion: start a second underlay snoop and ensure there - // is no additional multicast re-relay after RX. We expect only the single - // TX underlay packet captured above. + // Leaf-only Rx assertion: start a second underlay snoop and ensure there + // is no additional multicast re-relay after Rx. We expect only the single + // Tx underlay packet captured above. 
let mut snoop_underlay_2 = SnoopGuard::start(underlay_dev, "ip6 and udp port 6081")?; if let Ok(out) = snoop_underlay_2.wait_with_timeout(Duration::from_secs(2)) { let stdout = String::from_utf8_lossy(&out.stdout); panic!( - "Expected leaf-only RX (no further underlay relay), got:\n{stdout}" + "Expected leaf-only Rx (no further underlay relay), got:\n{stdout}" ); } @@ -321,7 +326,7 @@ fn test_multicast_underlay_replication() -> Result<()> { #[test] fn test_multicast_both_replication() -> Result<()> { - // Test "Both" replication mode: validates that egress TX (External + Underlay) + // Test "Both" replication mode: validates that egress Tx (External + Underlay) // and local same-sled delivery both occur. let topol = xde_tests::three_node_topology_named("omicron1", "ara", "arb", "arc")?; @@ -332,21 +337,22 @@ fn test_multicast_both_replication() -> Result<()> { let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; // M2P mapping - use admin-scoped IPv6 multicast per Omicron constraints - let mcast_underlay = Ipv6Addr::from([ + let mcast_underlay = MulticastUnderlay::new(Ipv6Addr::from([ 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 224, 1, 2, 5, - ]); + ])) + .unwrap(); let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; // Use node B's underlay address as the switch unicast address for routing. let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); - // Set up TX forwarding with "Both" replication (drives egress encapsulation only) - // TX behavior: packet sent to underlay with Replication::Both flag set. + // Set up Tx forwarding with "Both" replication (drives egress encapsulation only) + // Tx behavior: packet sent to underlay with Replication::Both flag set. // In production, switch receives this and bifurcates: External (to front panel) // + Underlay (sled-to-sled multicast). 
- // RX behavior: same-sled local delivery occurs independently, driven purely by + // Rx behavior: same-sled local delivery occurs independently, driven purely by // port subscriptions (not the replication mode). mcast.set_forwarding(vec![( NextHopV6::new(fake_switch_addr, vni), @@ -437,7 +443,8 @@ fn test_multicast_both_replication() -> Result<()> { DEFAULT_MULTICAST_VNI ); assert_eq!( - geneve_info.outer_ipv6_dst, mcast_underlay, + geneve_info.outer_ipv6_dst, + Ipv6Addr::from(mcast_underlay), "Outer IPv6 dst should be multicast underlay address" ); assert_eq!( @@ -460,10 +467,11 @@ fn test_partial_unsubscribe() -> Result<()> { const MCAST_PORT: u16 = 9999; let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; - let mcast_underlay = Ipv6Addr::from([ + let mcast_underlay = MulticastUnderlay::new(Ipv6Addr::from([ 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 224, 1, 2, 6, - ]); + ])) + .unwrap(); let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; diff --git a/xde-tests/tests/multicast_rx.rs b/xde-tests/tests/multicast_rx.rs index 69fa4c84..13fd59bb 100644 --- a/xde-tests/tests/multicast_rx.rs +++ b/xde-tests/tests/multicast_rx.rs @@ -4,12 +4,12 @@ // Copyright 2025 Oxide Computer Company -//! XDE multicast RX-path tests. +//! XDE multicast Rx-path tests. //! //! These validate that: -//! - Control-plane config (M2P map + forwarding) drives TX encapsulation only. +//! - Control-plane config (M2P map + forwarding) drives Tx encapsulation only. //! - Same-sled delivery is based purely on subscriptions and is independent of -//! the Replication mode set for TX. +//! the Replication mode set for Tx. //! - Underlay multicast uses admin-local IPv6 (ff04::/16) and routes via the //! host underlay interface. //! 
- Packets received from the underlay are delivered to subscribed ports and @@ -20,6 +20,7 @@ use opte_ioctl::OpteHdl; use oxide_vpc::api::IpCidr; use oxide_vpc::api::Ipv4Addr; use oxide_vpc::api::Ipv6Addr; +use oxide_vpc::api::MulticastUnderlay; use oxide_vpc::api::NextHopV6; use oxide_vpc::api::Replication; use oxide_vpc::api::Vni; @@ -40,7 +41,8 @@ fn test_xde_multicast_rx_ipv4() -> Result<()> { // M2P mapping: overlay layer needs IPv6 multicast underlay address // Use admin-scoped IPv6 multicast per Omicron's map_external_to_underlay_ip() // Maps IPv4 multicast to ff04::/16 (admin-local scope) + IPv4 address - let mcast_underlay: Ipv6Addr = "ff04::e000:fb".parse().unwrap(); + let mcast_underlay = + MulticastUnderlay::new("ff04::e000:fb".parse().unwrap()).unwrap(); // Set up multicast group with automatic cleanup on drop let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; @@ -51,9 +53,9 @@ fn test_xde_multicast_rx_ipv4() -> Result<()> { // Note: This is a single-sled test; all nodes share one underlay network. let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); - // Set up TX forwarding with Underlay replication to test underlay RX path. + // Set up Tx forwarding with Underlay replication to test underlay Rx path. // This causes packets to be sent to the underlay multicast address, then - // received back via the underlay RX path for same-sled delivery. + // received back via the underlay Rx path for same-sled delivery. 
mcast.set_forwarding(vec![( NextHopV6::new(fake_switch_addr, vni), Replication::Underlay, @@ -95,7 +97,7 @@ fn test_xde_multicast_rx_ipv4() -> Result<()> { s_entry.ports ); - // Assert forwarding table contains expected next-hop + replication + // Assert forwarding table contains expected next hop + replication let mfwd = hdl.dump_mcast_fwd()?; let entry = mfwd .entries @@ -112,7 +114,7 @@ fn test_xde_multicast_rx_ipv4() -> Result<()> { entry.next_hops ); - // Start snoop on RX side (matches IPv6 test pattern) + // Start snoop on Rx side (matches IPv6 test pattern) let dev_name_b = topol.nodes[1].port.name().to_string(); let filter = format!("udp and ip dst {mcast_group} and port {MCAST_PORT}"); let mut snoop_rx = SnoopGuard::start(&dev_name_b, &filter)?; @@ -127,7 +129,7 @@ fn test_xde_multicast_rx_ipv4() -> Result<()> { payload, )?; - // Wait for RX snoop to capture the packet (or timeout) + // Wait for Rx snoop to capture the packet (or timeout) let snoop_rx_output = snoop_rx.wait_with_timeout(Duration::from_secs(5))?; let stdout = String::from_utf8_lossy(&snoop_rx_output.stdout); @@ -151,7 +153,7 @@ fn test_xde_multicast_rx_ipv4() -> Result<()> { stdout.contains("test"), "expected payload substring 'test' in ASCII portion of snoop output:\n{stdout}" ); - // L2 dest: with current XDE/gateway pipeline, multicast RX to guests + // L2 dest: with current XDE/gateway pipeline, multicast Rx to guests // is delivered with broadcast dest MAC. snoop shows 16-bit grouped hex. 
assert!( stdout.to_ascii_lowercase().contains("ffff ffff ffff"), @@ -206,7 +208,8 @@ fn test_xde_multicast_rx_ipv6() -> Result<()> { let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; // M2P mapping: Use same admin-local address for underlay - let mcast_underlay: Ipv6Addr = "ff04::1:3".parse().unwrap(); + let mcast_underlay = + MulticastUnderlay::new("ff04::1:3".parse().unwrap()).unwrap(); // Set up multicast group with automatic cleanup on drop let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; @@ -217,9 +220,9 @@ fn test_xde_multicast_rx_ipv6() -> Result<()> { // Note: This is a single-sled test; all nodes share one underlay network. let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); - // Set up TX forwarding with Underlay replication to test underlay RX path. + // Set up Tx forwarding with Underlay replication to test underlay Rx path. // This causes packets to be sent to the underlay multicast address, then - // received back via the underlay RX path for same-sled delivery. + // received back via the underlay Rx path for same-sled delivery. 
mcast.set_forwarding(vec![( NextHopV6::new(fake_switch_addr, vni), Replication::Underlay, @@ -285,9 +288,10 @@ fn test_reject_link_local_underlay_ff02() -> Result<()> { let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); let link_local_underlay: Ipv6Addr = "ff02::e001:263".parse().unwrap(); + let underlay = MulticastUnderlay::new_unchecked(link_local_underlay); let result = hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { group: mcast_group.into(), - underlay: link_local_underlay, + underlay, }); assert!( result.is_err(), @@ -303,9 +307,10 @@ fn test_reject_global_underlay_ff0e() -> Result<()> { let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); let global_underlay: Ipv6Addr = "ff0e::e001:263".parse().unwrap(); + let underlay = MulticastUnderlay::new_unchecked(global_underlay); let result = hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { group: mcast_group.into(), - underlay: global_underlay, + underlay, }); assert!( result.is_err(), @@ -318,7 +323,8 @@ fn test_reject_global_underlay_ff0e() -> Result<()> { #[test] fn test_accept_admin_local_underlay_ff04() -> Result<()> { let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); - let admin_local: Ipv6Addr = "ff04::e001:263".parse().unwrap(); + let admin_local = + MulticastUnderlay::new("ff04::e001:263".parse().unwrap()).unwrap(); // MulticastGroup::new calls set_m2p internally and cleans up on drop. 
// This test verifies that admin-local (ff04::/16) addresses are accepted, @@ -342,7 +348,8 @@ fn test_multicast_config_no_spurious_traffic() -> Result<()> { let mcast_group = Ipv4Addr::from([224, 1, 2, 200]); let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; - let mcast_underlay: Ipv6Addr = "ff04::e001:2c8".parse().unwrap(); + let mcast_underlay = + MulticastUnderlay::new("ff04::e001:2c8".parse().unwrap()).unwrap(); let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; diff --git a/xde-tests/tests/multicast_validation.rs b/xde-tests/tests/multicast_validation.rs index ae346f12..68393059 100644 --- a/xde-tests/tests/multicast_validation.rs +++ b/xde-tests/tests/multicast_validation.rs @@ -20,7 +20,9 @@ use oxide_vpc::api::IpCidr; use oxide_vpc::api::Ipv4Addr; use oxide_vpc::api::Ipv6Addr; use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastUnsubscribeAllReq; use oxide_vpc::api::McastUnsubscribeReq; +use oxide_vpc::api::MulticastUnderlay; use oxide_vpc::api::NextHopV6; use oxide_vpc::api::Replication; use oxide_vpc::api::Vni; @@ -50,12 +52,15 @@ fn test_subscribe_ff04_direct_without_m2p() -> Result<()> { xde_tests::two_node_topology_named("omicron1", "ff04a", "ff04b")?; // IPv6 admin-scoped multicast (ff04::/16) - already an underlay address - let underlay_mcast = Ipv6Addr::from([ + let underlay_mcast = MulticastUnderlay::new(Ipv6Addr::from([ 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 224, 1, 2, 99, - ]); + ])) + .unwrap(); - let res = topol.nodes[0].port.subscribe_multicast(underlay_mcast.into()); + let res = topol.nodes[0] + .port + .subscribe_multicast(Ipv6Addr::from(underlay_mcast).into()); assert!( res.is_ok(), @@ -125,10 +130,11 @@ fn test_double_subscribe() -> Result<()> { const MCAST_PORT: u16 = 9999; // Avoid mDNS port 5353 let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; - let underlay = Ipv6Addr::from([ + let underlay = MulticastUnderlay::new(Ipv6Addr::from([ 0xff, 0x04, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 224, 1, 2, 101, - ]); + ])) + .unwrap(); let mcast = MulticastGroup::new(mcast_group.into(), underlay)?; @@ -222,10 +228,11 @@ fn test_subscribe_then_clear_m2p() -> Result<()> { const MCAST_PORT: u16 = 9999; // Avoid mDNS port 5353 let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; - let underlay = Ipv6Addr::from([ + let underlay = MulticastUnderlay::new(Ipv6Addr::from([ 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 224, 1, 2, 103, - ]); + ])) + .unwrap(); let mcast = MulticastGroup::new(mcast_group.into(), underlay)?; @@ -291,14 +298,15 @@ fn test_set_mcast_fwd_rejects_non_default_vni() -> Result<()> { let hdl = OpteHdl::open()?; let mcast_group = Ipv4Addr::from([224, 1, 2, 200]); - let underlay = Ipv6Addr::from([ + let underlay = MulticastUnderlay::new(Ipv6Addr::from([ 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 224, 1, 2, 200, - ]); + ])) + .unwrap(); let _mcast = MulticastGroup::new(mcast_group.into(), underlay)?; - // Use a non-default VNI and multicast next-hop address checks separately + // Use a non-default VNI and multicast next hop address checks separately let bad_vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI + 1)?; let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); @@ -321,14 +329,15 @@ fn test_set_mcast_fwd_rejects_multicast_next_hop() -> Result<()> { let hdl = OpteHdl::open()?; let mcast_group = Ipv4Addr::from([224, 1, 2, 201]); - let underlay = Ipv6Addr::from([ + let underlay = MulticastUnderlay::new(Ipv6Addr::from([ 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 224, 1, 2, 201, - ]); + ])) + .unwrap(); let _mcast = MulticastGroup::new(mcast_group.into(), underlay)?; - // Use a multicast address for next-hop (invalid) + // Use a multicast address for next hop (invalid) let bad_next_hop: Ipv6Addr = "ff04::1".parse().unwrap(); let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; @@ 
-340,7 +349,7 @@ fn test_set_mcast_fwd_rejects_multicast_next_hop() -> Result<()> { )], }); - assert!(res.is_err(), "set_mcast_fwd should reject multicast next-hop"); + assert!(res.is_err(), "set_mcast_fwd should reject multicast next hop"); Ok(()) } @@ -383,17 +392,18 @@ fn test_unsubscribe_ipv6_non_underlay_scopes() -> Result<()> { #[test] fn test_multiple_nexthops_accumulate() -> Result<()> { - // Test that set_forwarding accumulates next-hops like `swadm route add`: - // - Same underlay + different next-hop → add - // - Same underlay + same next-hop → replace replication mode + // Test that set_forwarding accumulates next hops like `swadm route add`: + // - Same underlay + different next hop → add + // - Same underlay + same next hop → replace replication mode let topol = xde_tests::two_node_topology_named("omicron1", "mnha", "mnhb")?; let mcast_group = Ipv4Addr::from([224, 1, 2, 104]); let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; - let underlay = Ipv6Addr::from([ + let underlay = MulticastUnderlay::new(Ipv6Addr::from([ 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 224, 1, 2, 104, - ]); + ])) + .unwrap(); let mcast = MulticastGroup::new(mcast_group.into(), underlay)?; @@ -412,7 +422,7 @@ fn test_multiple_nexthops_accumulate() -> Result<()> { .iter() .find(|e| e.underlay == underlay) .expect("missing forwarding entry"); - assert_eq!(entry.next_hops.len(), 1, "Expected 1 next-hop after first set"); + assert_eq!(entry.next_hops.len(), 1, "Expected 1 next hop after first set"); assert_eq!(entry.next_hops[0].0.addr, switch_a); assert_eq!(entry.next_hops[0].1, Replication::External); @@ -430,7 +440,7 @@ fn test_multiple_nexthops_accumulate() -> Result<()> { assert_eq!( entry.next_hops.len(), 2, - "Expected 2 next-hops after second set" + "Expected 2 next hops after second set" ); let nexthop_a = entry @@ -469,7 +479,7 @@ fn test_multiple_nexthops_accumulate() -> Result<()> { assert_eq!( entry.next_hops.len(), 2, - "Expected 2 
next-hops after updating switch_a" + "Expected 2 next hops after updating switch_a" ); let nexthop_a = entry @@ -496,3 +506,101 @@ fn test_multiple_nexthops_accumulate() -> Result<()> { Ok(()) } + +#[test] +fn test_unsubscribe_all() -> Result<()> { + let topol = + xde_tests::two_node_topology_named("omicron1", "ualla", "uallb")?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 105]); + + let underlay = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 105, + ])) + .unwrap(); + + let _mcast = MulticastGroup::new(mcast_group.into(), underlay)?; + + // Subscribe both ports + topol.nodes[0] + .port + .subscribe_multicast(mcast_group.into()) + .expect("port 0 subscribe should succeed"); + + topol.nodes[1] + .port + .subscribe_multicast(mcast_group.into()) + .expect("port 1 subscribe should succeed"); + + // Verify both ports are subscribed + let hdl = OpteHdl::open()?; + let subs = hdl.dump_mcast_subs()?; + let entry = subs + .entries + .iter() + .find(|e| e.underlay == underlay) + .expect("missing multicast subscription entry for group"); + + let p0 = topol.nodes[0].port.name().to_string(); + let p1 = topol.nodes[1].port.name().to_string(); + assert_eq!( + entry.ports.len(), + 2, + "Expected 2 ports subscribed before unsubscribe_all" + ); + assert!( + entry.ports.contains(&p0), + "expected {p0} to be subscribed; got {:?}", + entry.ports + ); + assert!( + entry.ports.contains(&p1), + "expected {p1} to be subscribed; got {:?}", + entry.ports + ); + + // Unsubscribe all ports from the group + let res = hdl.mcast_unsubscribe_all(&McastUnsubscribeAllReq { + group: mcast_group.into(), + }); + assert!(res.is_ok(), "mcast_unsubscribe_all should succeed, got: {res:?}"); + + // Verify no ports are subscribed + let subs = hdl.dump_mcast_subs()?; + let entry = subs.entries.iter().find(|e| e.underlay == underlay); + assert!( + entry.is_none(), + "Expected no subscription entry after unsubscribe_all, found: 
{entry:?}" + ); + + // Verify idempotence: calling again should succeed + let res = hdl.mcast_unsubscribe_all(&McastUnsubscribeAllReq { + group: mcast_group.into(), + }); + assert!( + res.is_ok(), + "mcast_unsubscribe_all should be idempotent, got: {res:?}" + ); + + Ok(()) +} + +#[test] +fn test_unsubscribe_all_without_m2p() -> Result<()> { + let _topol = + xde_tests::two_node_topology_named("omicron1", "uanm2pa", "uanm2pb")?; + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 106]); + + // Without M2P mapping, unsubscribe_all should be idempotent and succeed + let res = hdl.mcast_unsubscribe_all(&McastUnsubscribeAllReq { + group: mcast_group.into(), + }); + + assert!( + res.is_ok(), + "mcast_unsubscribe_all without M2P should succeed (idempotent), got: {res:?}" + ); + + Ok(()) +} diff --git a/xde/src/dev_map.rs b/xde/src/dev_map.rs index 372d8ed1..01d3727f 100644 --- a/xde/src/dev_map.rs +++ b/xde/src/dev_map.rs @@ -12,8 +12,8 @@ use alloc::collections::btree_set::BTreeSet; use alloc::string::String; use alloc::sync::Arc; use alloc::vec::Vec; -use opte::api::IpAddr; use opte::api::MacAddr; +use opte::api::MulticastUnderlay; use opte::api::OpteError; use opte::api::Vni; use opte::ddi::sync::KRwLock; @@ -41,9 +41,10 @@ impl VniMac { /// Shared ownership of an XDE port. /// -/// Using `Arc` ensures that ports remain live as long as any -/// `DevMap` snapshot references them, even if the port is removed from -/// the canonical map. This prevents use-after-free in concurrent delivery paths. +/// `Arc` provides shared ownership within a `DevMap`. Safety during +/// concurrent operations comes from callers holding read locks on the `DevMap` +/// for the duration of packet processing, which prevents port removal from +/// completing while any handler is active. type Dev = Arc; /// `BTreeMap`-accelerated lookup of XDE ports. 
@@ -61,7 +62,14 @@ type Dev = Arc; pub struct DevMap { devs: BTreeMap, names: BTreeMap, - mcast_groups: BTreeMap>, + /// Subscriptions keyed by underlay IPv6 multicast group (admin-scoped ff04::/16). + /// This table is sled-local and independent of any per-VPC VNI. VNI validation + /// and VPC isolation are enforced during inbound overlay decapsulation on the + /// destination port, not here. + /// + /// Rationale: multicast groups are fleet-wide; ports opt-in to receive a given + /// underlay group, and the overlay layer subsequently filters by VNI as appropriate. + mcast_groups: BTreeMap>, } impl Default for DevMap { @@ -89,10 +97,11 @@ impl DevMap { } /// Remove an `XdeDev` using its name. + /// + /// This also cleans up all multicast subscriptions for the removed port. pub fn remove(&mut self, name: &str) -> Option { let key = get_key(&self.names.remove(name)?); - // Clean up all multicast group subscriptions for this port self.mcast_groups.retain(|_group, subscribers| { subscribers.remove(&key); !subscribers.is_empty() @@ -109,21 +118,15 @@ impl DevMap { pub fn mcast_subscribe( &mut self, name: &str, - mcast_ip: IpAddr, + mcast_underlay: MulticastUnderlay, ) -> Result<(), OpteError> { - // Validate that the IP is actually a multicast address - if !mcast_ip.is_multicast() { - return Err(OpteError::BadState(format!( - "IP address {mcast_ip} is not a multicast address" - ))); - } - let port = self - .get_by_name(name) + .names + .get(name) .ok_or_else(|| OpteError::PortNotFound(name.into()))?; let key = get_key(port); - self.mcast_groups.entry(mcast_ip).or_default().insert(key); + self.mcast_groups.entry(mcast_underlay).or_default().insert(key); Ok(()) } @@ -132,26 +135,32 @@ impl DevMap { pub fn mcast_unsubscribe( &mut self, name: &str, - mcast_ip: IpAddr, + mcast_underlay: MulticastUnderlay, ) -> Result<(), OpteError> { let port = self - .get_by_name(name) + .names + .get(name) .ok_or_else(|| OpteError::PortNotFound(name.into()))?; let key = get_key(port); - 
if let Entry::Occupied(set) = self.mcast_groups.entry(mcast_ip) { + if let Entry::Occupied(set) = self.mcast_groups.entry(mcast_underlay) { set.into_mut().remove(&key); } Ok(()) } + /// Unsubscribe all ports from a given underlay multicast group. + pub fn mcast_unsubscribe_all(&mut self, mcast_underlay: MulticastUnderlay) { + self.mcast_groups.remove(&mcast_underlay); + } + /// Find the keys for all ports who want to receive a given multicast packet. pub fn mcast_listeners( &self, - mcast_ip: &IpAddr, + mcast_underlay: &MulticastUnderlay, ) -> Option> { - self.mcast_groups.get(mcast_ip).map(|v| v.iter()) + self.mcast_groups.get(mcast_underlay).map(|v| v.iter()) } /// Returns true if any multicast subscribers exist on this sled. @@ -194,11 +203,11 @@ impl DevMap { /// /// Any chains without a matching port are dropped. /// - /// Safety: This is safe to call even if ports are being concurrently - /// removed from the canonical `DevMap`, because callers hold an - /// `Arc` which contains `Arc` entries. The Arc reference - /// chain ensures all ports in this snapshot remain live for the duration of - /// delivery. + /// Safety: Callers must hold a read lock on this `DevMap` for the duration + /// of delivery. This prevents port removal from tearing down DLS/MAC + /// resources while delivery is in progress—management operations attempting + /// to remove a port will block when trying to acquire the write lock to + /// update the map. #[inline] pub fn deliver_all(&self, postbox: Postbox) { for (k, v) in postbox.drain() { @@ -209,7 +218,9 @@ impl DevMap { } /// Dump all multicast subscriptions as a vector of (group, ports) pairs. 
- pub fn dump_mcast_subscriptions(&self) -> Vec<(IpAddr, Vec)> { + pub fn dump_mcast_subscriptions( + &self, + ) -> Vec<(MulticastUnderlay, Vec)> { let mut out = Vec::new(); for (group, subs) in self.mcast_groups.iter() { let ports: Vec = subs @@ -217,7 +228,7 @@ impl DevMap { .filter_map(|vm| self.devs.get(vm)) .map(|d| d.devname.clone()) .collect(); - out.push((group.clone(), ports)); + out.push((*group, ports)); } out } diff --git a/xde/src/stats.rs b/xde/src/stats.rs index a7a1d498..ffed3f32 100644 --- a/xde/src/stats.rs +++ b/xde/src/stats.rs @@ -65,25 +65,25 @@ pub struct XdeStats { /// (unicast to boundary service for front panel egress). mcast_tx_external: KStatU64, /// The number of times a stale multicast listener was encountered - /// during local same-sled delivery (TX path). + /// during local same-sled delivery (Tx path). mcast_tx_stale_local: KStatU64, /// The number of multicast packets sent with no forwarding entry - /// in the mcast_fwd table (TX path). + /// in the mcast_fwd table (Tx path). mcast_tx_no_fwd_entry: KStatU64, /// The number of multicast packets received and delivered to local guest /// instances on this sled (decapsulated packets to same-sled OPTE ports). mcast_rx_local: KStatU64, /// The number of times a stale multicast listener was encountered - /// during local same-sled delivery (RX path). + /// during local same-sled delivery (Rx path). mcast_rx_stale_local: KStatU64, /// The number of multicast packets received with no local subscribers /// (no matching same-sled listeners for the multicast group). mcast_rx_no_subscribers: KStatU64, - /// The number of times a pullup operation failed during multicast TX + /// The number of times a pullup operation failed during multicast Tx /// (packet replication), causing a packet to be dropped. 
mcast_tx_pullup_fail: KStatU64, - /// The number of times a pullup operation failed during multicast RX + /// The number of times a pullup operation failed during multicast Rx /// (packet delivery/relay), causing a packet to be dropped. mcast_rx_pullup_fail: KStatU64, } diff --git a/xde/src/xde.rs b/xde/src/xde.rs index fab0c957..26ccf5b2 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -65,70 +65,55 @@ //! For this reason, we provide the datapath entrypoints with read-only shared //! copies of the central [`DevMap`]. //! * For Rx entrypoints, we allocate a `Vec>>`. Each CPU -//! on the system has its own slot within this `Vec`, such that there should -//! never be lock contention unless a port is being added/removed. The CPU ID -//! is then used as an index into this table, the Arc is cloned, and the lock -//! is dropped immediately. This makes readers lock-free and avoids blocking -//! management refreshes. -//! - Safety: The cloned `Arc` keeps all [`XdeDev`]s in that snapshot -//! alive ([`DevMap`] contains `Arc` entries), ensuring that delivery -//! via [`deliver_all()`](DevMap::deliver_all) always operates on live ports. -//! Physical mutex is held only during Arc clone (single atomic increment), -//! then dropped. +//! on the system has its own slot within this `Vec`, such that lock +//! contention only occurs when a port is being added/removed. The CPU ID is +//! used as an index into this table, the lock is acquired, and held for the +//! duration of packet processing (including delivery via +//! [`deliver_all()`](DevMap::deliver_all)), as all packet deliveries require +//! a live `XdeDev`. This prevents port removal from completing while any Rx +//! handler is active. //! * For Tx entrypoints, each `XdeDev` holds a per-port `KRwLock>`. -//! We prefer an RwLock here over a Mutex given that we can be called from -//! multiple threads, and our callers are not expected to be bound to a given -//! CPU. //! 
- Unicast to remote host: No `DevMap` needed, packets go directly to //! underlay. -//! - Hairpin (same-host unicast): Lazily clone per-port `DevMap` Arc for +//! - Hairpin (same-host unicast): Hold per-port `DevMap` read lock for //! local delivery. -//! - Multicast: Clone per-CPU `mcast_fwd` Arc once at start. Lazily clone -//! per-port `DevMap` Arc only if local subscribers exist. +//! - Multicast: Hold per-port `mcast_fwd` and `DevMap` read locks for the +//! duration of Tx processing (replication + local delivery). +//! We prefer an RwLock here over a Mutex given that we can be called from +//! multiple threads, and our callers are not expected to be bound to a given +//! CPU. //! -//! Cloning the Arc (rather than holding read/lock guards) eliminates re-entrant -//! read deadlock risk and avoids blocking management operations for the duration -//! of packet chains. The cloned Arc ensures that no Rx/Tx contexts will attempt -//! to send a packet to a port which has been (or is being) removed -- holding -//! the Arc keeps the [`DevMap`] snapshot alive until packet processing is complete. -//! Since [`DevMap`] contains `Arc` entries, the Arc reference chain -//! guarantees all ports in the snapshot remain live throughout delivery (e.g., -//! [`deliver_all()`](DevMap::deliver_all)), preventing use-after-free even if -//! ports are concurrently removed from the canonical mapping. +//! Read locks are held for the duration of packet processing to prevent +//! use-after-free. Management operations attempting to remove a port will block +//! when acquiring the write lock to update the map, ensuring no Rx/Tx context +//! can hold references to a port while its DLS/MAC datapath is being torn down. +//! The lock hold time is bounded to packet processing duration. //! //! In the Rx case, loopback delivery or MAC->CPU oversubscription present some //! risk of contention. These are not expected paths in the product, but using //! them does not impact correctness. //! //! 
The remaining locking risk is double-locking a given Rx Mutex by the same -//! thread during the brief Arc clone operation. This results in a panic, but can -//! only happen if we transit the NIC's Rx path twice in the same stack (i.e. -//! Rx on NIC -> mac_rx on the OPTE port -> ... -> loopback delivery to underlay -//! device). This should be impossible, given that any packet sent upstack by XDE -//! must have a MAC address belonging to the OPTE port. -//! -//! The previous re-entrant read deadlock risk (`read[xde_mc_tx] -> write[ioctl] -//! -> read[xde_mc_tx]`) has been eliminated by using Arc clones instead of held -//! read guards. Once the Arc is cloned and the lock is dropped, subsequent -//! re-entries will acquire a fresh lock without conflict. Hairpin exchanges -//! (e.g., ARP -> ICMP ping, DHCP) can safely create deep stacks of the form -//! `(ip) -> xde_mc_tx -> (ip) -> xde_mc_tx -> ...` when used with zones. +//! thread during packet processing. This results in a panic, but can only +//! happen if we transit the NIC's Rx path twice in the same stack (i.e. Rx on +//! NIC -> mac_rx on the OPTE port -> ... -> loopback delivery to underlay +//! device). This should be impossible, given that any packet sent upstack by +//! XDE must have a MAC address belonging to the OPTE port. //! //! Note: -//! - We cannot afford to take the management lock (`TokenLock`) during any +//! - We cannot afford to take the management lock ([`TokenLock`]) during any //! dataplane operation. If a dataplane path ever needs to consult the //! central source of truth directly, the minimally acceptable pattern is a //! read of `state.devs.read()` (never the management token itself). In -//! practice, to further reduce contention on readers counters we avoid even -//! this by using per-CPU cached `Arc` snapshots for both RX and TX. +//! practice, to further reduce contention on reader counters we avoid even +//! this by using per-CPU cached `Arc` snapshots for Rx and per-port +//! 
`Arc` snapshots for Tx. Both are updated by `refresh_maps()` +//! whenever the canonical map changes. //! - Multicast forwarding state (`mcast_fwd`) follows the same model: a copy -//! is kept in each [`PerEntryState`] (per-CPU) and updated by `refresh_maps()` -//! whenever the canonical forwarding table changes. This ensures RX/TX always -//! observe a coherent snapshot without taking the management lock. We do not -//! maintain per-port copies (those were removed to avoid per-port RwLock -//! contention issues). +//! is kept per-port, updated by `refresh_maps()` whenever the canonical +//! forwarding table changes. //! -//! ### `TokenLock` and [`DevMap`] updates +//! ### [`TokenLock`] and [`DevMap`] updates //! The `TokenLock` primitive provides us with logical mutual exclusion around //! the underlay and the ability to modify the canonical [`DevMap`] -- without //! holding a `KMutex`. Management operations made by OPTE *will* upcall -- we @@ -137,23 +122,19 @@ //! threads trying to take the management lock must be able to take, e.g., //! a SIGSTOP. //! -//! Whenever the central [`DevMap`] is modified, we iterate through each reachable -//! [`XdeDev`] and underlay port, and for every instance of the cloned [`DevMap`] -//! and `mcast_fwd` we write()/lock() that entry, replace it with the new -//! contents, and drop the lock. This ensures that port removal cannot fully -//! proceed until the port is no longer usable from any Tx/Rx context and that -//! multicast delivery and forwarding use the matching snapshot. -//! -//! ### Teardown and reference cycles -//! The Arc-cloning strategy creates a reference cycle during normal operation: -//! underlay port → stream → ports_map (per-CPU) → [`DevMap`] → [`XdeDev`] → underlay port. -//! This is benign during operation but must be broken during teardown. +//! Whenever the central [`DevMap`] is modified, we call [`refresh_maps()`] +//! which iterates through each reachable [`XdeDev`] and underlay port. For +//! 
every instance of the [`DevMap`] Arc, we acquire the write lock (blocking if +//! Tx/Rx holds a read lock), swap the Arc, and release the write lock. This +//! ensures that port removal cannot fully proceed until no Tx/Rx context holds +//! references to the port. //! +//! ### Teardown //! When `clear_xde_underlay()` is called (after all ports have been removed), -//! we explicitly clear per-CPU cached `DevMap`s by replacing them with empty -//! snapshots. This breaks the cycle and allows underlay port Arcs to be unwrapped. -//! If brief in-flight TX chains still hold `DevMap` references, the unwrap returns -//! EBUSY and the caller can retry. Refs drain quickly once caches are cleared. +//! all per-CPU and per-port [`DevMap`] snapshots contain no ports (updated by +//! the final `refresh_maps()` calls during port deletion). The management lock +//! ensures no concurrent modifications, allowing underlay port Arcs to be +//! safely unwrapped. use crate::dev_map::DevMap; use crate::dev_map::ReadOnlyDevMap; @@ -162,6 +143,8 @@ use crate::dls; use crate::dls::DlsStream; use crate::dls::LinkId; use crate::ioctl::IoctlEnvelope; +use crate::ip::AF_INET; +use crate::ip::AF_INET6; use crate::mac; use crate::mac::ChecksumOffloadCapabs; use crate::mac::MacClient; @@ -204,17 +187,16 @@ use core::ptr; use core::ptr::NonNull; use core::ptr::addr_of; use core::ptr::addr_of_mut; -use core::sync::atomic::AtomicBool; -use core::sync::atomic::Ordering; use core::time::Duration; use illumos_sys_hdrs::mac::MacEtherOffloadFlags; use illumos_sys_hdrs::mac::MblkOffloadFlags; -use illumos_sys_hdrs::mac::mac_ether_offload_info_t; use illumos_sys_hdrs::*; use ingot::geneve::Geneve; use ingot::geneve::GeneveOpt; use ingot::geneve::GeneveRef; +use ingot::geneve::ValidGeneve; use ingot::types::HeaderLen; +use ingot::types::HeaderParse; use opte::ExecCtx; use opte::api::ClearLftReq; use opte::api::ClearUftReq; @@ -229,6 +211,7 @@ use opte::api::DumpUftResp; use opte::api::ListLayersReq; use 
opte::api::ListLayersResp; use opte::api::MacAddr; +use opte::api::MulticastUnderlay; use opte::api::NoResp; use opte::api::OpteCmd; use opte::api::OpteCmdIoctl; @@ -243,11 +226,10 @@ use opte::ddi::mblk::MsgBlk; use opte::ddi::mblk::MsgBlkChain; use opte::ddi::sync::KMutex; use opte::ddi::sync::KRwLock; +use opte::ddi::sync::KRwLockReadGuard; use opte::ddi::sync::KRwLockWriteGuard; use opte::ddi::sync::TokenGuard; use opte::ddi::sync::TokenLock; -use opte::ddi::sync::clone_from_mutex; -use opte::ddi::sync::clone_from_rwlock; use opte::ddi::time::Interval; use opte::ddi::time::Periodic; use opte::engine::NetworkImpl; @@ -289,6 +271,7 @@ use oxide_vpc::api::ListPortsResp; use oxide_vpc::api::McastForwardingEntry; use oxide_vpc::api::McastSubscribeReq; use oxide_vpc::api::McastSubscriptionEntry; +use oxide_vpc::api::McastUnsubscribeAllReq; use oxide_vpc::api::McastUnsubscribeReq; use oxide_vpc::api::NextHopV6; use oxide_vpc::api::PhysNet; @@ -308,6 +291,7 @@ use oxide_vpc::engine::VpcParser; use oxide_vpc::engine::firewall; use oxide_vpc::engine::gateway; use oxide_vpc::engine::geneve::MssInfoRef; +use oxide_vpc::engine::geneve::OxideOptions; use oxide_vpc::engine::geneve::ValidOxideOption; use oxide_vpc::engine::nat; use oxide_vpc::engine::overlay; @@ -316,9 +300,9 @@ use oxide_vpc::engine::router; const ETHERNET_MTU: u16 = 1500; // Type alias for multicast forwarding table: -// Maps IPv6 destination addresses to their next-hop replication entries. +// Maps IPv6 destination addresses to their next hop replication entries. type McastForwardingTable = - BTreeMap>; + BTreeMap>; // Entry limits for the various flow tables. 
const FW_FT_LIMIT: NonZeroU32 = NonZeroU32::new(8096).unwrap(); @@ -416,6 +400,11 @@ unsafe extern "C" { group: uintptr_t, vni: uintptr_t, ); + pub safe fn __dtrace_probe_mcast__unsubscribe__all( + af: uintptr_t, + group: uintptr_t, + vni: uintptr_t, + ); // Multicast dataplane problem probes pub safe fn __dtrace_probe_mcast__tx__pullup__fail(len: uintptr_t); @@ -613,6 +602,11 @@ pub struct XdeDev { // This is kept under an RwLock because we need to deliver // from potentially one or more threads unbound to a particular CPU. port_map: KRwLock>, + + // Each port has its own copy of the multicast forwarding table. + // Used in Tx path (which is not CPU-pinned), so stored per-port rather + // than per-CPU. + mcast_fwd: KRwLock>, } impl XdeDev { @@ -647,12 +641,7 @@ pub enum UnderlayIndex { #[repr(C)] struct PerEntryState { devs: KMutex>, - mcast_fwd: KRwLock>, - /// Fast-path check: `true` if any multicast subscribers exist on this sled. - /// Allows skipping DevMap lock entirely for multicast when no local listeners. - /// Updated by refresh_maps() on port add/remove. 
- has_mcast_subscribers: AtomicBool, - _pad: [u8; 31], + _pad: [u8; 48], } const _: () = assert!( @@ -662,12 +651,7 @@ const _: () = assert!( impl Default for PerEntryState { fn default() -> Self { - Self { - devs: KMutex::new(Arc::new(DevMap::new())), - mcast_fwd: KRwLock::new(Arc::new(BTreeMap::new())), - has_mcast_subscribers: AtomicBool::new(false), - _pad: [0u8; 31], - } + Self { devs: KMutex::new(Arc::new(DevMap::new())), _pad: [0u8; 48] } } } @@ -1055,6 +1039,11 @@ unsafe extern "C" fn xde_ioc_opte_cmd(karg: *mut c_void, mode: c_int) -> c_int { hdlr_resp(&mut env, resp) } + OpteCmd::McastUnsubscribeAll => { + let resp = mcast_unsubscribe_all_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } + OpteCmd::SetMcast2Phys => { let resp = set_m2p_hdlr(&mut env); hdlr_resp(&mut env, resp) @@ -1167,6 +1156,7 @@ fn create_xde(req: &CreateXdeReq) -> Result { underlay_capab, routes: RouteCache::default(), port_map: KRwLock::new(Default::default()), + mcast_fwd: KRwLock::new(Arc::new(BTreeMap::new())), }); let xde_ref = Arc::get_mut(&mut xde).expect("only one instance of XDE exists"); @@ -1372,7 +1362,6 @@ fn refresh_maps( ) { let new_map = Arc::new(devs.clone()); let new_mcast_fwd = Arc::new(mcast_fwd.read().clone()); - let has_subscribers = new_map.has_mcast_subscribers(); // Update both underlay ports' per-CPU caches (u1 and u2). // Each underlay port has a Vec with one entry per CPU. @@ -1380,28 +1369,21 @@ fn refresh_maps( [&underlay.u1.stream.ports_map, &underlay.u2.stream.ports_map]; for per_cpu_map in underlay_ports { for entry in per_cpu_map { - { - let mut map = entry.devs.lock(); - *map = Arc::clone(&new_map); - } - { - let mut mcast = entry.mcast_fwd.write(); - *mcast = Arc::clone(&new_mcast_fwd); - } - // Update fast-path flag for multicast optimization. - // Relaxed ordering is fine: stale reads are safe. If a CPU sees - // stale `false`, it skips obtaining DevMap entirely (no subscribers). - // If it sees stale `true`, it clones DevMap Arc and checks. 
- entry - .has_mcast_subscribers - .store(has_subscribers, Ordering::Relaxed); + let mut map = entry.devs.lock(); + *map = Arc::clone(&new_map); } } - // Update all ports' per-port maps. + // Update all ports' per-port maps and multicast state. for port in new_map.iter() { - let mut map = port.port_map.write(); - *map = Arc::clone(&new_map); + { + let mut map = port.port_map.write(); + *map = Arc::clone(&new_map); + } + { + let mut mcast = port.mcast_fwd.write(); + *mcast = Arc::clone(&new_mcast_fwd); + } } } @@ -1462,110 +1444,53 @@ fn clear_xde_underlay() -> Result { }); } - // Clear multicast forwarding table to release any references + // Clear multicast forwarding table token.mcast_fwd.write().clear(); - // Before taking ownership of the underlay Arcs, clear per-CPU cached - // `DevMap`s and multicast forwarding tables (in underlay ports' `PerEntryState`). - // This breaks snapshot cycles: underlay → per-CPU cache → `DevMap` → - // `XdeDev` → underlay. - // - // Note: Per-port `DevMap` caches (`XdeDev.port_map`) were already cleared - // when ports were deleted. This function only runs after all ports are - // removed. - if let Some(ul_ref) = token.underlay.as_ref() { - let empty_map = Arc::new(DevMap::new()); - let empty_mcast: Arc = Arc::new(BTreeMap::new()); - let underlay_ports = - [&ul_ref.u1.stream.ports_map, &ul_ref.u2.stream.ports_map]; - for per_cpu_map in underlay_ports { - for entry in per_cpu_map { - { - let mut map = entry.devs.lock(); - *map = Arc::clone(&empty_map); - } - { - let mut mcast = entry.mcast_fwd.write(); - *mcast = Arc::clone(&empty_mcast); - } - entry.has_mcast_subscribers.store(false, Ordering::Relaxed); - } - } - } - - // Early-check: ensure the underlay port Arcs are uniquely owned by - // XDE before we move them out. In-flight dataplane work may still hold - // references to these Arcs briefly after cache clearing. If so, return - // `EBUSY` so the caller can retry. 
- if let Some(ul_ref) = token.underlay.as_ref() { - if Arc::strong_count(&ul_ref.u1) != 1 - || Arc::strong_count(&ul_ref.u2) != 1 - { - return Err(OpteError::System { - errno: EBUSY, - msg: "underlay ports still have active references; retry teardown".into(), - }); - } - } - - // Take ownership of the underlay state now that caches are cleared and - // the Arcs appear uniquely owned. - let underlay = token.underlay.take().ok_or_else(|| OpteError::System { - errno: ENOENT, - msg: "underlay not initialized (already checked above)".into(), - })?; - - // Unwrap underlay port Arcs; if any references remain (e.g., in-flight - // dataplane), return EBUSY so caller can retry. - let XdeUnderlayPort { - name: u1_name, - siphon: u1_siphon, - stream: u1_stream, - .. - } = Arc::into_inner(underlay.u1).ok_or_else(|| { - warn!( - "clear_xde_underlay: u1 Arc has outstanding refs after cache clear" - ); - OpteError::System { - errno: EBUSY, - msg: "underlay u1 still has active references during teardown" - .into(), - } - })?; - - let XdeUnderlayPort { - name: u2_name, - siphon: u2_siphon, - stream: u2_stream, - .. - } = Arc::into_inner(underlay.u2).ok_or_else(|| { - warn!( - "clear_xde_underlay: u2 Arc has outstanding refs after cache clear" - ); - OpteError::System { - errno: EBUSY, - msg: "underlay u2 still has active references during teardown" - .into(), - } - })?; + if let Some(underlay) = token.underlay.take() { + // If the underlay references have leaked/spread beyond `XdeDev`s and not + // been cleaned up, we have a fatal programming error. + // We aren't using `Weak` references to these types either, so no strong + // references could be created. + // + // We know these must succeed given that the only holders of an + // `Arc` are `XdeState` (whose ref we have exclusively locked) + // and `XdeDev` (of which none remain). 
+ let name = underlay.u1.name.clone(); + let u1 = Arc::into_inner(underlay.u1).unwrap_or_else(|| { + panic!("underlay u1 ({name}) must have one ref during teardown",) + }); - // Quiesce RX by dropping siphons; this removes the MAC callbacks and - // releases the siphon's Arc reference to the streams' parent. - drop(u1_siphon); - drop(u2_siphon); + let name = underlay.u2.name.clone(); + let u2 = Arc::into_inner(underlay.u2).unwrap_or_else(|| { + panic!("underlay u2 ({name}) must have one ref during teardown",) + }); - // Verify and close the DLS stream handles. After dropping siphons, the - // only remaining strong reference should be `u*_stream` itself. - for (name, stream) in [(u1_name, u1_stream), (u2_name, u2_stream)] { - if Arc::into_inner(stream).is_none() { - warn!( - "clear_xde_underlay: {name} DlsStream Arc has outstanding refs after siphon drop" - ); - return Err(OpteError::System { - errno: EBUSY, - msg: format!( - "underlay ({name}) DlsStream still has active references; retry teardown" - ), + for u in [u1, u2] { + // We have a chain of refs here: `MacSiphon` holds a ref to + // `DlsStream`. We explicitly drop them in order here to ensure + // there are no outstanding refs. + + // 1. Remove packet rx callback. + drop(u.siphon); + + // Although `xde_rx` can be called into without any running ports + // via the siphon handle, illumos guarantees that this callback won't + // be running here. `mac_siphon_clear` performs the moral equivalent of + // `mac_rx_barrier` -- the client's SRS is quiesced, and then restarted + // after the callback is removed. + // Because there are no ports and we hold the write/management lock, no + // one else will have or try to clone the Stream handle. + + // 2. Close the open stream handle. + // The only other hold on this `DlsStream` is via `u.siphon`, which + // we just dropped. The `unwrap_or_else` asserts that we have consumed them + // in the correct order. 
+ Arc::into_inner(u.stream).unwrap_or_else(|| { + panic!( + "underlay ({}) must have no external refs to its DlsStream", + u.name + ) }); } } @@ -2169,57 +2094,30 @@ fn find_mcast_option_offset( pkt: &MsgBlk, geneve_offset: usize, ) -> Option { - const GENEVE_HDR_LEN: usize = 8; - const OPT_HDR_LEN: usize = 4; - const OXIDE_OPT_CLASS: u16 = 0x0129; - const MULTICAST_OPT_TYPE: u8 = 0x01; - - // Read Geneve header to get option length - let geneve_hdr = pkt.get(geneve_offset..geneve_offset + GENEVE_HDR_LEN)?; - let opt_len_words = (geneve_hdr[0] & 0x3F) as usize; // Bottom 6 bits of first byte - - if opt_len_words == 0 { - return None; // No options present - } - - let opts_start = geneve_offset + GENEVE_HDR_LEN; - let opts_end = opts_start + (opt_len_words * 4); - - // Belt-and-braces: ensure options area doesn't exceed packet length - if opts_end > pkt.len() { - return None; - } - - let mut offset = opts_start; - - while offset + OPT_HDR_LEN <= opts_end { - let opt_hdr = pkt.get(offset..offset + OPT_HDR_LEN)?; + let geneve_slice = pkt.get(geneve_offset..)?; + let (geneve_hdr, ..) = ValidGeneve::parse(geneve_slice).ok()?; - let class = u16::from_be_bytes([opt_hdr[0], opt_hdr[1]]); - let opt_type = opt_hdr[2] & 0x7F; // Mask out critical bit - let opt_data_words = (opt_hdr[3] & 0x1F) as usize; // Bottom 5 bits - let opt_data_len = opt_data_words * 4; + let mut cursor = geneve_offset + Geneve::MINIMUM_LENGTH; - if class == OXIDE_OPT_CLASS && opt_type == MULTICAST_OPT_TYPE { - // Found it! Return offset to option body - return Some(offset + OPT_HDR_LEN); + for opt in OxideOptions::from_raw(&geneve_hdr) { + let Ok(opt) = opt else { break }; + if let Some(ValidOxideOption::Multicast(_)) = opt.option.known() { + return Some(cursor + GeneveOpt::MINIMUM_LENGTH); } - - // Move to next option - offset += OPT_HDR_LEN + opt_data_len; + cursor += opt.packet_length(); } None } -/// Update the Oxide Multicast Geneve option's TX-only replication field. 
+/// Update the Oxide Multicast Geneve option's Tx-only replication field. /// -/// Locates the multicast option and rewrites the TX-only replication instruction in the -/// first byte of the option body (top 2 bits encode the replication mode). +/// Locates the multicast option and rewrites the Tx-only replication instruction +/// in the first byte of the option body (top 2 bits encode the replication mode). /// /// Returns `true` if the option was found and updated, `false` otherwise. /// -/// # Replication Encoding (TX-only) +/// # Replication Encoding (Tx-only) /// The replication field uses the top 2 bits of the first byte: /// - `External` (0): 0x00 /// - `Underlay` (1): 0x40 @@ -2254,7 +2152,7 @@ struct MulticastTxContext<'a> { encap_len: u32, inner_eth_len: usize, non_eth_payl_bytes: u32, - tun_meoi: &'a mac_ether_offload_info_t, + tun_meoi: &'a illumos_sys_hdrs::mac::mac_ether_offload_info_t, l4_hash: u32, } @@ -2264,9 +2162,6 @@ struct MulticastRxContext<'a> { vni: Vni, pkt: &'a MsgBlk, pullup_len: usize, - // Reserved for future use: may be needed for relay detection or debugging - _geneve_offset: usize, - _incoming_delivery_mode: Option, } /// Handle multicast packet forwarding for same-sled delivery and underlay @@ -2275,7 +2170,7 @@ struct MulticastRxContext<'a> { /// Always delivers to local same-sled subscribers regardless of replication mode. /// Routes to next hop unicast addresses for ALL replication modes to determine /// reachability and underlay port/MAC. Packet destination is always the multicast -/// address with multicast MAC. The [`Replication`] type is a TX-only instruction +/// address with multicast MAC. The [`Replication`] type is a Tx-only instruction /// telling the switch which port groups to replicate to: External (front panel), /// Underlay (other sleds), or Both. 
/// @@ -2287,15 +2182,15 @@ fn handle_mcast_tx<'a>( src_dev: &'a XdeDev, postbox: &mut TxPostbox, cpu_devs: Option<&'a DevMap>, - cpu_mcast_fwd: &'a Arc, + cpu_mcast_fwd: &'a McastForwardingTable, ) { - // DTrace probe: multicast TX entry + // DTrace probe: multicast Tx entry let (af, addr_ptr) = match &ctx.inner_dst { oxide_vpc::api::IpAddr::Ip4(v4) => { - (2usize, AsRef::<[u8]>::as_ref(v4).as_ptr() as uintptr_t) + (AF_INET as usize, AsRef::<[u8]>::as_ref(v4).as_ptr() as uintptr_t) } oxide_vpc::api::IpAddr::Ip6(v6) => { - (26usize, AsRef::<[u8]>::as_ref(v6).as_ptr() as uintptr_t) + (AF_INET6 as usize, AsRef::<[u8]>::as_ref(v6).as_ptr() as uintptr_t) } }; __dtrace_probe_mcast__tx(af, addr_ptr, ctx.vni.as_u32() as uintptr_t); @@ -2309,15 +2204,14 @@ fn handle_mcast_tx<'a>( + usize::from(ctx.tun_meoi.meoi_l4hlen); // Local same-sled delivery: always deliver to subscribers on this sled, - // independent of the TX-only Replication instruction (not an access control mechanism). - // The Replication type only affects how switches handle the packet on TX. + // independent of the Tx-only Replication instruction (not an access control mechanism). + // The Replication type only affects how switches handle the packet on Tx. // Subscription is keyed by underlay (outer) IPv6 multicast address. // If cpu_devs is None, we know from the fast-path check that no subscribers exist. 
if let Some(devs) = cpu_devs { - let group_key = { - let ip6 = oxide_vpc::api::Ipv6Addr::from(ctx.underlay_dst.bytes()); - oxide_vpc::api::IpAddr::from(ip6) - }; + let underlay_addr = + oxide_vpc::api::Ipv6Addr::from(ctx.underlay_dst.bytes()); + let group_key = MulticastUnderlay::new_unchecked(underlay_addr); if let Some(others) = devs.mcast_listeners(&group_key) { let my_key = VniMac::new(ctx.vni, src_dev.port.mac_addr()); for el in others { @@ -2329,7 +2223,7 @@ fn handle_mcast_tx<'a>( ctx.out_pkt.pullup(NonZeroUsize::new(pullup_len)) else { opte::engine::dbg!( - "mcast TX external pullup failed: requested {} bytes", + "mcast Tx external pullup failed: requested {} bytes", pullup_len ); let xde = get_xde_state(); @@ -2344,11 +2238,11 @@ fn handle_mcast_tx<'a>( // DTrace probe: local delivery let (af, addr_ptr) = match &ctx.inner_dst { oxide_vpc::api::IpAddr::Ip4(v4) => ( - 2usize, + AF_INET as usize, AsRef::<[u8]>::as_ref(v4).as_ptr() as uintptr_t, ), oxide_vpc::api::IpAddr::Ip6(v6) => ( - 26usize, + AF_INET6 as usize, AsRef::<[u8]>::as_ref(v6).as_ptr() as uintptr_t, ), }; @@ -2374,14 +2268,15 @@ fn handle_mcast_tx<'a>( // Next hop forwarding: send packets to configured next hops. // // At the leaf level, we process all next hops in the forwarding table. - // Each next hop's `Replication` is a TX-only instruction telling the switch + // Each next hop's `Replication` is a Tx-only instruction telling the switch // which ports to replicate to: // - External: ports set for external multicast traffic (egress to external networks) // - Underlay: replicate to other sleds (using multicast outer dst) // - Both: both external and underlay replication // // We already have the Arc from the per-CPU cache, no need to clone. 
- if cpu_mcast_fwd.get(&ctx.underlay_dst).is_none() { + let underlay_key = MulticastUnderlay::new_unchecked(ctx.underlay_dst); + if cpu_mcast_fwd.get(&underlay_key).is_none() { __dtrace_probe_mcast__no__fwd__entry( &ctx.underlay_dst, ctx.vni.as_u32() as uintptr_t, @@ -2390,7 +2285,7 @@ fn handle_mcast_tx<'a>( xde.stats.vals.mcast_tx_no_fwd_entry().incr(1); } - if let Some(next_hops) = cpu_mcast_fwd.get(&ctx.underlay_dst) { + if let Some(next_hops) = cpu_mcast_fwd.get(&underlay_key) { // We found forwarding entries, replicate to each next hop for (next_hop, replication) in next_hops.iter() { // Clone packet with headers using pullup @@ -2398,7 +2293,7 @@ fn handle_mcast_tx<'a>( ctx.out_pkt.pullup(NonZeroUsize::new(pullup_len)) else { opte::engine::dbg!( - "mcast TX next hop pullup failed: requested {} bytes", + "mcast Tx next hop pullup failed: requested {} bytes", pullup_len ); let xde = get_xde_state(); @@ -2413,7 +2308,7 @@ fn handle_mcast_tx<'a>( // // NextHopV6.addr = unicast switch address (for routing) // Outer dst IP = ctx.underlay_dst (multicast address from M2P) - // Geneve Replication is a TX-only instruction telling the switch + // Geneve Replication is a Tx-only instruction telling the switch // which port groups to use. let routing_dst = next_hop.addr; let actual_outer_dst = ctx.underlay_dst; @@ -2425,8 +2320,8 @@ fn handle_mcast_tx<'a>( let vni_be = next_hop.vni.as_u32().to_be_bytes(); vni_bytes.copy_from_slice(&vni_be[1..4]); // VNI is 24 bits } - // Update Geneve multicast option to reflect underlay replication to - // prevent re-relay loops. + // Update Geneve multicast option with the Tx-only replication + // instruction for the switch. 
update_mcast_replication(&mut fwd_pkt, geneve_offset, *replication); // Route to switch unicast address to determine which underlay @@ -2459,54 +2354,32 @@ fn handle_mcast_tx<'a>( MsgBlk::wrap_mblk(mblk).unwrap() }; - // Replication is a TX-only instruction telling the switch which - // port groups to replicate to: + // Replication is a Tx-only instruction telling the switch which + // port groups to replicate to. Local same-sled delivery always + // occurs regardless of this setting. // - // Local same-sled delivery always occurs regardless of this - // TX-only setting. - // - // Note: Packet is sent once to the underlay. The switch reads the - // Geneve Replication field and performs the actual bifurcation. + // Packet is sent once to the underlay. The switch reads the Geneve + // Replication field and performs the actual bifurcation. + + // Prepare common data for DTrace probes + let outer_ip6 = + oxide_vpc::api::Ipv6Addr::from(actual_outer_dst.bytes()); + let (af, addr_ptr) = + (AF_INET6 as usize, &outer_ip6 as *const _ as uintptr_t); + + // Fire DTrace probes and increment stats based on replication mode match replication { oxide_vpc::api::Replication::Underlay => { - // DTrace probe: underlay forwarding - // Report on-wire multicast group as GROUP (underlay), - // and configured next-hop leaf address as NEXTHOP. - let outer_ip6 = oxide_vpc::api::Ipv6Addr::from( - actual_outer_dst.bytes(), - ); - let (af, addr_ptr) = - (26usize, &outer_ip6 as *const _ as uintptr_t); __dtrace_probe_mcast__underlay__fwd( af, addr_ptr, ctx.vni.as_u32() as uintptr_t, &next_hop.addr, ); - - // Send to underlay - postbox.post_underlay( - underlay_idx, - TxHint::from_crc32(ctx.l4_hash), - final_pkt, - ); - - // Increment underlay forwarding stat let xde = get_xde_state(); xde.stats.vals.mcast_tx_underlay().incr(1); } oxide_vpc::api::Replication::Both => { - // Both mode: packet is sent to switch with "Both" - // replication flag. 
- // Switch will bifurcate to both underlay and external port - // groups. Fire both DTrace probes and increment both stats - // for observability. - let outer_ip6 = oxide_vpc::api::Ipv6Addr::from( - actual_outer_dst.bytes(), - ); - let (af, addr_ptr) = - (26usize, &outer_ip6 as *const _ as uintptr_t); - __dtrace_probe_mcast__underlay__fwd( af, addr_ptr, @@ -2519,52 +2392,32 @@ fn handle_mcast_tx<'a>( ctx.vni.as_u32() as uintptr_t, &next_hop.addr, ); - - // Send to underlay (switch does bifurcation) - postbox.post_underlay( - underlay_idx, - TxHint::from_crc32(ctx.l4_hash), - final_pkt, - ); - - // Increment both stats since both replication paths are active let xde = get_xde_state(); xde.stats.vals.mcast_tx_underlay().incr(1); xde.stats.vals.mcast_tx_external().incr(1); } oxide_vpc::api::Replication::External => { - // DTrace probe: external forwarding - // Report on-wire multicast group as GROUP (underlay), - // and configured next-hop leaf address as NEXTHOP. - let outer_ip6 = oxide_vpc::api::Ipv6Addr::from( - actual_outer_dst.bytes(), - ); - let (af, addr_ptr) = - (26usize, &outer_ip6 as *const _ as uintptr_t); __dtrace_probe_mcast__external__fwd( af, addr_ptr, ctx.vni.as_u32() as uintptr_t, &next_hop.addr, ); - - // Increment external forwarding stat let xde = get_xde_state(); xde.stats.vals.mcast_tx_external().incr(1); - - // External mode: Unicast Geneve to switch (boundary service) via underlay. - // Switch decaps and replicates to ports set for external multicast traffic - // (egress to external networks, leaving the underlay). 
- postbox.post_underlay( - underlay_idx, - TxHint::from_crc32(ctx.l4_hash), - final_pkt, - ); } - _ => { - // Reserved: should not reach here + oxide_vpc::api::Replication::Reserved => { + // Reserved: drop packet + continue; } } + + // Send to underlay (common for all valid replication modes) + postbox.post_underlay( + underlay_idx, + TxHint::from_crc32(ctx.l4_hash), + final_pkt, + ); } } } @@ -2574,8 +2427,8 @@ fn handle_mcast_tx<'a>( /// OPTE is always a leaf node in the multicast replication tree. /// This function only delivers packets to local subscribers. /// -/// The Replication type is TX-only (instructions to the switch), so the -/// replication field is ignored on RX. Local delivery is based purely on +/// The Replication type is Tx-only (instructions to the switch), so the +/// replication field is ignored on Rx. Local delivery is based purely on /// subscriptions. fn handle_mcast_rx( ctx: MulticastRxContext, @@ -2583,23 +2436,22 @@ fn handle_mcast_rx( devs: &DevMap, postbox: &mut Postbox, ) { - // DTrace probe: multicast RX entry + // DTrace probe: multicast Rx entry let (af, addr_ptr) = match &ctx.inner_dst { oxide_vpc::api::IpAddr::Ip4(v4) => { - (2usize, v4 as *const _ as uintptr_t) + (AF_INET as usize, v4 as *const _ as uintptr_t) } oxide_vpc::api::IpAddr::Ip6(v6) => { - (26usize, v6 as *const _ as uintptr_t) + (AF_INET6 as usize, v6 as *const _ as uintptr_t) } }; __dtrace_probe_mcast__rx(af, addr_ptr, ctx.vni.as_u32() as uintptr_t); // Subscription is keyed by underlay (outer) IPv6 multicast address. // This uniquely identifies the multicast group across the fleet. - let group_key = { - let ip6 = oxide_vpc::api::Ipv6Addr::from(ctx.underlay_dst.bytes()); - oxide_vpc::api::IpAddr::from(ip6) - }; + let underlay_addr = + oxide_vpc::api::Ipv6Addr::from(ctx.underlay_dst.bytes()); + let group_key = MulticastUnderlay::new_unchecked(underlay_addr); // Deliver to all local subscribers. 
VNI validation and VPC isolation // are handled by OPTE's inbound overlay layer. @@ -2608,7 +2460,7 @@ fn handle_mcast_rx( let Ok(my_pkt) = ctx.pkt.pullup(NonZeroUsize::new(ctx.pullup_len)) else { opte::engine::dbg!( - "mcast RX pullup failed: requested {} bytes", + "mcast Rx pullup failed: requested {} bytes", ctx.pullup_len ); let xde = get_xde_state(); @@ -2620,13 +2472,13 @@ fn handle_mcast_rx( }; match devs.get_by_key(*el) { Some(dev) => { - // DTrace probe: RX local delivery + // DTrace probe: Rx local delivery let (af, addr_ptr) = match &ctx.inner_dst { oxide_vpc::api::IpAddr::Ip4(v4) => { - (2usize, v4 as *const _ as uintptr_t) + (AF_INET as usize, v4 as *const _ as uintptr_t) } oxide_vpc::api::IpAddr::Ip6(v6) => { - (26usize, v6 as *const _ as uintptr_t) + (AF_INET6 as usize, v6 as *const _ as uintptr_t) } }; __dtrace_probe_mcast__local__delivery( @@ -2694,37 +2546,29 @@ unsafe extern "C" fn xde_mc_tx( let mut hairpin_chain = MsgBlkChain::empty(); let mut tx_postbox = TxPostbox::new(); - // Clone per-CPU mcast forwarding table Arc and drop lock immediately. - // This makes the reader lock-free and avoids blocking management refreshes. - let cpu_index = current_cpu().seq_id; - let cpu_entry = &src_dev.u1.stream.ports_map[cpu_index]; - let mcast_fwd = clone_from_rwlock(&cpu_entry.mcast_fwd); - - // Lazily clone per-port DevMap Arc for hairpin/local delivery. - // Cloning the Arc (not holding a read guard) eliminates re-entrant - // read deadlock risk and avoids blocking management operations. - let mut cached_devmap: Option> = None; + // We don't need to read-lock port_map or mcast_fwd unless we actually need them. + // Locks are acquired lazily on first use and then held for the duration of + // packet processing. This prevents port removal from completing while any Tx + // handler holds references (management operations block on the write lock). 
+ let mut port_map = None; + let mut mcast_fwd = None; while let Some(pkt) = chain.pop_front() { xde_mc_tx_one( src_dev, pkt, &mut tx_postbox, - cpu_entry, - &mut cached_devmap, - &mcast_fwd, + &mut port_map, + &mut mcast_fwd, &mut hairpin_chain, ); } let (local_pkts, [u1_pkts, u2_pkts]) = tx_postbox.deconstruct(); - // Local same-sled delivery (via mac_rx to guest ports) is safe. - // Lazily clone DevMap if we have anything to deliver. - if !local_pkts.is_empty() { - let devs = cached_devmap - .get_or_insert_with(|| clone_from_rwlock(&src_dev.port_map)); - devs.deliver_all(local_pkts); + // Local same-sled delivery (via mac_rx to guest ports). + if let Some(port_map) = port_map { + port_map.deliver_all(local_pkts); } // All deliver/tx calls will NO-OP if the sent chain is empty. @@ -2750,9 +2594,8 @@ fn xde_mc_tx_one<'a>( src_dev: &'a XdeDev, mut pkt: MsgBlk, postbox: &mut TxPostbox, - cpu_entry: &'a PerEntryState, - cached_devmap: &mut Option>, - mcast_fwd: &'a Arc, + port_map: &mut Option>>, + mcast_fwd: &mut Option>>, hairpin_chain: &mut MsgBlkChain, ) { let parser = src_dev.port.network().parser(); @@ -2877,9 +2720,8 @@ fn xde_mc_tx_one<'a>( if ip6_src == ip6_dst { // Hairpin loopback: same-host delivery let key = VniMac::new(vni, ether_dst); - let devs = cached_devmap.get_or_insert_with(|| { - clone_from_rwlock(&src_dev.port_map) - }); + let devs = + port_map.get_or_insert_with(|| src_dev.port_map.read()); if let Some(dst_dev) = devs.get_by_key(key) { // We have found a matching Port on this host; "loop back" // the packet into the inbound processing path of the @@ -2907,15 +2749,13 @@ fn xde_mc_tx_one<'a>( // Multicast interception: All packets (unicast and multicast) go // through normal `port.process()` which applies router/firewall // rules and uses M2P for multicast encapsulation. 
Here, we - // intercept multicast packets for replication to multiple next-hops + // intercept multicast packets for replication to multiple next hops // and local delivery to subscribers. // // Check if this is a multicast packet by examining the outer IPv6 // destination. For multicast, OPTE should have set it to an // ff0x:: address (via M2P table). - let is_mcast_packet = ip6_dst.is_multicast(); - - if is_mcast_packet { + if ip6_dst.is_multicast() { // This is a multicast packet, so we determine the inner // destination from the packet contents or use a fallback let inner_dst = inner_dst_ip.unwrap_or_else(|| { @@ -2937,20 +2777,12 @@ fn xde_mc_tx_one<'a>( } }); - // Lazily obtain per-port DevMap for local delivery. - // Use fast-path check to avoid locking when no local subscribers exist. - let devs = if cached_devmap.is_none() - && !cpu_entry.has_mcast_subscribers.load(Ordering::Relaxed) - { - // Fast path: no subscribers, skip `DevMap` entirely - None - } else { - // Either we already have `DevMap`, or we need to get it - Some(cached_devmap.get_or_insert_with(|| { - clone_from_rwlock(&src_dev.port_map) - })) - }; - + // Acquire locks lazily on first multicast packet. + // Once acquired, locks are held for the duration of Tx processing. + let devs = + port_map.get_or_insert_with(|| src_dev.port_map.read()); + let fwd_table = + mcast_fwd.get_or_insert_with(|| src_dev.mcast_fwd.read()); handle_mcast_tx( MulticastTxContext { inner_dst, @@ -2965,8 +2797,8 @@ fn xde_mc_tx_one<'a>( }, src_dev, postbox, - devs.as_deref().map(|v| &**v), - mcast_fwd, + Some(devs), + fwd_table, ); return; } @@ -3035,9 +2867,9 @@ fn xde_mc_tx_one<'a>( // Currently the overlay layer leaves the outer frame // destination and source zero'd. Ask IRE for the route // associated with the underlay destination. Then ask NCE - // for the mac associated with the IRE nexthop to fill in + // for the mac associated with the IRE next hop to fill in // the outer frame of the packet. 
Also return the underlay - // device associated with the nexthop + // device associated with the next hop // // As route lookups are fairly expensive, we can cache their // results for a given dst + entropy. These have a fairly tight @@ -3295,15 +3127,12 @@ unsafe extern "C" fn xde_rx( let mut count = 0; let mut len = 0; - // Clone per-CPU DevMap Arc and drop lock immediately. - // This makes RX readers lock-free and avoids blocking management refreshes. - // - // Safety: `devmap` holds `Arc`, which contains `Arc` entries. - // This reference chain keeps all ports in this snapshot alive throughout - // packet processing, ensuring `deliver_all()` operates on live `XdeDev` - // instances even if ports are concurrently removed from the canonical map. + // Hold the read lock on the per-CPU DevMap for the duration of Rx processing. + // This prevents port removal from completing until no Rx handler holds references. + // Management operations will block briefly during lock hold, but the critical + // section is bounded to packet processing time (swap Arc during refresh). 
let cpu_index = current_cpu().seq_id; - let devmap = clone_from_mutex(&stream.ports_map[cpu_index].devs); + let devmap = stream.ports_map[cpu_index].devs.lock(); let mut postbox = Postbox::new(); while let Some(pkt) = chain.pop_front() { @@ -3410,37 +3239,12 @@ fn xde_rx_one( ); let vni = meta.outer_encap.vni(); - // Validate Geneve options per RFC 8926 - if let Err(e) = - oxide_vpc::engine::geneve::validate_options(&meta.outer_encap) - { - stat_parse_error(Direction::In, &e); - opte::engine::dbg!( - "Invalid Geneve options in multicast packet: {:?}", - e - ); - bad_packet_parse_probe(None, Direction::In, mblk_addr, &e); - return Some(pkt); - } - // Extract inner destination IP for multicast processing let inner_dst = match &meta.inner_l3 { ValidL3::Ipv4(v4) => oxide_vpc::api::IpAddr::from(v4.destination()), ValidL3::Ipv6(v6) => oxide_vpc::api::IpAddr::from(v6.destination()), }; - // Extract multicast delivery mode from Geneve options - // (Safe to be lenient for non-critical parse errors after validation above) - let incoming_delivery_mode = - oxide_vpc::engine::geneve::extract_multicast_replication( - &meta.outer_encap, - ); - - // Calculate Geneve offset from parsed outer header lengths (robust to VLANs and IPv6 extensions) - let geneve_offset = meta.outer_eth.packet_length() - + meta.outer_v6.packet_length() - + meta.outer_udp.packet_length(); - // Drop the parsed packet before calling handle_mcast_rx drop(parsed_pkt); @@ -3453,8 +3257,6 @@ fn xde_rx_one( vni, pkt: &pkt, pullup_len, - _geneve_offset: geneve_offset, - _incoming_delivery_mode: incoming_delivery_mode, }, stream, devs, @@ -3770,34 +3572,28 @@ fn dump_v2p_hdlr() -> Result { fn set_m2p_hdlr(env: &mut IoctlEnvelope) -> Result { let req: SetMcast2PhysReq = env.copy_in_req()?; - // Validate underlay multicast address is admin-local IPv6 (ff04::/16 only) - // Per Omicron constraints: underlay must be admin-local for rack-internal routing - if !req.underlay.is_admin_scoped_multicast() { - return 
Err(OpteError::InvalidUnderlayMulticast(format!( - "underlay multicast address must be admin-local IPv6 (ff04::/16), got: {}", - req.underlay - ))); - } + // Validation of admin-local IPv6 (ff04::/16) happens at deserialization + let underlay = req.underlay; // All multicast uses fleet-wide DEFAULT_MULTICAST_VNI (77) let vni = Vni::new(DEFAULT_MULTICAST_VNI).unwrap(); let state = get_xde_state(); - // Underlay address validated above as admin-local (ff04::/16) - state.m2p.set(req.group, overlay::MulticastUnderlay(req.underlay)); + state.m2p.set(req.group, underlay); // DTrace: multicast map set let (af, group_ptr): (usize, uintptr_t) = match req.group { oxide_vpc::api::IpAddr::Ip4(v4) => { - (2usize, AsRef::<[u8]>::as_ref(&v4).as_ptr() as uintptr_t) - } - oxide_vpc::api::IpAddr::Ip6(v6) => { - (26usize, AsRef::<[u8]>::as_ref(&v6).as_ptr() as uintptr_t) + (AF_INET as usize, AsRef::<[u8]>::as_ref(&v4).as_ptr() as uintptr_t) } + oxide_vpc::api::IpAddr::Ip6(v6) => ( + AF_INET6 as usize, + AsRef::<[u8]>::as_ref(&v6).as_ptr() as uintptr_t, + ), }; __dtrace_probe_mcast__map__set( af as uintptr_t, group_ptr, - &req.underlay, + &underlay.addr(), vni.as_u32() as uintptr_t, ); Ok(NoResp::default()) @@ -3807,6 +3603,9 @@ fn set_m2p_hdlr(env: &mut IoctlEnvelope) -> Result { fn clear_m2p_hdlr(env: &mut IoctlEnvelope) -> Result { let req: ClearMcast2PhysReq = env.copy_in_req()?; + // Validation of admin-local IPv6 (ff04::/16) happens at deserialization + let underlay = req.underlay; + // All multicast uses fleet-wide DEFAULT_MULTICAST_VNI (77) let vni = Vni::new(DEFAULT_MULTICAST_VNI).unwrap(); let state = get_xde_state(); @@ -3815,16 +3614,17 @@ fn clear_m2p_hdlr(env: &mut IoctlEnvelope) -> Result { // DTrace: multicast map clear let (af, group_ptr): (usize, uintptr_t) = match req.group { oxide_vpc::api::IpAddr::Ip4(v4) => { - (2usize, AsRef::<[u8]>::as_ref(&v4).as_ptr() as uintptr_t) - } - oxide_vpc::api::IpAddr::Ip6(v6) => { - (26usize, AsRef::<[u8]>::as_ref(&v6).as_ptr() as 
uintptr_t) + (AF_INET as usize, AsRef::<[u8]>::as_ref(&v4).as_ptr() as uintptr_t) } + oxide_vpc::api::IpAddr::Ip6(v6) => ( + AF_INET6 as usize, + AsRef::<[u8]>::as_ref(&v6).as_ptr() as uintptr_t, + ), }; __dtrace_probe_mcast__map__clear( af as uintptr_t, group_ptr, - &req.underlay, + &underlay.addr(), vni.as_u32() as uintptr_t, ); Ok(NoResp::default()) @@ -3859,57 +3659,51 @@ fn set_mcast_forwarding_hdlr( let req: SetMcastForwardingReq = env.copy_in_req()?; let state = get_xde_state(); - // Validate underlay address is admin-local IPv6 multicast (ff04::/16 only) - if !req.underlay.is_admin_scoped_multicast() { - return Err(OpteError::InvalidUnderlayMulticast(format!( - "underlay multicast address must be admin-local IPv6 (ff04::/16), got: {}", - req.underlay - ))); - } + // Validation of admin-local IPv6 (ff04::/16) happens at deserialization + let underlay = req.underlay; // Fleet-level multicast: enforce DEFAULT_MULTICAST_VNI for all replication modes. // NextHopV6.addr must be unicast (switch address for routing). // The packet will be sent to the multicast address (req.underlay). - for (nh, _rep) in &req.next_hops { - if nh.vni.as_u32() != DEFAULT_MULTICAST_VNI { + for (next_hop, _rep) in &req.next_hops { + if next_hop.vni.as_u32() != DEFAULT_MULTICAST_VNI { return Err(OpteError::System { errno: EINVAL, msg: format!( - "multicast next-hop VNI must be DEFAULT_MULTICAST_VNI ({DEFAULT_MULTICAST_VNI}), got: {}", - nh.vni.as_u32() + "multicast next hop VNI must be DEFAULT_MULTICAST_VNI ({DEFAULT_MULTICAST_VNI}), got: {}", + next_hop.vni.as_u32() ), }); } // NextHopV6.addr must be unicast (the switch endpoint for routing). // The actual packet destination is the multicast address (req.underlay). 
- if nh.addr.is_multicast() { + if next_hop.addr.is_multicast() { return Err(OpteError::System { errno: EINVAL, msg: format!( "NextHopV6.addr must be unicast (switch address), got multicast: {}", - nh.addr + next_hop.addr ), }); } } - // Record next-hop count and copy underlay before consuming the vector + // Record next hop count before consuming the vector let next_hop_count = req.next_hops.len(); - let underlay = req.underlay; let token = state.management_lock.lock(); { let mut mcast_fwd = token.mcast_fwd.write(); - // Get or create the next-hop map for this underlay address + // Get or create the next hop map for this underlay address let next_hop_map = mcast_fwd.entry(underlay).or_insert_with(BTreeMap::new); - // Insert/update next-hops: same next-hop addr → replace replication mode, - // different next-hop addr → add new entry (like swadm route add) - for (nh, rep) in req.next_hops { - next_hop_map.insert(nh, rep); + // Insert/update next hops: same next hop addr → replace replication mode, + // different next hop addr → add new entry (like `swadm route add`) + for (next_hop, rep) in req.next_hops { + next_hop_map.insert(next_hop, rep); } drop(mcast_fwd); @@ -3925,7 +3719,7 @@ fn set_mcast_forwarding_hdlr( // DTrace: forwarding set __dtrace_probe_mcast__fwd__set( - &underlay, + &underlay.addr(), next_hop_count as uintptr_t, DEFAULT_MULTICAST_VNI as uintptr_t, ); @@ -3940,10 +3734,13 @@ fn clear_mcast_forwarding_hdlr( let req: ClearMcastForwardingReq = env.copy_in_req()?; let state = get_xde_state(); + // Validation of admin-local IPv6 (ff04::/16) happens at deserialization + let underlay = req.underlay; + let token = state.management_lock.lock(); { let mut mcast_fwd = token.mcast_fwd.write(); - mcast_fwd.remove(&req.underlay); + mcast_fwd.remove(&underlay); drop(mcast_fwd); } @@ -3957,7 +3754,7 @@ fn clear_mcast_forwarding_hdlr( // DTrace: forwarding clear __dtrace_probe_mcast__fwd__clear( - &req.underlay, + &underlay.addr(), DEFAULT_MULTICAST_VNI as 
uintptr_t, ); @@ -3975,7 +3772,10 @@ fn dump_mcast_forwarding_hdlr() -> Result { .iter() .map(|(underlay, next_hops)| McastForwardingEntry { underlay: *underlay, - next_hops: next_hops.iter().map(|(nh, rep)| (*nh, *rep)).collect(), + next_hops: next_hops + .iter() + .map(|(next_hop, rep)| (*next_hop, *rep)) + .collect(), }) .collect(); @@ -3990,10 +3790,8 @@ fn dump_mcast_subscriptions_hdlr() let mut entries: alloc::vec::Vec = alloc::vec::Vec::new(); - for (group, ports) in devs.dump_mcast_subscriptions().into_iter() { - if let opte::api::IpAddr::Ip6(underlay) = group { - entries.push(McastSubscriptionEntry { underlay, ports }); - } + for (underlay, ports) in devs.dump_mcast_subscriptions().into_iter() { + entries.push(McastSubscriptionEntry { underlay, ports }); } Ok(DumpMcastSubscriptionsResp { entries }) @@ -4004,7 +3802,7 @@ fn mcast_subscribe_hdlr(env: &mut IoctlEnvelope) -> Result { let req: McastSubscribeReq = env.copy_in_req()?; let state = get_xde_state(); - // Update under management lock so we can refresh DevMap views used by TX/RX + // Update under management lock so we can refresh DevMap views used by Tx/Rx let token = state.management_lock.lock(); { let mut devs = token.devs.write(); @@ -4022,12 +3820,12 @@ fn mcast_subscribe_hdlr(env: &mut IoctlEnvelope) -> Result { // If an overlay->underlay mapping exists, use it; otherwise, if the // provided address is already an admin-scoped multicast (ff04::/16), // accept it as-is. Otherwise, reject. 
- if let Some(mu) = + if let Some(underlay_group) = state.m2p.get(&oxide_vpc::api::IpAddr::Ip6(ip6)) { - oxide_vpc::api::IpAddr::Ip6(mu.0) - } else if ip6.is_admin_scoped_multicast() { - oxide_vpc::api::IpAddr::Ip6(ip6) + underlay_group + } else if let Ok(underlay_group) = MulticastUnderlay::new(ip6) { + underlay_group } else { return Err(OpteError::BadState( "no underlay mapping for IPv6 multicast group".into(), @@ -4038,8 +3836,8 @@ fn mcast_subscribe_hdlr(env: &mut IoctlEnvelope) -> Result { // IPv4 overlay groups must have an M2P mapping; the subscription key // is the underlay IPv6 multicast. Without a mapping, reject with // a clear message (callers may rely on this distinction). - if let Some(mu) = state.m2p.get(&req.group) { - oxide_vpc::api::IpAddr::Ip6(mu.0) + if let Some(underlay_group) = state.m2p.get(&req.group) { + underlay_group } else { return Err(OpteError::BadState( "no underlay mapping for IPv4 multicast group".into(), @@ -4051,14 +3849,10 @@ fn mcast_subscribe_hdlr(env: &mut IoctlEnvelope) -> Result { devs.mcast_subscribe(&req.port_name, group_key)?; // DTrace: subscribe - let (af, group_ptr): (usize, uintptr_t) = match group_key { - oxide_vpc::api::IpAddr::Ip4(v4) => { - (2usize, AsRef::<[u8]>::as_ref(&v4).as_ptr() as uintptr_t) - } - oxide_vpc::api::IpAddr::Ip6(v6) => { - (26usize, AsRef::<[u8]>::as_ref(&v6).as_ptr() as uintptr_t) - } - }; + let (af, group_ptr): (usize, uintptr_t) = ( + AF_INET6 as usize, + AsRef::<[u8]>::as_ref(&group_key.addr()).as_ptr() as uintptr_t, + ); if let Ok(port_cstr) = CString::new(req.port_name.clone()) { __dtrace_probe_mcast__subscribe( port_cstr.as_ptr() as uintptr_t, @@ -4087,7 +3881,7 @@ fn mcast_unsubscribe_hdlr( let req: McastUnsubscribeReq = env.copy_in_req()?; let state = get_xde_state(); - // Update under management lock so we can refresh DevMap views used by TX/RX + // Update under management lock so we can refresh DevMap views used by Tx/Rx let token = state.management_lock.lock(); { let mut devs = 
token.devs.write(); @@ -4113,65 +3907,24 @@ fn mcast_unsubscribe_hdlr( // For unsubscribe, if no M2P mapping exists, we return success (no-op). // This makes unsubscribe idempotent and handles cleanup race conditions // where M2P mappings may be removed before unsubscribe is called. - let group_key = - match req.group { - oxide_vpc::api::IpAddr::Ip6(ip6) => { - if let Some(mu) = - state.m2p.get(&oxide_vpc::api::IpAddr::Ip6(ip6)) - { - oxide_vpc::api::IpAddr::Ip6(mu.0) - } else { - // For IPv6 without M2P mapping, we can't determine the - // exact underlay address due to Omicron's XOR folding. - // `External` IPv6 addresses are mapped to different - // underlay IPv6 addresses (both in ff04::/16 but - // different values). Without the mapping, we return - // success. The subscription was either never created - // (because subscribe would have failed without M2P) - // or was already cleaned up when the M2P was removed. - refresh_maps( - devs, - token.underlay.as_ref().expect( - "underlay must exist while ports exist", - ), - &token.mcast_fwd, - ); - return Ok(NoResp::default()); - } - } - oxide_vpc::api::IpAddr::Ip4(_v4) => { - if let Some(mu) = state.m2p.get(&req.group) { - oxide_vpc::api::IpAddr::Ip6(mu.0) - } else { - // For IPv4 without M2P mapping, we can't determine the underlay - // group, but we should still succeed (idempotent cleanup). - // Since subscriptions use underlay IPv6 addresses as keys, - // and we don't know what that would have been, we simply - // return success. The subscription was either never created - // (because subscribe would have failed without M2P) or was - // already cleaned up when the M2P was removed. 
- refresh_maps( - devs, - token.underlay.as_ref().expect( - "underlay must exist while ports exist", - ), - &token.mcast_fwd, - ); - return Ok(NoResp::default()); - } - } - }; + let Some(group_key) = state.m2p.get(&req.group) else { + refresh_maps( + devs, + token + .underlay + .as_ref() + .expect("underlay must exist while ports exist"), + &token.mcast_fwd, + ); + return Ok(NoResp::default()); + }; devs.mcast_unsubscribe(&req.port_name, group_key)?; // DTrace: unsubscribe - let (af, group_ptr): (usize, uintptr_t) = match group_key { - oxide_vpc::api::IpAddr::Ip4(v4) => { - (2usize, AsRef::<[u8]>::as_ref(&v4).as_ptr() as uintptr_t) - } - oxide_vpc::api::IpAddr::Ip6(v6) => { - (26usize, AsRef::<[u8]>::as_ref(&v6).as_ptr() as uintptr_t) - } - }; + let (af, group_ptr): (usize, uintptr_t) = ( + AF_INET6 as usize, + AsRef::<[u8]>::as_ref(&group_key.addr()).as_ptr() as uintptr_t, + ); if let Ok(port_cstr) = CString::new(req.port_name.clone()) { __dtrace_probe_mcast__unsubscribe( port_cstr.as_ptr() as uintptr_t, @@ -4193,6 +3946,64 @@ fn mcast_unsubscribe_hdlr( Ok(NoResp::default()) } +#[unsafe(no_mangle)] +fn mcast_unsubscribe_all_hdlr( + env: &mut IoctlEnvelope, +) -> Result { + let req: McastUnsubscribeAllReq = env.copy_in_req()?; + let state = get_xde_state(); + + // Update under management lock so we can refresh DevMap views used by Tx/Rx + let token = state.management_lock.lock(); + { + let mut devs = token.devs.write(); + + // Reject non-multicast input + if !req.group.is_multicast() { + return Err(OpteError::BadState(format!( + "IP address {} is not a multicast address", + req.group + ))); + } + + // Translate overlay group to underlay IPv6 if M2P mapping exists. + // For unsubscribe-all, if no M2P mapping exists, we return success (no-op). 
+ let Some(group_key) = state.m2p.get(&req.group) else { + refresh_maps( + devs, + token + .underlay + .as_ref() + .expect("underlay must exist while ports exist"), + &token.mcast_fwd, + ); + return Ok(NoResp::default()); + }; + + devs.mcast_unsubscribe_all(group_key); + // DTrace: unsubscribe-all + let (af, group_ptr): (usize, uintptr_t) = ( + AF_INET6 as usize, + AsRef::<[u8]>::as_ref(&group_key.addr()).as_ptr() as uintptr_t, + ); + __dtrace_probe_mcast__unsubscribe__all( + af as uintptr_t, + group_ptr, + DEFAULT_MULTICAST_VNI as uintptr_t, + ); + refresh_maps( + devs, + token + .underlay + .as_ref() + .expect("underlay must exist while ports exist"), + &token.mcast_fwd, + ); + } + + Ok(NoResp::default()) +} + #[unsafe(no_mangle)] fn list_layers_hdlr( env: &mut IoctlEnvelope,