diff --git a/.github/buildomat/jobs/test.sh b/.github/buildomat/jobs/test.sh index 236234a0..59a62dae 100755 --- a/.github/buildomat/jobs/test.sh +++ b/.github/buildomat/jobs/test.sh @@ -82,3 +82,15 @@ pfexec add_drv xde banner "test" pfexec chmod +x /input/xde/work/test/loopback pfexec /input/xde/work/test/loopback --nocapture + +# Multicast tests must run with --test-threads=1 because they share +# hardcoded device names (xde_test_sim0/1, xde_test_vnic0/1) that conflict +# when tests run in parallel +pfexec chmod +x /input/xde/work/test/multicast_rx +pfexec /input/xde/work/test/multicast_rx --nocapture --test-threads=1 + +pfexec chmod +x /input/xde/work/test/multicast_multi_sub +pfexec /input/xde/work/test/multicast_multi_sub --nocapture --test-threads=1 + +pfexec chmod +x /input/xde/work/test/multicast_validation +pfexec /input/xde/work/test/multicast_validation --nocapture --test-threads=1 diff --git a/.github/buildomat/jobs/xde.sh b/.github/buildomat/jobs/xde.sh index 3abe2881..82baf11c 100755 --- a/.github/buildomat/jobs/xde.sh +++ b/.github/buildomat/jobs/xde.sh @@ -14,6 +14,9 @@ #: "=/work/release/xde_link.so", #: "=/work/release/xde_link.so.sha256", #: "=/work/test/loopback", +#: "=/work/test/multicast_rx", +#: "=/work/test/multicast_multi_sub", +#: "=/work/test/multicast_validation", #: "=/work/xde.conf", #: ] #: @@ -116,5 +119,23 @@ loopback_test=$( cargo build -q --test loopback --message-format=json |\ jq -r "select(.profile.test == true) | .filenames[]" ) +cargo build --test multicast_rx +multicast_rx_test=$( + cargo build -q --test multicast_rx --message-format=json |\ + jq -r "select(.profile.test == true) | .filenames[]" +) +cargo build --test multicast_multi_sub +multicast_multi_sub_test=$( + cargo build -q --test multicast_multi_sub --message-format=json |\ + jq -r "select(.profile.test == true) | .filenames[]" +) +cargo build --test multicast_validation +multicast_validation_test=$( + cargo build -q --test multicast_validation 
--message-format=json |\ + jq -r "select(.profile.test == true) | .filenames[]" +) mkdir -p /work/test cp $loopback_test /work/test/loopback +cp $multicast_rx_test /work/test/multicast_rx +cp $multicast_multi_sub_test /work/test/multicast_multi_sub +cp $multicast_validation_test /work/test/multicast_validation diff --git a/Cargo.lock b/Cargo.lock index 53b696a9..56691460 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1312,6 +1312,7 @@ dependencies = [ name = "opte-test-utils" version = "0.1.0" dependencies = [ + "anyhow", "opte", "oxide-vpc", "pcap-parser", @@ -2665,6 +2666,7 @@ dependencies = [ "anyhow", "libnet", "opte-ioctl", + "opte-test-utils", "oxide-vpc", "rand", "slog", diff --git a/README.adoc b/README.adoc index 47865d64..c6418e08 100644 --- a/README.adoc +++ b/README.adoc @@ -42,6 +42,7 @@ More detail on our benchmarks can be found in xref:bench/README.adoc[bench/READM * https://rfd.shared.oxide.computer/rfd/0009[RFD 9: Networking Considerations] * https://rfd.shared.oxide.computer/rfd/0021[RFD 21: User Networking API] * https://rfd.shared.oxide.computer/rfd/0063[RFD 63: Network Architecture] +* https://rfd.shared.oxide.computer/rfd/0488[RFD 488: Multicast] * https://www.microsoft.com/en-us/research/wp-content/uploads/2017/03/vfp-nsdi-2017-final.pdf[Microsoft's VFP] == Directory Index diff --git a/bin/opteadm/src/bin/opteadm.rs b/bin/opteadm/src/bin/opteadm.rs index 219bf555..67cd2dc7 100644 --- a/bin/opteadm/src/bin/opteadm.rs +++ b/bin/opteadm/src/bin/opteadm.rs @@ -16,6 +16,7 @@ use opte::api::Ipv4Addr; use opte::api::Ipv6Addr; use opte::api::MAJOR_VERSION; use opte::api::MacAddr; +use opte::api::MulticastUnderlay; use opte::api::Vni; use opte::print::print_layer; use opte::print::print_list_layers; @@ -27,8 +28,10 @@ use oxide_vpc::api::AddFwRuleReq; use oxide_vpc::api::AddRouterEntryReq; use oxide_vpc::api::Address; use oxide_vpc::api::BOUNDARY_SERVICES_VNI; +use oxide_vpc::api::ClearMcastForwardingReq; use oxide_vpc::api::ClearVirt2BoundaryReq; use
oxide_vpc::api::ClearVirt2PhysReq; +use oxide_vpc::api::DEFAULT_MULTICAST_VNI; use oxide_vpc::api::DelRouterEntryReq; use oxide_vpc::api::DelRouterEntryResp; use oxide_vpc::api::DhcpCfg; @@ -39,22 +42,30 @@ use oxide_vpc::api::FirewallRule; use oxide_vpc::api::IpCfg; use oxide_vpc::api::Ipv4Cfg; use oxide_vpc::api::Ipv6Cfg; +use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastUnsubscribeAllReq; +use oxide_vpc::api::McastUnsubscribeReq; +use oxide_vpc::api::NextHopV6; use oxide_vpc::api::PhysNet; use oxide_vpc::api::PortInfo; use oxide_vpc::api::Ports; use oxide_vpc::api::ProtoFilter; use oxide_vpc::api::RemFwRuleReq; use oxide_vpc::api::RemoveCidrResp; +use oxide_vpc::api::Replication; use oxide_vpc::api::RouterClass; use oxide_vpc::api::RouterTarget; use oxide_vpc::api::SNat4Cfg; use oxide_vpc::api::SNat6Cfg; use oxide_vpc::api::SetExternalIpsReq; use oxide_vpc::api::SetFwRulesReq; +use oxide_vpc::api::SetMcastForwardingReq; use oxide_vpc::api::SetVirt2BoundaryReq; use oxide_vpc::api::SetVirt2PhysReq; use oxide_vpc::api::TunnelEndpoint; use oxide_vpc::api::VpcCfg; +use oxide_vpc::print::print_mcast_fwd; +use oxide_vpc::print::print_mcast_subs; use oxide_vpc::print::print_v2b; use oxide_vpc::print::print_v2p; use std::io; @@ -225,6 +236,93 @@ enum Command { /// Clear a virtual-to-boundary mapping ClearV2B { prefix: IpCidr, tunnel_endpoint: Vec }, + /// Set a multicast forwarding entry + /// + /// Adds or updates a next hop for the specified underlay multicast address. + /// Multiple next hops can be configured for the same underlay address by + /// running this command multiple times (like `swadm route add`). If the + /// same next hop is specified again, its replication mode is updated. + /// + /// OPTE routes to `next_hop` (unicast switch address) to determine which + /// underlay port to use, then sends the packet to underlay (multicast) with + /// multicast MAC. The switch matches the outer dst IP (multicast) and + /// Geneve replication tag. 
+ SetMcastFwd { + /// The underlay multicast IPv6 address (admin-local scope ff04::/16). + /// This is the outer IPv6 destination in transmitted packets. + underlay: MulticastUnderlay, + /// The unicast IPv6 address of the switch for routing (e.g., fd00::1). + /// OPTE uses this to determine which underlay port to use via the + /// illumos routing table. Multiple next hops can be added by + /// running this command multiple times with the same underlay address. + next_hop: Ipv6Addr, + /// Tx-only replication instruction (tells the switch which port groups to use): + /// - External: front panel ports (decapped, egress to external networks) + /// - Underlay: sled-to-sled ports (underlay multicast replication) + /// - Both: both external and underlay (bifurcated) + /// + /// Local same-sled delivery always happens via subscriptions regardless + /// of this setting. + replication: Replication, + }, + + /// Clear a multicast forwarding entry + ClearMcastFwd { + /// The underlay multicast IPv6 address (admin-local scope ff04::/16) + underlay: MulticastUnderlay, + }, + + /// Dump the multicast forwarding table + DumpMcastFwd, + + /// Dump multicast subscriptions (group -> ports on this sled) + DumpMcastSubs, + + /// Subscribe a port to a multicast group + /// + /// Allows a port to receive multicast traffic for the specified group. + /// The group address is an overlay multicast address which is translated + /// to an underlay IPv6 multicast address via the M2P (Multicast-to-Physical) + /// mapping table. + /// + /// Subscriptions are local to this sled and control Rx (receive). For Tx + /// (transmit), configure multicast forwarding via set-mcast-fwd. 
+ McastSubscribe { + /// The OPTE port name (e.g., opte0) + #[arg(short)] + port: String, + /// The overlay multicast group address (IPv4 or IPv6) + group: IpAddr, + }, + + /// Unsubscribe a port from a multicast group + /// + /// Removes a port's subscription to a multicast group, preventing it from + /// receiving traffic for that group. This is the inverse of mcast-subscribe. + /// + /// If the M2P mapping for the group has already been removed, this operation + /// succeeds as a no-op. + McastUnsubscribe { + /// The OPTE port name (e.g., opte0) + #[arg(short)] + port: String, + /// The overlay multicast group address (IPv4 or IPv6) + group: IpAddr, + }, + + /// Unsubscribe all ports from a multicast group + /// + /// Removes all port subscriptions for a given multicast group on this sled + /// in a single operation. This comes in handy for decommissioning a + /// multicast group entirely on this sled. + /// + /// If the M2P mapping for the group has already been removed, this + /// operation succeeds as a no-op. + McastUnsubscribeAll { + /// The overlay multicast group address (IPv4 or IPv6) + group: IpAddr, + }, + /// Add a new router entry, either IPv4 or IPv6. AddRouterEntry { #[command(flatten)] @@ -764,6 +862,64 @@ fn main() -> anyhow::Result<()> { hdl.clear_v2b(&req)?; } + Command::SetMcastFwd { underlay, next_hop, replication } => { + // OPTE routes to the next hop's unicast address to determine which + // underlay port to use via the illumos routing table and DDM. + // + // The packet is then sent to the multicast address with a multicast + // MAC. + // + // The switch matches on the outer dst IP (multicast) and Geneve + // `Replication` tag to determine which port groups to replicate to: + // - External: front panel ports (which get decapped on egress) + // - Underlay: underlay ports (sleds) + // - Both: both (bifurcated) + // + // The Replication type is Tx-only, Rx ignores it and delivers + // locally based on subscriptions. 
+ // + // Like `swadm route add`, this command can be run multiple times + // with the same underlay address to add multiple next hops. If the + // same next hop is specified again, its replication mode is updated. + + // Always use fleet-wide DEFAULT_MULTICAST_VNI + let next_hop_vni = Vni::new(DEFAULT_MULTICAST_VNI).unwrap(); + let next_hop_addr = NextHopV6::new(next_hop, next_hop_vni); + let req = SetMcastForwardingReq { + underlay, + next_hops: vec![(next_hop_addr, replication)], + }; + hdl.set_mcast_fwd(&req)?; + } + + Command::ClearMcastFwd { underlay } => { + let req = ClearMcastForwardingReq { underlay }; + hdl.clear_mcast_fwd(&req)?; + } + + Command::DumpMcastFwd => { + print_mcast_fwd(&hdl.dump_mcast_fwd()?)?; + } + + Command::DumpMcastSubs => { + print_mcast_subs(&hdl.dump_mcast_subs()?)?; + } + + Command::McastSubscribe { port, group } => { + let req = McastSubscribeReq { port_name: port, group }; + hdl.mcast_subscribe(&req)?; + } + + Command::McastUnsubscribe { port, group } => { + let req = McastUnsubscribeReq { port_name: port, group }; + hdl.mcast_unsubscribe(&req)?; + } + + Command::McastUnsubscribeAll { group } => { + let req = McastUnsubscribeAllReq { group }; + hdl.mcast_unsubscribe_all(&req)?; + } + Command::AddRouterEntry { route: RouterRule { port, dest, target, class }, } => { diff --git a/crates/illumos-sys-hdrs/src/kernel.rs b/crates/illumos-sys-hdrs/src/kernel.rs index 9ac0c26b..c0d854d4 100644 --- a/crates/illumos-sys-hdrs/src/kernel.rs +++ b/crates/illumos-sys-hdrs/src/kernel.rs @@ -500,6 +500,8 @@ unsafe extern "C" { pub fn freemsg(mp: *mut mblk_t); pub fn freemsgchain(mp: *mut mblk_t); + pub fn msgpullup(mp: *mut mblk_t, n_bytes: isize) -> *mut mblk_t; + pub fn gethrtime() -> hrtime_t; pub fn getmajor(dev: dev_t) -> major_t; diff --git a/crates/opte-api/src/cmd.rs b/crates/opte-api/src/cmd.rs index 5c0f9986..d69a0a8a 100644 --- a/crates/opte-api/src/cmd.rs +++ b/crates/opte-api/src/cmd.rs @@ -25,31 +25,40 @@ pub const XDE_IOC_OPTE_CMD: 
i32 = XDE_IOC as i32 | 0x01; #[derive(Clone, Copy, Debug)] #[repr(C)] pub enum OpteCmd { - ListPorts = 1, // list all ports - AddFwRule = 20, // add firewall rule - RemFwRule = 21, // remove firewall rule - SetFwRules = 22, // set/replace all firewall rules at once - DumpTcpFlows = 30, // dump TCP flows - DumpLayer = 31, // dump the specified Layer - DumpUft = 32, // dump the Unified Flow Table - ListLayers = 33, // list the layers on a given port - ClearUft = 40, // clear the UFT - ClearLft = 41, // clear the given Layer's Flow Table - SetVirt2Phys = 50, // set a v2p mapping - DumpVirt2Phys = 51, // dump the v2p mappings - SetVirt2Boundary = 52, // set a v2b mapping - ClearVirt2Boundary = 53, // clear a v2b mapping - DumpVirt2Boundary = 54, // dump the v2b mappings - ClearVirt2Phys = 55, // clear a v2p mapping - AddRouterEntry = 60, // add a router entry for IP dest - DelRouterEntry = 61, // remove a router entry for IP dest - CreateXde = 70, // create a new xde device - DeleteXde = 71, // delete an xde device - SetXdeUnderlay = 72, // set xde underlay devices - ClearXdeUnderlay = 73, // clear xde underlay devices - SetExternalIps = 80, // set xde external IPs for a port - AllowCidr = 90, // allow ip block through gateway tx/rx - RemoveCidr = 91, // deny ip block through gateway tx/rx + ListPorts = 1, // list all ports + AddFwRule = 20, // add firewall rule + RemFwRule = 21, // remove firewall rule + SetFwRules = 22, // set/replace all firewall rules at once + DumpTcpFlows = 30, // dump TCP flows + DumpLayer = 31, // dump the specified Layer + DumpUft = 32, // dump the Unified Flow Table + ListLayers = 33, // list the layers on a given port + ClearUft = 40, // clear the UFT + ClearLft = 41, // clear the given Layer's Flow Table + SetVirt2Phys = 50, // set a v2p mapping + DumpVirt2Phys = 51, // dump the v2p mappings + SetVirt2Boundary = 52, // set a v2b mapping + ClearVirt2Boundary = 53, // clear a v2b mapping + DumpVirt2Boundary = 54, // dump the v2b mappings + 
ClearVirt2Phys = 55, // clear a v2p mapping + AddRouterEntry = 60, // add a router entry for IP dest + DelRouterEntry = 61, // remove a router entry for IP dest + CreateXde = 70, // create a new xde device + DeleteXde = 71, // delete an xde device + SetXdeUnderlay = 72, // set xde underlay devices + ClearXdeUnderlay = 73, // clear xde underlay devices + SetExternalIps = 80, // set xde external IPs for a port + AllowCidr = 90, // allow ip block through gateway tx/rx + RemoveCidr = 91, // deny ip block through gateway tx/rx + SetMcastForwarding = 100, // set multicast forwarding entries + ClearMcastForwarding = 101, // clear multicast forwarding entries + DumpMcastForwarding = 102, // dump multicast forwarding table + McastSubscribe = 103, // subscribe a port to a multicast group + McastUnsubscribe = 104, // unsubscribe a port from a multicast group + SetMcast2Phys = 105, // set M2P mapping (group -> underlay mcast) + ClearMcast2Phys = 106, // clear M2P mapping + DumpMcastSubscriptions = 107, // dump multicast subscription table + McastUnsubscribeAll = 108, // unsubscribe all ports from a multicast group } impl TryFrom for OpteCmd { @@ -82,6 +91,15 @@ impl TryFrom for OpteCmd { 80 => Ok(Self::SetExternalIps), 90 => Ok(Self::AllowCidr), 91 => Ok(Self::RemoveCidr), + 100 => Ok(Self::SetMcastForwarding), + 101 => Ok(Self::ClearMcastForwarding), + 102 => Ok(Self::DumpMcastForwarding), + 103 => Ok(Self::McastSubscribe), + 104 => Ok(Self::McastUnsubscribe), + 105 => Ok(Self::SetMcast2Phys), + 106 => Ok(Self::ClearMcast2Phys), + 107 => Ok(Self::DumpMcastSubscriptions), + 108 => Ok(Self::McastUnsubscribeAll), _ => Err(()), } } @@ -177,6 +195,7 @@ pub enum OpteError { dest: IpCidr, target: String, }, + InvalidUnderlayMulticast(String), LayerNotFound(String), MacExists { port: String, @@ -230,6 +249,7 @@ impl OpteError { Self::DeserCmdReq(_) => ENOMSG, Self::FlowExists(_) => EEXIST, Self::InvalidRouterEntry { .. 
} => EINVAL, + Self::InvalidUnderlayMulticast(_) => EINVAL, Self::LayerNotFound(_) => ENOENT, Self::MacExists { .. } => EEXIST, Self::MaxCapacity(_) => ENFILE, diff --git a/crates/opte-api/src/ip.rs b/crates/opte-api/src/ip.rs index 20fffaaa..28480f23 100644 --- a/crates/opte-api/src/ip.rs +++ b/crates/opte-api/src/ip.rs @@ -141,7 +141,7 @@ impl Display for DhcpReplyType { } } -/// Map a subnet to its next-hop. +/// Map a subnet to its next hop. #[derive(Clone, Copy, Debug)] pub struct SubnetRouterPair { pub subnet: Ipv4Cidr, @@ -307,6 +307,15 @@ pub enum IpAddr { Ip6(Ipv6Addr), } +impl IpAddr { + pub const fn is_multicast(&self) -> bool { + match self { + IpAddr::Ip4(v4) => v4.is_multicast(), + IpAddr::Ip6(v6) => v6.is_multicast(), + } + } +} + impl From for IpAddr { fn from(ipv4: Ipv4Addr) -> Self { IpAddr::Ip4(ipv4) @@ -431,6 +440,10 @@ impl Ipv4Addr { // u32. u32::from_be_bytes(self.bytes()).to_be() } + + pub const fn is_multicast(&self) -> bool { + matches!(self.inner[0], 224..240) + } } impl From for Ipv4Addr { @@ -640,6 +653,29 @@ impl Ipv6Addr { self.inner[0] == 0xFF } + /// Return `true` if this is a multicast IPv6 address with the ff04::/16 prefix + /// (admin-local scope with flags=0) as used by Omicron for underlay multicast. + /// + /// This specifically checks for the ff04::/16 prefix where: + /// - First byte: 0xFF (all multicast addresses) + /// - Second byte: 0x04 (flags=0, scope=4 admin-local) + /// + /// See [RFC 7346] for details on IPv6 multicast address scopes. + /// + /// Omicron allocates multicast addresses from a /64 subnet within ff04::/16 + /// for underlay multicast traffic. Specific underlay IPv6 addresses are sent + /// from Omicron, with uniqueness guaranteed within the allocated /64 subnet. 
+ /// + /// [RFC 7346]: https://www.rfc-editor.org/rfc/rfc7346.html + pub const fn is_admin_scoped_multicast(&self) -> bool { + if !self.is_multicast() { + return false; + } + + // Check for ff04::/16 prefix only + self.inner[1] == 0x04 + } + /// Return the bytes of the address. pub fn bytes(&self) -> [u8; 16] { self.inner @@ -801,6 +837,92 @@ impl Deref for Ipv6Addr { } } +/// Newtype for underlay IPv6 multicast addresses. +/// +/// This newtype wraps admin-scoped (ff04::/16) IPv6 multicast addresses +/// used for underlay multicast delivery. +#[derive( + Copy, + Clone, + Debug, + Eq, + PartialEq, + Ord, + PartialOrd, + Hash, + Serialize, + Deserialize, +)] +#[serde(try_from = "Ipv6Addr", into = "Ipv6Addr")] +pub struct MulticastUnderlay(Ipv6Addr); + +impl MulticastUnderlay { + /// Create a new `MulticastUnderlay` from an IPv6 address. + /// + /// Returns an error if the address is not an admin-scoped multicast address + /// (ff04::/16 prefix). + pub fn new(addr: Ipv6Addr) -> Result<Self, String> { + if !addr.is_admin_scoped_multicast() { + return Err(format!( + "address must be admin-scoped IPv6 multicast (ff04::/16), got: {addr}" + )); + } + Ok(Self(addr)) + } + + /// Create a new `MulticastUnderlay` without validation. + /// + /// Invariant: The caller must ensure that `addr` is an admin-scoped IPv6 + /// multicast address (ff04::/16). Passing any other address is a logic + /// error that breaks the type's invariant (it is not memory-unsafe). + /// + /// This is intended for cases where validation has already been performed + /// (e.g., after an explicit `is_admin_scoped_multicast()` check) to avoid + /// redundant validation overhead. + #[inline] + pub const fn new_unchecked(addr: Ipv6Addr) -> Self { + Self(addr) + } + + /// Get the inner IPv6 address. + pub fn addr(&self) -> Ipv6Addr { + self.0 + } +} + +impl FromStr for MulticastUnderlay { + type Err = String; + + /// Parse an IPv6 address string and validate it's admin-scoped multicast.
+ /// + /// Returns an error if the address is not a valid IPv6 address or if it's + /// not an admin-scoped multicast address (ff04::/16). + fn from_str(val: &str) -> result::Result<Self, Self::Err> { + let addr = val.parse::<Ipv6Addr>()?; + Self::new(addr) + } +} + +impl Display for MulticastUnderlay { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.0) + } +} + +impl TryFrom<Ipv6Addr> for MulticastUnderlay { + type Error = String; + + fn try_from(addr: Ipv6Addr) -> result::Result<Self, Self::Error> { + Self::new(addr) + } +} + +impl From<MulticastUnderlay> for Ipv6Addr { + fn from(underlay: MulticastUnderlay) -> Self { + underlay.0 + } +} + /// An IPv4 or IPv6 CIDR. #[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)] pub enum IpCidr { @@ -989,6 +1111,12 @@ impl Display for Ipv4Cidr { } impl Ipv4Cidr { + /// IPv4 multicast address range, `224.0.0.0/4`. + pub const MCAST: Self = Self { + ip: Ipv4Addr::from_const([224, 0, 0, 0]), + prefix_len: Ipv4PrefixLen(4), + }; + pub fn ip(&self) -> Ipv4Addr { self.parts().0 } @@ -1146,6 +1274,18 @@ impl Ipv6Cidr { prefix_len: Ipv6PrefixLen(64), }; + /// IPv6 multicast address range, `ff00::/8`. + pub const MCAST: Self = Self { + ip: Ipv6Addr::from_const([0xff00, 0, 0, 0, 0, 0, 0, 0]), + prefix_len: Ipv6PrefixLen(8), + }; + + /// IPv6 admin-local multicast scope prefix, `ff04::/16`.
+ pub const MCAST_ADMIN_LOCAL: Self = Self { + ip: Ipv6Addr::from_const([0xff04, 0, 0, 0, 0, 0, 0, 0]), + prefix_len: Ipv6PrefixLen(16), + }; + pub fn new(ip: Ipv6Addr, prefix_len: Ipv6PrefixLen) -> Self { let ip = ip.safe_mask(prefix_len); Ipv6Cidr { ip, prefix_len } @@ -1468,6 +1608,27 @@ mod test { assert_eq!(addr.solicited_node_multicast(), expected); } + #[test] + fn test_ipv6_admin_scoped_multicast() { + // Test ff04::/16 prefix (admin-local scope used by Omicron) + assert!(to_ipv6("ff04::1").is_admin_scoped_multicast()); + assert!(to_ipv6("ff04:1234:5678:9abc::1").is_admin_scoped_multicast()); + + // Test other administrative scopes (NOT accepted) + assert!(!to_ipv6("ff05::1").is_admin_scoped_multicast()); // site-local + assert!(!to_ipv6("ff08::1").is_admin_scoped_multicast()); // organization-local + + // Test non-admin scoped multicast addresses + assert!(!to_ipv6("ff01::1").is_admin_scoped_multicast()); // interface-local + assert!(!to_ipv6("ff02::1").is_admin_scoped_multicast()); // link-local + assert!(!to_ipv6("ff0e::1").is_admin_scoped_multicast()); // global + + // Test non-multicast addresses + assert!(!to_ipv6("fd00::1").is_admin_scoped_multicast()); // ULA + assert!(!to_ipv6("fe80::1").is_admin_scoped_multicast()); // link-local unicast + assert!(!to_ipv6("2001:db8::1").is_admin_scoped_multicast()); // global unicast + } + #[test] fn dhcp_fqdn() { let no_host = DhcpCfg { hostname: None, ..Default::default() }; @@ -1498,4 +1659,34 @@ mod test { domain_no_host.push_fqdn(&mut space); assert!(space.is_empty()); } + + #[test] + fn test_multicast_underlay_serde() { + // Test valid admin-scoped address (ff04::/16) + let valid_addr = to_ipv6("ff04::1"); + let underlay = MulticastUnderlay::new(valid_addr).unwrap(); + + // Serialize with postcard (the serialization format used in opte-api) + let serialized = postcard::to_allocvec(&underlay).unwrap(); + + // Deserialize - should succeed + let deserialized: MulticastUnderlay = + 
postcard::from_bytes(&serialized).unwrap(); + assert_eq!(deserialized.addr(), valid_addr); + + // Test invalid address (not admin-scoped) - should fail deserialization + let invalid_addr = to_ipv6("ff05::1"); // site-local, not admin-scoped + let serialized_invalid = postcard::to_allocvec(&invalid_addr).unwrap(); + let result: Result<MulticastUnderlay, _> = + postcard::from_bytes(&serialized_invalid); + assert!(result.is_err()); + + // Test non-multicast address - should fail deserialization + let non_mcast_addr = to_ipv6("fd00::1"); + let serialized_non_mcast = + postcard::to_allocvec(&non_mcast_addr).unwrap(); + let result: Result<MulticastUnderlay, _> = + postcard::from_bytes(&serialized_non_mcast); + assert!(result.is_err()); + } } diff --git a/crates/opte-api/src/lib.rs b/crates/opte-api/src/lib.rs index 7176e7a5..558a6e41 100644 --- a/crates/opte-api/src/lib.rs +++ b/crates/opte-api/src/lib.rs @@ -51,7 +51,7 @@ pub use ulp::*; /// /// We rely on CI and the check-api-version.sh script to verify that /// this number is incremented anytime the oxide-api code changes. -pub const API_VERSION: u64 = 37; +pub const API_VERSION: u64 = 38; /// Major version of the OPTE package. pub const MAJOR_VERSION: u64 = 0; diff --git a/crates/opte-api/src/mac.rs b/crates/opte-api/src/mac.rs index 1818a997..1134ae6d 100644 --- a/crates/opte-api/src/mac.rs +++ b/crates/opte-api/src/mac.rs @@ -55,6 +55,19 @@ impl MacAddr { pub const fn from_const(bytes: [u8; 6]) -> Self { Self { inner: bytes } } + + /// Return whether this MAC address is a group address (I/G bit set). + /// + /// Per IEEE 802, the I/G (Individual/Group) bit is the LSB of the first octet. + /// When set to 1, the address is a group address, which includes both + /// multicast and broadcast (FF:FF:FF:FF:FF:FF) addresses. + /// + /// See [RFC 7042 §2.1] for details on IEEE 802 MAC address structure.
+ /// + /// [RFC 7042 §2.1]: https://www.rfc-editor.org/rfc/rfc7042#section-2.1 + pub const fn is_group(&self) -> bool { + (self.inner[0] & 0b0000_0001) != 0 + } } impl From for smoltcp::wire::EthernetAddress { diff --git a/dtrace/README.adoc b/dtrace/README.adoc index 400d1f44..276672bf 100644 --- a/dtrace/README.adoc +++ b/dtrace/README.adoc @@ -64,7 +64,15 @@ a|`opte-rule-match.d` a|`opte-tcp-flow-state.d` |Track the TCP flow state changes as they happen. Printing the state - transition as well as the flow ID. +transition as well as the flow ID. + +a|`opte-mcast-delivery.d` +|Track multicast Tx/Rx, local same-sled delivery, underlay forwarding, and + external forwarding. Also tracks multicast control-plane operations (map + set/clear, fwd set/clear, subscribe/unsubscribe, and dumps) to help correlate + config changes with dataplane events. Optional toggles are in the script's + BEGIN block: `flow_debug` (adds xde_mc_tx entry/return), `suppress_output` + (suppress per-event output), and `show_summary` (show aggregations at END). a|`opte-uft-invalidate.d` |Track Unified Flow Table invalidation as it happens. A UFT entry is diff --git a/dtrace/opte-mcast-delivery.d b/dtrace/opte-mcast-delivery.d new file mode 100644 index 00000000..7ed9d3c6 --- /dev/null +++ b/dtrace/opte-mcast-delivery.d @@ -0,0 +1,427 @@ +/* + * Track multicast packet delivery through OPTE/XDE. + * + * Usage: + * dtrace -L ./lib -I . 
-Cqs ./opte-mcast-delivery.d + * + * Configuration (set in BEGIN block): + * suppress_output = 1 - Suppress per-event output, show only aggregations + * flow_debug = 1 - Enable multicast Tx/Rx function entry/exit tracing + * show_summary = 1 - Show aggregated summary at END (default: enabled) + */ +#include "common.h" + +/* Local print formats (avoid colliding with common.h FLOW_FMT macros) */ +#define M_HDR_FMT "%-12s %-6s %-39s %-39s\n" +#define M_LINE_FMT "%-12s %-6u %-39s %-39s\n" +#define M_FWD_HDR_FMT "%-12s %-6s %-39s %-39s\n" +#define M_FWD_LINE_FMT "%-12s %-6u %-39s %-39s\n" +#define DBG_LINE_FMT "%-20s %-30s %s\n" + +/* Macro to reduce code duplication for group address formatting */ +#define MCAST_GROUP_STR(af, ptr) \ + ((af) == AF_INET ? inet_ntoa((ipaddr_t *)(ptr)) : \ + inet_ntoa6((in6_addr_t *)(ptr))) + +/* Configurable header reprint interval */ +#define HEADER_REPRINT_INTERVAL 10 + +/* + * OPTE command numbers for multicast-related ioctls (see crates/opte-api/src/cmd.rs). 
+ */ +#define CMD_SET_MCAST_FWD 100 +#define CMD_CLEAR_MCAST_FWD 101 +#define CMD_DUMP_MCAST_FWD 102 +#define CMD_MCAST_SUBSCRIBE 103 +#define CMD_MCAST_UNSUBSCRIBE 104 +#define CMD_SET_M2P 105 +#define CMD_CLEAR_M2P 106 +#define CMD_DUMP_MCAST_SUBS 107 +#define CMD_MCAST_UNSUBSCRIBE_ALL 108 + +BEGIN { + flow_debug = 0; /* Set to 1 to enable detailed flow debugging */ + suppress_output = 0; /* Set to 1 to suppress per-event output (aggregations only) */ + show_summary = 1; /* Set to 1 to show aggregated summary at END */ + + num = 0; + + printf("OPTE Multicast Delivery Tracker\n"); + printf("Configuration:\n"); + printf(" flow_debug = %d\n", flow_debug); + printf(" suppress_output = %d\n", suppress_output); + printf(" show_summary = %d\n", show_summary); + printf("\n"); +} + +BEGIN +/!suppress_output/ +{ + printf(M_HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP"); +} + +/* Multicast Tx function entry/exit (optional detailed debugging) */ +xde_mc_tx:entry +/flow_debug/ +{ + printf(DBG_LINE_FMT, "xde_mc_tx-entry", "", ""); +} + +xde_mc_tx:return +/flow_debug/ +{ + printf(DBG_LINE_FMT, "xde_mc_tx-return", "", ""); +} + +mcast-tx { + /* arg0=af, arg1=addr_ptr, arg2=vni */ + this->af = arg0; + this->group_ptr = arg1; + this->vni = arg2; + this->group_str = MCAST_GROUP_STR(this->af, this->group_ptr); + + /* Always track aggregations (even when suppressing output) */ + @by_event["TX"] = count(); + @by_vni["TX", this->vni] = count(); + @by_group["TX", this->group_str] = count(); +} + +mcast-tx +/!suppress_output/ +{ + if (num >= HEADER_REPRINT_INTERVAL) { + printf(M_HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP"); + num = 0; + } + + printf(M_LINE_FMT, "TX", this->vni, this->group_str, "-"); + num++; +} + +mcast-rx { + /* arg0=af, arg1=addr_ptr, arg2=vni */ + this->af = arg0; + this->group_ptr = arg1; + this->vni = arg2; + this->group_str = MCAST_GROUP_STR(this->af, this->group_ptr); + + /* Always track aggregations (even when suppressing output) */ + @by_event["RX"] = 
count(); + @by_vni["RX", this->vni] = count(); + @by_group["RX", this->group_str] = count(); +} + +mcast-rx +/!suppress_output/ +{ + if (num >= HEADER_REPRINT_INTERVAL) { + printf(M_HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP"); + num = 0; + } + + printf(M_LINE_FMT, "RX", this->vni, this->group_str, "-"); + num++; +} + +mcast-local-delivery { + /* arg0=af, arg1=addr_ptr, arg2=vni, arg3=port */ + this->af = arg0; + this->group_ptr = arg1; + this->vni = arg2; + this->port = stringof(arg3); + this->group_str = MCAST_GROUP_STR(this->af, this->group_ptr); + + /* Always track aggregations (even when suppressing output) */ + @by_event["DELIVER"] = count(); + @by_vni["DELIVER", this->vni] = count(); + @by_port[this->port] = count(); + @by_group["DELIVER", this->group_str] = count(); +} + +mcast-local-delivery +/!suppress_output/ +{ + if (num >= HEADER_REPRINT_INTERVAL) { + printf(M_HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP"); + num = 0; + } + + printf(M_LINE_FMT, "DELIVER", this->vni, this->group_str, this->port); + num++; +} + +mcast-underlay-fwd { + /* arg0=af, arg1=addr_ptr (underlay mcast), arg2=vni, arg3=next_hop (unicast switch) */ + this->af = arg0; + this->underlay_ptr = arg1; + this->vni = arg2; + this->next_hop_unicast = (in6_addr_t *)arg3; + this->underlay_str = MCAST_GROUP_STR(this->af, this->underlay_ptr); + this->next_hop_str = inet_ntoa6(this->next_hop_unicast); + + /* Always track aggregations (even when suppressing output) */ + @by_event["UNDERLAY"] = count(); + @by_vni["UNDERLAY", this->vni] = count(); + @by_underlay["UNDERLAY", this->underlay_str] = count(); + @by_nexthop_unicast[this->next_hop_str] = count(); +} + +mcast-underlay-fwd +/!suppress_output/ +{ + if (num >= HEADER_REPRINT_INTERVAL) { + printf(M_FWD_HDR_FMT, "EVENT", "VNI", "UNDERLAY_MCAST", "ROUTE_UNICAST"); + num = 0; + } + + printf(M_FWD_LINE_FMT, "UNDERLAY", this->vni, this->underlay_str, this->next_hop_str); + num++; +} + +mcast-external-fwd { + /* arg0=af, arg1=addr_ptr 
(underlay mcast), arg2=vni, arg3=next_hop (unicast switch) */ + this->af = arg0; + this->underlay_ptr = arg1; + this->vni = arg2; + this->next_hop_unicast = (in6_addr_t *)arg3; + this->underlay_str = MCAST_GROUP_STR(this->af, this->underlay_ptr); + this->next_hop_str = inet_ntoa6(this->next_hop_unicast); + + /* Always track aggregations (even when suppressing output) */ + @by_event["EXTERNAL"] = count(); + @by_vni["EXTERNAL", this->vni] = count(); + @by_underlay["EXTERNAL", this->underlay_str] = count(); + @by_nexthop_unicast[this->next_hop_str] = count(); +} + +mcast-external-fwd +/!suppress_output/ +{ + if (num >= HEADER_REPRINT_INTERVAL) { + printf(M_FWD_HDR_FMT, "EVENT", "VNI", "UNDERLAY_MCAST", "ROUTE_UNICAST"); + num = 0; + } + + printf(M_FWD_LINE_FMT, "EXTERNAL", this->vni, this->underlay_str, this->next_hop_str); + num++; +} + +/* Control-plane config operations via ioctl */ +xde_ioc_opte_cmd:entry +{ + this->ioc = (opte_cmd_ioctl_t *)arg0; + this->cmd = this->ioc->cmd; + /* Only track multicast-related commands */ + this->name = + this->cmd == CMD_SET_M2P ? "CFG SET_M2P" : + this->cmd == CMD_CLEAR_M2P ? "CFG CLEAR_M2P" : + this->cmd == CMD_SET_MCAST_FWD ? "CFG SET_FWD" : + this->cmd == CMD_CLEAR_MCAST_FWD ? "CFG CLEAR_FWD" : + this->cmd == CMD_DUMP_MCAST_FWD ? "CFG DUMP_FWD" : + this->cmd == CMD_DUMP_MCAST_SUBS ? "CFG DUMP_SUBS" : + this->cmd == CMD_MCAST_SUBSCRIBE ? "CFG SUBSCRIBE" : + this->cmd == CMD_MCAST_UNSUBSCRIBE ? "CFG UNSUBSCRIBE" : + this->cmd == CMD_MCAST_UNSUBSCRIBE_ALL ? 
"CFG UNSUB_ALL" : + NULL; + + /* Always track aggregations for multicast ops */ + if (this->name != NULL) { + @cfg_counts[this->name] = count(); + } +} + +xde_ioc_opte_cmd:entry +/!suppress_output && this->name != NULL/ +{ + printf(DBG_LINE_FMT, this->name, "", ""); +} + +/* Dedicated control-plane probes (if present) */ +mcast-map-set { + /* arg0=af, arg1=group_ptr, arg2=underlay_ptr, arg3=vni */ + this->af = arg0; + this->group_ptr = arg1; + this->underlay = (in6_addr_t *)arg2; + this->vni = arg3; + + /* Always track aggregations */ + @cfg_counts["MAP_SET"] = count(); +} + +mcast-map-set +/!suppress_output/ +{ + this->group = MCAST_GROUP_STR(this->af, this->group_ptr); + this->ul = inet_ntoa6(this->underlay); + printf(M_LINE_FMT, "CFG MAP-SET", this->vni, this->group, this->ul); +} + +mcast-map-clear { + /* arg0=af, arg1=group_ptr, arg2=underlay_ptr, arg3=vni */ + this->af = arg0; + this->group_ptr = arg1; + this->underlay = (in6_addr_t *)arg2; + this->vni = arg3; + + /* Always track aggregations */ + @cfg_counts["MAP_CLEAR"] = count(); +} + +mcast-map-clear +/!suppress_output/ +{ + this->group = MCAST_GROUP_STR(this->af, this->group_ptr); + this->ul = inet_ntoa6(this->underlay); + printf(M_LINE_FMT, "CFG MAP-CLEAR", this->vni, this->group, this->ul); +} + +mcast-fwd-set { + /* arg0=underlay_ptr, arg1=count, arg2=vni */ + this->underlay = (in6_addr_t *)arg0; + this->count = arg1; + this->vni = arg2; + + /* Always track aggregations */ + @cfg_counts["FWD_SET"] = count(); +} + +mcast-fwd-set +/!suppress_output/ +{ + this->ul = inet_ntoa6(this->underlay); + printf(M_LINE_FMT, "CFG FWD-SET", this->vni, "-", this->ul); +} + +mcast-fwd-clear { + /* arg0=underlay_ptr, arg1=vni */ + this->underlay = (in6_addr_t *)arg0; + this->vni = arg1; + + /* Always track aggregations */ + @cfg_counts["FWD_CLEAR"] = count(); +} + +mcast-fwd-clear +/!suppress_output/ +{ + this->ul = inet_ntoa6(this->underlay); + printf(M_LINE_FMT, "CFG FWD-CLEAR", this->vni, "-", this->ul); +} + 
+mcast-subscribe { + /* arg0=port_cstr, arg1=af, arg2=group_ptr, arg3=vni */ + this->port = stringof(arg0); + this->af = arg1; + this->group_ptr = arg2; + this->vni = arg3; + + /* Always track aggregations */ + @cfg_counts["SUBSCRIBE"] = count(); +} + +mcast-subscribe +/!suppress_output/ +{ + this->group = MCAST_GROUP_STR(this->af, this->group_ptr); + printf(M_LINE_FMT, "SUBSCRIBE", this->vni, this->group, this->port); +} + +mcast-unsubscribe { + /* arg0=port_cstr, arg1=af, arg2=group_ptr, arg3=vni */ + this->port = stringof(arg0); + this->af = arg1; + this->group_ptr = arg2; + this->vni = arg3; + + /* Always track aggregations */ + @cfg_counts["UNSUBSCRIBE"] = count(); +} + +mcast-unsubscribe +/!suppress_output/ +{ + this->group = MCAST_GROUP_STR(this->af, this->group_ptr); + printf(M_LINE_FMT, "UNSUBSCR", this->vni, this->group, this->port); +} + +mcast-unsubscribe-all { + /* arg0=af, arg1=group_ptr, arg2=vni */ + this->af = arg0; + this->group_ptr = arg1; + this->vni = arg2; + + /* Always track aggregations */ + @cfg_counts["UNSUB_ALL"] = count(); +} + +mcast-unsubscribe-all +/!suppress_output/ +{ + this->group = MCAST_GROUP_STR(this->af, this->group_ptr); + printf(M_LINE_FMT, "UNSUB_ALL", this->vni, this->group, "ALL"); +} + +/* Dataplane failure probes */ +mcast-tx-pullup-fail { + /* arg0=len */ + this->len = arg0; + + /* Always track aggregations */ + @by_event["TX_FAIL"] = count(); +} + +mcast-tx-pullup-fail +/!suppress_output/ +{ + printf(M_LINE_FMT, "TX_FAIL", 0, "-", "-"); +} + +mcast-rx-pullup-fail { + /* arg0=len */ + this->len = arg0; + + /* Always track aggregations */ + @by_event["RX_FAIL"] = count(); +} + +mcast-rx-pullup-fail +/!suppress_output/ +{ + printf(M_LINE_FMT, "RX_FAIL", 0, "-", "-"); +} + +mcast-no-fwd-entry { + /* arg0=underlay_ptr, arg1=vni */ + this->underlay = (in6_addr_t *)arg0; + this->vni = arg1; + + /* Always track aggregations */ + @by_event["NOFWD"] = count(); +} + +mcast-no-fwd-entry +/!suppress_output/ +{ + this->ul = 
inet_ntoa6(this->underlay); + printf(M_LINE_FMT, "NOFWD", this->vni, "-", this->ul); +} + +/* Print aggregated summary when the script ends (if enabled) */ +END +/show_summary/ +{ + printf("\nSummary by event:\n"); + printa(@by_event); + printf("\nSummary by event and VNI:\n"); + printa(@by_vni); + printf("\nSummary by overlay group (TX/RX/DELIVER):\n"); + printa(@by_group); + printf("\nSummary by underlay multicast address (UNDERLAY/EXTERNAL):\n"); + printa(@by_underlay); + printf("\nLocal delivery by port:\n"); + printa(@by_port); + printf("\nForwarding by unicast next hop (routing address):\n"); + printa(@by_nexthop_unicast); + printf("\nConfig ops:\n"); + printa(@cfg_counts); +} diff --git a/lib/opte-ioctl/src/lib.rs b/lib/opte-ioctl/src/lib.rs index c896ce4b..510fc9a1 100644 --- a/lib/opte-ioctl/src/lib.rs +++ b/lib/opte-ioctl/src/lib.rs @@ -27,6 +27,8 @@ use opte::api::XDE_IOC_OPTE_CMD; use oxide_vpc::api::AddFwRuleReq; use oxide_vpc::api::AddRouterEntryReq; use oxide_vpc::api::AllowCidrReq; +use oxide_vpc::api::ClearMcast2PhysReq; +use oxide_vpc::api::ClearMcastForwardingReq; use oxide_vpc::api::ClearVirt2BoundaryReq; use oxide_vpc::api::ClearVirt2PhysReq; use oxide_vpc::api::CreateXdeReq; @@ -34,15 +36,22 @@ use oxide_vpc::api::DelRouterEntryReq; use oxide_vpc::api::DelRouterEntryResp; use oxide_vpc::api::DeleteXdeReq; use oxide_vpc::api::DhcpCfg; +use oxide_vpc::api::DumpMcastForwardingResp; +use oxide_vpc::api::DumpMcastSubscriptionsResp; use oxide_vpc::api::DumpVirt2BoundaryResp; use oxide_vpc::api::DumpVirt2PhysResp; use oxide_vpc::api::IpCidr; use oxide_vpc::api::ListPortsResp; +use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastUnsubscribeAllReq; +use oxide_vpc::api::McastUnsubscribeReq; use oxide_vpc::api::RemFwRuleReq; use oxide_vpc::api::RemoveCidrReq; use oxide_vpc::api::RemoveCidrResp; use oxide_vpc::api::SetExternalIpsReq; use oxide_vpc::api::SetFwRulesReq; +use oxide_vpc::api::SetMcast2PhysReq; +use 
oxide_vpc::api::SetMcastForwardingReq; use oxide_vpc::api::SetVirt2BoundaryReq; use oxide_vpc::api::SetVirt2PhysReq; use oxide_vpc::api::VpcCfg; @@ -205,6 +214,16 @@ impl OpteHdl { run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) } + pub fn set_m2p(&self, req: &SetMcast2PhysReq) -> Result { + let cmd = OpteCmd::SetMcast2Phys; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + + pub fn clear_m2p(&self, req: &ClearMcast2PhysReq) -> Result { + let cmd = OpteCmd::ClearMcast2Phys; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + pub fn set_v2b(&self, req: &SetVirt2BoundaryReq) -> Result { let cmd = OpteCmd::SetVirt2Boundary; run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) @@ -224,6 +243,63 @@ impl OpteHdl { run_cmd_ioctl(self.device.as_raw_fd(), cmd, None::<&()>) } + /// Set a multicast forwarding entry. + pub fn set_mcast_fwd( + &self, + req: &SetMcastForwardingReq, + ) -> Result { + let cmd = OpteCmd::SetMcastForwarding; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + + /// Clear a multicast forwarding entry. + pub fn clear_mcast_fwd( + &self, + req: &ClearMcastForwardingReq, + ) -> Result { + let cmd = OpteCmd::ClearMcastForwarding; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + + /// Dump the multicast forwarding table. + pub fn dump_mcast_fwd(&self) -> Result { + let cmd = OpteCmd::DumpMcastForwarding; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, None::<&()>) + } + + /// Dump the multicast subscription table (group -> ports on this sled). + pub fn dump_mcast_subs(&self) -> Result { + let cmd = OpteCmd::DumpMcastSubscriptions; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, None::<&()>) + } + + /// Subscribe a port to a multicast group. + pub fn mcast_subscribe( + &self, + req: &McastSubscribeReq, + ) -> Result { + let cmd = OpteCmd::McastSubscribe; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + + /// Unsubscribe a port from a multicast group. 
+ pub fn mcast_unsubscribe( + &self, + req: &McastUnsubscribeReq, + ) -> Result { + let cmd = OpteCmd::McastUnsubscribe; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + + /// Unsubscribe all ports from a multicast group. + pub fn mcast_unsubscribe_all( + &self, + req: &McastUnsubscribeAllReq, + ) -> Result { + let cmd = OpteCmd::McastUnsubscribeAll; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + /// Set xde underlay devices. pub fn set_xde_underlay( &self, diff --git a/lib/opte-test-utils/Cargo.toml b/lib/opte-test-utils/Cargo.toml index 2236b8a8..0163aa46 100644 --- a/lib/opte-test-utils/Cargo.toml +++ b/lib/opte-test-utils/Cargo.toml @@ -10,6 +10,7 @@ repository.workspace = true usdt = ["oxide-vpc/usdt"] [dependencies] +anyhow.workspace = true opte = { workspace = true, features = ["std"] } oxide-vpc = { workspace = true, features = ["engine", "std", "test-help"] } pcap-parser = { workspace = true, features = ["serialize"] } diff --git a/lib/opte-test-utils/src/geneve_verify.rs b/lib/opte-test-utils/src/geneve_verify.rs new file mode 100644 index 00000000..8d193228 --- /dev/null +++ b/lib/opte-test-utils/src/geneve_verify.rs @@ -0,0 +1,185 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2025 Oxide Computer Company + +//! Module to parse and verify Geneve headers from snoop hex output. +//! +//! This uses the existing OPTE/ingot Geneve types to parse raw packet bytes +//! and extract key multicast-related fields for test assertions. 
+ +use anyhow::Context; +use anyhow::Result; +use anyhow::bail; +use opte::engine::geneve::Vni; +use opte::engine::ip::v6::Ipv6Ref; +use opte::engine::parse::ValidGeneveOverV6; +use opte::ingot::geneve::GeneveRef; +use opte::ingot::types::HeaderParse; +use oxide_vpc::api::Ipv6Addr; +use oxide_vpc::api::Replication; +use oxide_vpc::engine::geneve::extract_multicast_replication; + +/// Parsed Geneve header information for test verification. +pub struct GeneveInfo { + pub vni: Vni, + pub outer_ipv6_dst: Ipv6Addr, + pub replication: Option, +} + +/// Parse a Geneve/IPv6 packet from raw bytes and extract multicast-related +/// fields. +/// +/// Returns VNI, outer IPv6 destination, and replication mode from Geneve +/// options. +pub fn parse_geneve_packet(bytes: &[u8]) -> Result { + let (pkt, _, _) = ValidGeneveOverV6::parse(bytes) + .context("Failed to parse Geneve/IPv6 packet")?; + + let vni = pkt.outer_encap.vni(); + let outer_ipv6_dst = pkt.outer_v6.destination(); + let replication = extract_multicast_replication(&pkt.outer_encap); + + Ok(GeneveInfo { vni, outer_ipv6_dst, replication }) +} + +/// Parse hex string from snoop output into bytes. +/// +/// Snoop output with `-x0` flag is hex digits without separators: +/// "ffffffffffff001122334455..." +pub fn parse_snoop_hex(hex_str: &str) -> Result> { + hex_str + .as_bytes() + .chunks(2) + .map(|chunk| { + let hex_byte = + std::str::from_utf8(chunk).context("Invalid UTF-8")?; + u8::from_str_radix(hex_byte, 16).context("Invalid hex") + }) + .collect() +} + +/// Extract snoop hex output from command output. +/// +/// We support common `snoop -P -x0` formats: +/// - Lines of contiguous hex digits (with or without spaces). +/// - Hex dumps with an offset prefix like `0:` or `0000:` followed by +/// groups of hex digits (2/4/8/16 chars). +/// +/// To avoid false positives from summary lines (e.g., "UDP port 6081"), the +/// tokenized fallback triggers only for lines that look like offset-prefixed +/// hex dumps. 
+pub fn extract_snoop_hex(snoop_output: &str) -> Result { + let mut hex_bytes = String::new(); + + for line in snoop_output.lines() { + let trimmed = line.trim(); + if trimmed.is_empty() || trimmed.contains("Using device") { + continue; + } + + // Case 1: entire line is hex digits + whitespace (e.g., "aa bb cc ..." or + // single long line of hex). Remove whitespace and append. + if trimmed.chars().all(|c| c.is_ascii_hexdigit() || c.is_whitespace()) { + for ch in trimmed.chars().filter(|c| c.is_ascii_hexdigit()) { + hex_bytes.push(ch); + } + continue; + } + + // Case 2: offset-prefixed hexdump lines (e.g., "0: 4500 003c ..."). + // Only consider tokenized parsing if the first token looks like an + // offset (decimal or hex) ending with a ':' to avoid pulling numbers + // from summary lines. + let mut tokens = trimmed.split_whitespace(); + let Some(first) = tokens.next() else { continue }; + if !first.ends_with(':') { + continue; // Not a hexdump line + } + let mut off = first.trim_end_matches(':'); + if off.starts_with("0x") || off.starts_with("0X") { + off = &off[2..]; + } + if !off.chars().all(|c| c.is_ascii_hexdigit()) { + continue; // Not a valid offset + } + + for tok in tokens { + let mut t = tok.trim_end_matches(':'); + if t.len() > 2 && (t.starts_with("0x") || t.starts_with("0X")) { + t = &t[2..]; + } + if t.is_empty() { + continue; + } + // Accept groups commonly used in dumps: bytes (2), words (4), dwords (8), + // or qwords (16). Ignore anything else to avoid accidental matches. + let len = t.len(); + if matches!(len, 2 | 4 | 8 | 16) + && t.chars().all(|c| c.is_ascii_hexdigit()) + { + hex_bytes.push_str(t); + } + } + } + + if hex_bytes.is_empty() { + bail!("No hex data found in snoop output"); + } + + // Ensure even number of nibbles to form complete bytes. 
+ if hex_bytes.len() % 2 == 1 { + hex_bytes.pop(); + } + + Ok(hex_bytes) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn extract_contiguous_hex() { + let input = "deadbeefCAFEBABE"; + let out = extract_snoop_hex(input).unwrap(); + assert_eq!(out, "deadbeefCAFEBABE"); + let bytes = parse_snoop_hex(&out).unwrap(); + assert_eq!(bytes, vec![0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe, 0xba, 0xbe]); + } + + #[test] + fn extract_bytes_with_spaces() { + let input = "45 00 00 3c 1c 46 40 00"; + let out = extract_snoop_hex(input).unwrap(); + assert_eq!(out, "4500003c1c464000"); + } + + #[test] + fn extract_offset_words() { + let input = "0: 4500 003c 1c46 4000"; + let out = extract_snoop_hex(input).unwrap(); + assert_eq!(out, "4500003c1c464000"); + } + + #[test] + fn extract_offset_bytes() { + let input = "0: 45 00 00 3c 1c 46 40 00"; + let out = extract_snoop_hex(input).unwrap(); + assert_eq!(out, "4500003c1c464000"); + } + + #[test] + fn ignore_summary_numbers() { + let input = r#" +Using device xde_test_sim1 (promiscuous) +UDP: fe80::1 > ff04::224.1.2.3, port 6081 +0: 4500 003c 1c46 4000 +"#; + let out = extract_snoop_hex(input).unwrap(); + assert_eq!(out, "4500003c1c464000"); + // Should not accidentally include "6081" + assert!(!out.contains("6081")); + } +} diff --git a/lib/opte-test-utils/src/lib.rs b/lib/opte-test-utils/src/lib.rs index a4f3cb7b..bb128b44 100644 --- a/lib/opte-test-utils/src/lib.rs +++ b/lib/opte-test-utils/src/lib.rs @@ -10,6 +10,7 @@ #![allow(dead_code)] pub mod dhcp; +pub mod geneve_verify; pub mod icmp; pub mod pcap; #[macro_use] @@ -84,6 +85,7 @@ pub use oxide_vpc::engine::gateway; pub use oxide_vpc::engine::geneve::OxideOptionType; pub use oxide_vpc::engine::nat; pub use oxide_vpc::engine::overlay; +pub use oxide_vpc::engine::overlay::Mcast2Phys; pub use oxide_vpc::engine::overlay::TUNNEL_ENDPOINT_MAC; pub use oxide_vpc::engine::overlay::Virt2Boundary; pub use oxide_vpc::engine::overlay::Virt2Phys; @@ -254,6 +256,7 @@ fn 
oxide_net_builder( cfg: &oxide_vpc::cfg::VpcCfg, vpc_map: Arc, v2p: Arc, + m2p: Arc, v2b: Arc, ) -> PortBuilder { #[allow(clippy::arc_with_non_send_sync)] @@ -272,7 +275,7 @@ fn oxide_net_builder( .expect("failed to setup gateway layer"); router::setup(&pb, cfg, one_limit).expect("failed to add router layer"); nat::setup(&mut pb, cfg, snat_limit).expect("failed to add nat layer"); - overlay::setup(&pb, cfg, v2p, v2b, one_limit) + overlay::setup(&pb, cfg, v2p, m2p, v2b, one_limit) .expect("failed to add overlay layer"); pb } @@ -281,6 +284,7 @@ pub struct PortAndVps { pub port: Port, pub vps: VpcPortState, pub vpc_map: Arc, + pub m2p: Arc, pub cfg: oxide_vpc::cfg::VpcCfg, } @@ -346,6 +350,7 @@ pub fn oxide_net_setup2( let vpc_net = VpcNetwork { cfg: converted_cfg.clone() }; let uft_limit = flow_table_limits.unwrap_or(UFT_LIMIT.unwrap()); let tcp_limit = flow_table_limits.unwrap_or(TCP_LIMIT.unwrap()); + let m2p = Arc::new(Mcast2Phys::new()); let v2b = Arc::new(Virt2Boundary::new()); v2b.set( "0.0.0.0/0".parse().unwrap(), @@ -362,10 +367,16 @@ pub fn oxide_net_setup2( }], ); - let port = - oxide_net_builder(name, &converted_cfg, vpc_map.clone(), port_v2p, v2b) - .create(vpc_net, uft_limit, tcp_limit) - .unwrap(); + let port = oxide_net_builder( + name, + &converted_cfg, + vpc_map.clone(), + port_v2p, + m2p.clone(), + v2b, + ) + .create(vpc_net, uft_limit, tcp_limit) + .unwrap(); // Add router entry that allows the guest to send to other guests // on same subnet. @@ -378,34 +389,36 @@ pub fn oxide_net_setup2( .unwrap(); let vps = VpcPortState::new(); - let mut pav = PortAndVps { port, vps, vpc_map, cfg: converted_cfg }; + let mut pav = PortAndVps { port, vps, vpc_map, m2p, cfg: converted_cfg }; let mut updates = vec![ // * Epoch starts at 1, adding router entry bumps it to 2. "set:epoch=2", - // * Allow inbound IPv6 traffic for guest. - // * Allow inbound IPv4 traffic for guest. + // * Allow inbound IPv4 unicast traffic for guest. 
+ // * Allow inbound IPv4 multicast traffic for guest. + // * Allow inbound IPv6 unicast traffic for guest. + // * Allow inbound IPv6 multicast traffic for guest. // * Deny inbound NDP for guest. - "set:gateway.rules.in=3", + "set:gateway.rules.in=5", // IPv4 // ---- // // * ARP Gateway MAC addr // * ICMP Echo Reply for Gateway - // * DHCP Offer - // * DHCP Ack - // * Outbound traffic from Guest IP + MAC address + // * DHCP Discover → Offer hairpin + // * DHCP Request → Ack hairpin + // * Outbound no-spoof from Guest IP + MAC (allows unicast and multicast) // // IPv6 // ---- // - // * NDP NA for Gateway - // * NDP RA for Gateway - // * Deny all other NDP - // * ICMPv6 Echo Reply for Gateway from Guest Link-Local // * ICMPv6 Echo Reply for Gateway from Guest VPC ULA + // * ICMPv6 Echo Reply for Gateway from Guest Link-Local + // * NDP RA for Gateway + // * NDP NA for Gateway // * DHCPv6 - // * Outbound traffic from Guest IPv6 + MAC Address + // * Deny all other NDP + // * Outbound no-spoof from Guest IPv6 + MAC (allows unicast and multicast) "set:gateway.rules.out=12", // * Allow all outbound traffic "set:firewall.rules.out=0", @@ -429,11 +442,13 @@ pub fn oxide_net_setup2( }); updates.extend_from_slice(&[ + // * Multicast passthrough (handles both IPv4 and IPv6) // * Allow guest to route to own subnet - "set:router.rules.out=1", + "set:router.rules.out=2", // * Outbound encap // * Inbound decap - "set:overlay.rules.in=1, overlay.rules.out=1", + // * Inbound VNI validator (multicast) + "set:overlay.rules.in=2, overlay.rules.out=1", ]); if let Some(val) = custom_updates { diff --git a/lib/opte/README.adoc b/lib/opte/README.adoc index 3bf6fe79..c309a2f0 100644 --- a/lib/opte/README.adoc +++ b/lib/opte/README.adoc @@ -209,11 +209,74 @@ resources. Pausing, Saving, & Restoring:: A port may be paused, saved, and restored for the purpose of live migration. The pausing of a state allows it to halt all packet processing and quiesce to a steady state. 
-In this state is is then possible to save the port's state which has +In this state it is then possible to save the port's state which has all data needed to restart the port without rebuilding the entire flow state. This is achieved by restoring the port based on some payload of save data. +=== Multicast + +OPTE implements multicast consistent with the rack networking +architecture described in https://rfd.shared.oxide.computer/rfd/0063[RFD 63] +and https://rfd.shared.oxide.computer/rfd/488[RFD 488]. + +==== Fleet VNI + +All multicast traffic uses a single fleet‑level Geneve VNI +(`DEFAULT_MULTICAST_VNI`, currently `77`) rather than per‑tenant VNIs. +Mappings from overlay multicast groups to underlay multicast addresses +are stored and validated under this VNI. (See `RFD 488` for the rationale behind +fleet-level VNI.) + +==== Delivery Modes and Replication + +The `Replication` type is a Tx‑only instruction to switches encoded in the Oxide Geneve +multicast option as a 2‑bit field in the top two bits of the option body's first byte. +It tells the switch which ports to replicate the frame to on transmission. On Rx, OPTE +ignores the replication field and performs local same‑sled delivery based purely on +subscriptions. The replication mode is not an access control mechanism. + +OPTE always performs local same‑sled delivery for all replication modes and acts as a leaf: + +* _External_ replicates to ports set for external multicast traffic. Switch decaps + and replicates to front panel ports (egress to external networks, leaving the underlay). + OPTE does not create additional multicast copies for other sleds. +* _Underlay_ replicates to ports set for underlay multicast traffic. Switch replicates + to other sleds (using the underlay). The underlay network performs further replication + within the rack. +* _Both_ replicates to both port groups (bifurcated). Combines `External` and `Underlay`: + switch replicates to both front panel and underlay ports. 
+ +For all replication modes, OPTE routes to the next hop's unicast address to determine +reachability and underlay port/MAC. The packet destination (outer IPv6) is the multicast +address from M2P with multicast MAC (RFC 2464). All multicast uses fleet VNI 77. + +==== Encapsulation Path + +On Tx, the overlay layer encapsulates packets destined for multicast groups +with a Geneve multicast option initially set to `External` replication mode. +XDE's multicast Tx path (`xde_mc_tx`) first delivers the packet locally to +all other ports on the same sled that have subscribed to the multicast group +(within the same VNI), then consults the multicast forwarding table. + +For each next hop in the forwarding table, XDE creates a packet copy and updates its +Geneve multicast option to match that next hop's configured replication mode. +XDE routes to the next hop's unicast address (for all replication modes) to determine +reachability and which underlay port/MAC to use. The packet destination (outer IPv6) +is the multicast address from M2P with multicast MAC (RFC 2464). The Geneve replication +option serves as a Tx-only instruction telling switches which port groups to replicate to. + +==== Rx Behavior + +OPTE acts as a leaf node and does not relay multicast traffic received from the underlay. + +Constraints & Validation:: + +* M2P (multicast‑to‑physical) mappings must use `DEFAULT_MULTICAST_VNI`. +* Any next hop that causes underlay forwarding must specify VNI 77. +* Underlay multicast addresses must be IPv6 admin-local multicast (`ff04::/16`) + as defined in https://www.rfc-editor.org/rfc/rfc7346.html[RFC 7346]. 
+ === Layers The main function of the port is to process packets in a flow-based diff --git a/lib/opte/src/api.rs b/lib/opte/src/api.rs index d5d9431f..199fa8b5 100644 --- a/lib/opte/src/api.rs +++ b/lib/opte/src/api.rs @@ -278,3 +278,9 @@ pub type DumpLayerResp = opte_api::DumpLayerResp; pub type DumpUftResp = opte_api::DumpUftResp; pub type DumpTcpFlowsResp = opte_api::DumpTcpFlowsResp; pub type TcpFlowEntryDump = opte_api::TcpFlowEntryDump; + +// Implement ResourceEntry for MulticastUnderlay when the engine feature is enabled. +// This allows MulticastUnderlay to be used as a MappingResource::Entry in the +// Mcast2Phys table (see oxide-vpc/engine/overlay.rs). +#[cfg(feature = "engine")] +impl crate::engine::rule::ResourceEntry for MulticastUnderlay {} diff --git a/lib/opte/src/ddi/mblk.rs b/lib/opte/src/ddi/mblk.rs index 487507f9..e52742c2 100644 --- a/lib/opte/src/ddi/mblk.rs +++ b/lib/opte/src/ddi/mblk.rs @@ -16,6 +16,7 @@ use core::cmp::Ordering; use core::marker::PhantomData; use core::mem::ManuallyDrop; use core::mem::MaybeUninit; +use core::num::NonZeroUsize; use core::ops::Deref; use core::ops::DerefMut; use core::ptr; @@ -301,6 +302,68 @@ impl MsgBlk { out } + /// Copy the first `n` bytes of this packet into a new `mblk_t`, + /// increasing the refcount of all remaining segments. + /// + /// On non-kernel platforms this will simply clone the underlying packet + /// with the desired segmentation. + pub fn pullup( + &self, + n: Option, + ) -> Result { + let totlen = self.byte_len(); + + if let Some(n) = n + && n.get() > totlen + { + // The DDI function will bail out if this is the case, but + // we'll be none the wiser to *what* the failure mode was. + return Err(PktPullupError::TooLong); + } + + cfg_if! 
{ + if #[cfg(all(not(feature = "std"), not(test)))] { + let out = unsafe { + ddi::msgpullup( + self.0.as_ptr(), + n.map(|v| v.get() as isize).unwrap_or(-1), + ) + }; + + let mp = NonNull::new(out) + .ok_or(PktPullupError::AllocFailed)?; + + Ok(Self(mp)) + } else { + // We aren't (currently?) simulating refcount tracking at all + // in our userland mblk abstraction. + // Do the segmentation right, but otherwise it's fully cloned. + let to_ensure = n.map(|v| v.get()).unwrap_or(totlen); + let mut top_mblk = MsgBlk::new(to_ensure); + let mut still_to_write = to_ensure; + + for chunk in self.iter() { + let mut left_in_chunk = chunk.len(); + let to_take = chunk.len().min(still_to_write); + + if still_to_write != 0 { + top_mblk.write_bytes_back(&chunk[..to_take]) + .expect("to_take should be <= remaining capacity"); + } + + still_to_write -= to_take; + left_in_chunk -= to_take; + + if left_in_chunk != 0 { + top_mblk.append(MsgBlk::copy(&chunk[to_take..])); + } + } + + Ok(top_mblk) + } + } + } + /// Creates a new [`MsgBlk`] using a given set of packet headers. pub fn new_pkt(emit: impl Emit + EmitDoesNotRelyOnBufContents) -> Self { let mut pkt = Self::new(emit.packet_length()); @@ -1035,6 +1098,26 @@ impl core::fmt::Display for PktInfoError { } } +/// Reasons a [`MsgBlk`] could not be pulled up. +#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Hash)] +pub enum PktPullupError { + /// Requested pullup was longer than the underlying packet. + TooLong, + /// The OS was unable to allocate a [`MsgBlk`]. + AllocFailed, +} + +impl core::error::Error for PktPullupError {} + +impl core::fmt::Display for PktPullupError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.write_str(match self { + Self::TooLong => "requested pullup is longer than packet", + Self::AllocFailed => "failed to allocate an mblk_t", + }) + } +} + /// Counts the number of segments in an `mblk_t` from `head`, linked /// via `b_cont`. 
unsafe fn count_mblk_chain(mut head: Option>) -> usize { diff --git a/lib/opte/src/engine/geneve.rs b/lib/opte/src/engine/geneve.rs index 7f5e958e..382f8927 100644 --- a/lib/opte/src/engine/geneve.rs +++ b/lib/opte/src/engine/geneve.rs @@ -383,6 +383,15 @@ impl<'a, T: OptionCast<'a>> GeneveOptionParse<'a, T> { } } +impl<'a, T: OptionCast<'a>> HeaderLen for GeneveOptionParse<'a, T> { + const MINIMUM_LENGTH: usize = GeneveOpt::MINIMUM_LENGTH; + + fn packet_length(&self) -> usize { + // Option header (4 bytes) + body length (already padded to 4-byte boundary) + GeneveOpt::MINIMUM_LENGTH + self.body_remainder.len() + } +} + /// Marks whather a Geneve option has been successfuly interpreted as a known /// variant. pub enum Known { diff --git a/lib/opte/src/engine/predicate.rs b/lib/opte/src/engine/predicate.rs index 551f2179..4527efe9 100644 --- a/lib/opte/src/engine/predicate.rs +++ b/lib/opte/src/engine/predicate.rs @@ -97,12 +97,15 @@ impl Display for EtherTypeMatch { #[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] pub enum EtherAddrMatch { Exact(MacAddr), + /// Match any multicast/broadcast MAC address (LSB of first octet is 1). + Multicast, } impl EtherAddrMatch { fn matches(&self, flow_addr: MacAddr) -> bool { match self { EtherAddrMatch::Exact(addr) => flow_addr == *addr, + EtherAddrMatch::Multicast => flow_addr.is_group(), } } } @@ -113,6 +116,7 @@ impl Display for EtherAddrMatch { match self { Exact(addr) => write!(f, "{addr}"), + Multicast => write!(f, "multicast"), } } } diff --git a/lib/opte/src/lib.rs b/lib/opte/src/lib.rs index 6de57220..6c62d544 100644 --- a/lib/opte/src/lib.rs +++ b/lib/opte/src/lib.rs @@ -200,7 +200,7 @@ mod opte_provider { /// /// Logging levels are provided by [`LogLevel`]. These levels will map /// to the underlying provider with varying degrees of success. -pub trait LogProvider { +pub trait LogProvider: Send + Sync { /// Log a message at the specified level. 
fn log(&self, level: LogLevel, msg: &str); } diff --git a/lib/oxide-vpc/src/api.rs b/lib/oxide-vpc/src/api.rs index b1e82e62..8c67ec25 100644 --- a/lib/oxide-vpc/src/api.rs +++ b/lib/oxide-vpc/src/api.rs @@ -20,6 +20,70 @@ use serde::Deserialize; use serde::Serialize; use uuid::Uuid; +/// Tx-only instruction to switches for multicast packet replication. +/// +/// Tells the switch which port groups to replicate outbound multicast packets +/// to. It is a transmit-only setting - on Rx, OPTE ignores the replication +/// field and performs local same-sled delivery based purely on subscriptions. +/// The replication mode is not an access control mechanism. +/// +/// Routing vs replication: OPTE routes to the [`NextHopV6::addr`] (switch's +/// unicast address) for all modes to determine reachability and which underlay +/// port/MAC to use. +/// +/// The packet destination (outer IPv6) is the multicast address from M2P. This +/// [`Replication`] value tells the switch which port groups to replicate to. +/// +/// - `External`: Switch decaps and replicates to external-facing ports only +/// - `Underlay`: Switch replicates to underlay ports (other sleds) only +/// - `Both`: Switch replicates to both external and underlay ports (bifurcated) +/// +/// Encoding: The Geneve Oxide multicast option encodes the replication strategy +/// in the top 2 bits of the option body's first byte (u2). The remaining 30 +/// bits are reserved. +/// +/// Current implementation uses a single fleet VNI (DEFAULT_MULTICAST_VNI = 77) +/// for all multicast traffic rack-wide (RFD 488 "Multicast across VPCs"). +#[derive( + Clone, Copy, Debug, Default, Serialize, Deserialize, Eq, PartialEq, Hash, +)] +#[repr(u8)] +pub enum Replication { + /// Replicate packets to ports set for external multicast traffic. + /// + /// Switch decaps and replicates to front panel ports (egress to external + /// networks, leaving the underlay). 
+ #[default] + External = 0x00, + /// Replicate packets to ports set for underlay multicast traffic. + /// + /// Switch replicates to sleds (using the underlay). + Underlay = 0x01, + /// Replicate packets to ports set for underlay and external multicast traffic (bifurcated). + /// + /// Switch replicates to both front panel ports (egress to external networks) and sleds. + Both = 0x02, + /// Reserved for future use. This value exists to account for all possible + /// values in the 2-bit Geneve option field. + Reserved = 0x03, +} + +#[cfg(any(feature = "std", test))] +impl FromStr for Replication { + type Err = String; + + fn from_str(s: &str) -> Result { + match s.to_ascii_lowercase().as_str() { + "external" => Ok(Self::External), + "underlay" => Ok(Self::Underlay), + "both" => Ok(Self::Both), + lower => Err(format!( + "unexpected replication {lower} -- expected 'external', 'underlay', or 'both'" + )), + } + } +} + /// This is the MAC address that OPTE uses to act as the virtual gateway. pub const GW_MAC_ADDR: MacAddr = MacAddr::from_const([0xA8, 0x40, 0x25, 0xFF, 0x77, 0x77]); @@ -27,6 +91,19 @@ pub const GW_MAC_ADDR: MacAddr = /// tunnel endpoint. pub const BOUNDARY_SERVICES_VNI: u32 = 99u32; +/// Default VNI for rack-wide multicast groups (no VPC association). +/// Must match Omicron's DEFAULT_MULTICAST_VNI. +/// +/// This is the only VNI currently supported for multicast traffic. +/// All multicast groups (M2P mappings and forwarding entries) must use this VNI. +/// OPTE validates that multicast operations specify this VNI and rejects others. +/// +/// While M2P (Multicast-to-Physical) mappings are stored +/// per-VNI in the code, the enforcement of DEFAULT_MULTICAST_VNI means all +/// multicast traffic shares a single namespace across the rack, with no +/// VPC-level isolation (as multicast groups are fleet-wide) *as of now*. 
+pub const DEFAULT_MULTICAST_VNI: u32 = 77u32; + /// Description of Boundary Services, the endpoint used to route traffic /// to external networks. // @@ -303,6 +380,34 @@ pub struct PhysNet { pub vni: Vni, } +/// Represents an IPv6 next hop for multicast forwarding. +/// +/// OPTE routes to [`NextHopV6::addr`] (the switch's unicast address) for all +/// replication modes to determine reachability and which underlay port/MAC to +/// use. The packet destination (outer IPv6) is always the multicast address +/// from M2P. The associated [`Replication`] mode is a Tx-only instruction +/// telling the switch which port groups to replicate to on transmission. +/// Routing is always to the unicast next hop. +#[derive( + Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq, PartialOrd, Ord, +)] +pub struct NextHopV6 { + /// The unicast IPv6 address of the switch endpoint (for routing). + /// This determines which underlay port and source MAC to use. + /// The actual packet destination (outer IPv6) is the multicast address. + pub addr: Ipv6Addr, + /// The VNI to use for Geneve encapsulation. + /// Currently must be DEFAULT_MULTICAST_VNI (77). + /// Future: could support per-VPC VNIs for multicast isolation. + pub vni: Vni, +} + +impl NextHopV6 { + pub fn new(addr: Ipv6Addr, vni: Vni) -> Self { + Self { addr, vni } + } +} + /// A Geneve tunnel endpoint. #[derive(Clone, Copy, Debug, Deserialize, Serialize)] pub struct TunnelEndpoint { @@ -432,7 +537,7 @@ impl Display for RouterTarget { pub enum RouterClass { /// The rule belongs to the shared VPC-wide router. System, - /// The rule belongs to the subnet-specific router, and has precendence + /// The rule belongs to the subnet-specific router, and has precedence /// over a `System` rule of equal priority. Custom, } @@ -565,6 +670,38 @@ pub struct ClearVirt2PhysReq { pub phys: PhysNet, } +/// Set mapping from (overlay) multicast group to underlay multicast address. 
+/// +/// Creates a multicast group fleet-wide by mapping an overlay multicast address +/// to an underlay IPv6 multicast address. Ports can then join via `subscribe()`. +/// The M2P mapping is the source of truth - if it exists, the group exists. +/// +/// Ports join and leave with `subscribe()` and `unsubscribe()`, which look up +/// the underlay address via this M2P mapping. Without the mapping, `subscribe()` +/// fails (can't look up underlay), but `unsubscribe()` succeeds +/// (group gone => not subscribed). +/// +/// This handles cleanup races where the control plane deletes the group before +/// sleds finish unsubscribing ports. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SetMcast2PhysReq { + /// Overlay multicast group address + pub group: IpAddr, + /// Underlay IPv6 multicast address (must be admin-scoped ff04::/16) + pub underlay: MulticastUnderlay, +} + +/// Clear a mapping from multicast group to underlay multicast address. +/// +/// All multicast groups use DEFAULT_MULTICAST_VNI (77) for fleet-wide multicast. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct ClearMcast2PhysReq { + /// Overlay multicast group address + pub group: IpAddr, + /// Underlay IPv6 multicast address (must be admin-scoped ff04::/16) + pub underlay: MulticastUnderlay, +} + /// Set a mapping from a VPC IP to boundary tunnel endpoint destination. #[derive(Clone, Debug, Deserialize, Serialize)] pub struct SetVirt2BoundaryReq { @@ -605,8 +742,101 @@ pub enum DelRouterEntryResp { NotFound, } +/// Set multicast forwarding entries for an underlay multicast group. +/// +/// Configures how OPTE forwards multicast packets for a specific underlay group. +/// The forwarding table maps underlay multicast addresses to switch endpoints +/// and Tx-only replication instructions. +/// +/// Routing vs destination: OPTE routes to [`NextHopV6::addr`] (switch's unicast +/// address) to determine reachability and which underlay port/MAC to use. 
The +/// packet is sent to the multicast address (`underlay`) with multicast MAC. The +/// switch uses the multicast destination and Geneve [`Replication`] tag +/// to determine which port groups to replicate to on transmission. +/// +/// Fleet-wide multicast: All multicast uses DEFAULT_MULTICAST_VNI (77) +/// currently. The VNI in NextHopV6 must be 77 - other values are rejected. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SetMcastForwardingReq { + /// The underlay IPv6 multicast address (outer IPv6 dst in transmitted packets) + /// Must be admin-scoped ff04::/16 + pub underlay: MulticastUnderlay, + /// Switch endpoints and Tx-only replication instructions. + /// Each NextHopV6.addr is the unicast IPv6 of a switch (for routing). + /// The Replication is a Tx-only instruction indicating which port groups + /// the switch should use. + pub next_hops: Vec<(NextHopV6, Replication)>, +} + +/// Clear multicast forwarding entries for an underlay multicast group. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct ClearMcastForwardingReq { + /// The underlay IPv6 multicast address (must be admin-scoped ff04::/16) + pub underlay: MulticastUnderlay, +} + +/// Response for dumping the multicast forwarding table. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct DumpMcastForwardingResp { + /// The multicast forwarding table entries + pub entries: Vec, +} + +impl CmdOk for DumpMcastForwardingResp {} + +/// A single multicast forwarding table entry. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct McastForwardingEntry { + /// The underlay IPv6 multicast address (admin-scoped ff04::/16) + pub underlay: MulticastUnderlay, + /// The next hops (underlay IPv6 addresses) with Tx-only replication instructions + pub next_hops: Vec<(NextHopV6, Replication)>, +} + impl opte::api::cmd::CmdOk for DelRouterEntryResp {} +/// Response for dumping the multicast subscription table (group -> ports). 
+#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct DumpMcastSubscriptionsResp { + pub entries: Vec, +} + +impl CmdOk for DumpMcastSubscriptionsResp {} + +/// A single multicast subscription entry. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct McastSubscriptionEntry { + /// The underlay IPv6 multicast address (admin-scoped ff04::/16, subscription key) + pub underlay: MulticastUnderlay, + /// Port names subscribed to this group on this sled + pub ports: Vec, +} + +/// Subscribe a port to a multicast group. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct McastSubscribeReq { + /// The port name to subscribe + pub port_name: String, + /// The multicast group address + pub group: IpAddr, +} + +/// Unsubscribe a port from a multicast group. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct McastUnsubscribeReq { + /// The port name to unsubscribe + pub port_name: String, + /// The multicast group address + pub group: IpAddr, +} + +/// Unsubscribe all ports from a multicast group. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct McastUnsubscribeAllReq { + /// The multicast group address + pub group: IpAddr, +} + #[derive(Clone, Debug, Deserialize, Serialize)] pub struct SetExternalIpsReq { pub port_name: String, diff --git a/lib/oxide-vpc/src/engine/gateway/mod.rs b/lib/oxide-vpc/src/engine/gateway/mod.rs index eb2c3b44..b3ad7d4a 100644 --- a/lib/oxide-vpc/src/engine/gateway/mod.rs +++ b/lib/oxide-vpc/src/engine/gateway/mod.rs @@ -39,6 +39,21 @@ //! # Link-Local IPv6 //! //! No IPv6 link-local traffic should ever make it past this layer. +//! +//! # Multicast Traffic +//! +//! The gateway layer allows both unicast and multicast traffic through +//! the no-spoof rules (outbound) and separate inbound rules: +//! +//! - Outbound: The no-spoof rule matches on source IP/MAC but has no +//! destination IP predicate, so it permits multicast destinations. This +//! 
allows guests to send to any multicast group address at the gateway +//! layer. However, the overlay layer enforces M2P (Multicast-to-Physical) +//! mappings, denying packets for unconfigured multicast groups. +//! +//! - Inbound: Separate rules (IPv4 224.0.0.0/4 and IPv6 ff00::/8) +//! allow multicast packets to reach guests and rewrite the source MAC +//! to the gateway MAC, similar to unicast traffic. use crate::api::DhcpCfg; use crate::api::MacAddr; @@ -56,6 +71,8 @@ use opte::api::Direction; use opte::api::OpteError; use opte::engine::ether::EtherMod; use opte::engine::headers::HeaderAction; +use opte::engine::ip::v4::Ipv4Cidr; +use opte::engine::ip::v6::Ipv6Cidr; use opte::engine::layer::DefaultAction; use opte::engine::layer::Layer; use opte::engine::layer::LayerActions; @@ -173,6 +190,16 @@ fn setup_ipv4( let vpc_meta = Arc::new(VpcMeta::new(vpc_mappings)); + // Outbound no-spoof rule: only allow traffic from the guest's IP and MAC. + // This rule has no destination IP predicate, so it matches both unicast + // and multicast destinations, enforcing no-spoof for all outbound traffic. + // + // NOTE: Because this gateway rule is unconditional on destination IP, guests + // can send to any multicast group address. The overlay layer enforces M2P + // mappings and underlay address validation, so guests cannot send multicast + // unless the group is configured. In the future, we may want to explicitly + // filter outbound multicast to only the groups configured via M2P to further + // tighten spoof prevention at the gateway layer. 
let mut nospoof_out = Rule::new(1000, Action::Meta(vpc_meta)); nospoof_out.add_predicate(Predicate::InnerSrcIp4(vec![ Ipv4AddrMatch::Exact(ip_cfg.private_ip), @@ -196,6 +223,22 @@ fn setup_ipv4( ])); layer.add_rule(Direction::In, unicast_in.finalize()); + // Inbound IPv4 multicast - rewrite source MAC to gateway and allow + let ipv4_mcast = vec![Ipv4AddrMatch::Prefix(Ipv4Cidr::MCAST)]; + // This mirrors the IPv6 multicast inbound rule to ensure multicast + // delivery to guests is permitted by the gateway layer. + let mut mcast_in_v4 = Rule::new( + 1001, + Action::Static(Arc::new(RewriteSrcMac { + gateway_mac: cfg.gateway_mac, + })), + ); + mcast_in_v4.add_predicate(Predicate::InnerDstIp4(ipv4_mcast)); + mcast_in_v4.add_predicate(Predicate::InnerEtherDst(vec![ + EtherAddrMatch::Multicast, + ])); + layer.add_rule(Direction::In, mcast_in_v4.finalize()); + Ok(()) } @@ -209,6 +252,17 @@ fn setup_ipv6( icmpv6::setup(layer, cfg, ip_cfg)?; dhcpv6::setup(layer, cfg, dhcp_cfg)?; let vpc_meta = Arc::new(VpcMeta::new(vpc_mappings)); + + // Outbound no-spoof rule: only allow traffic from the guest's IP and MAC. + // This rule has no destination IP predicate, so it matches both unicast + // and multicast destinations, enforcing no-spoof for all outbound traffic. + // + // NOTE: Because this gateway rule is unconditional on destination IP, guests + // can send to any multicast group address. The overlay layer enforces M2P + // mappings and underlay address validation, so guests cannot send multicast + // unless the group is configured. In the future, we may want to explicitly + // filter outbound multicast to only the groups configured via M2P to further + // tighten spoof prevention at the gateway layer. 
let mut nospoof_out = Rule::new(1000, Action::Meta(vpc_meta)); nospoof_out.add_predicate(Predicate::InnerSrcIp6(vec![ Ipv6AddrMatch::Exact(ip_cfg.private_ip), @@ -232,6 +286,20 @@ fn setup_ipv6( ])); layer.add_rule(Direction::In, unicast_in.finalize()); + // Inbound IPv6 multicast - rewrite source MAC to gateway and allow + let ipv6_mcast = vec![Ipv6AddrMatch::Prefix(Ipv6Cidr::MCAST)]; + let mut mcast_in = Rule::new( + 1001, + Action::Static(Arc::new(RewriteSrcMac { + gateway_mac: cfg.gateway_mac, + })), + ); + mcast_in.add_predicate(Predicate::InnerDstIp6(ipv6_mcast)); + mcast_in.add_predicate(Predicate::InnerEtherDst(vec![ + EtherAddrMatch::Multicast, + ])); + layer.add_rule(Direction::In, mcast_in.finalize()); + Ok(()) } diff --git a/lib/oxide-vpc/src/engine/geneve.rs b/lib/oxide-vpc/src/engine/geneve.rs index f22ed8c6..1136095a 100644 --- a/lib/oxide-vpc/src/engine/geneve.rs +++ b/lib/oxide-vpc/src/engine/geneve.rs @@ -5,7 +5,80 @@ // Copyright 2025 Oxide Computer Company //! Geneve option types specific to the Oxide VPC dataplane. - +//! +//! # Oxide Geneve Options +//! +//! This module defines Geneve options used in the Oxide rack network to carry +//! VPC-specific metadata during packet encapsulation. All options use the Oxide +//! option class (`GENEVE_OPT_CLASS_OXIDE` = 0x0129). +//! +//! ## Option Types +//! +//! - **External** (0x00): Indicates a packet originated from outside the rack +//! and was encapsulated by the switch NAT ingress path with Geneve wrapping. +//! - **Multicast** (0x01): Carries multicast replication strategy as a 2-bit +//! field for coordinating delivery between OPTE and sidecar switch logic. +//! - **Mss** (0x02): Carries original TCP MSS for MSS clamping/boosting to +//! prevent MTU issues during underlay encapsulation. +//! +//! ## Multicast Option Encoding +//! +//! The multicast option uses a compact 2-bit encoding aligned with sidecar.p4's +//! processing constraints: +//! +//! ```text +//! Option body (4 bytes): +//! 
┌──────────┬────────────────────────────┐ +//! │ Bits 7-6 │ Bits 5-0 + remaining bytes │ +//! │ (u2) │ (reserved, must be 0) │ +//! └──────────┴────────────────────────────┘ +//! │ +//! └─> Replication mode: +//! 00 = External (front panel/customer ports, traffic leaving rack) +//! 01 = Underlay (infrastructure forwarding to other sleds) +//! 10 = Both (both External and Underlay) +//! 11 = Reserved +//! ``` +//! +//! ### Replication Semantics (Tx-only instruction) +//! +//! The [`Replication`] type is a Tx-only instruction telling the switch which +//! port groups to replicate outbound multicast packets to. On Rx, OPTE ignores +//! the replication field and performs local same-sled delivery based purely on +//! subscriptions. +//! +//! OPTE routes to next hop unicast address (for ALL modes) to determine +//! reachability and underlay port/MAC. Packet destination is multicast +//! ff04::/16 with multicast MAC. +//! +//! - **External**: Switch decaps and replicates to external-facing ports (front panel) +//! - **Underlay**: Switch replicates to underlay ports (other sleds) +//! - **Both**: Switch replicates to both external and underlay port groups (bifurcated) +//! - **Local same-sled delivery**: Always happens regardless of the replication setting. +//! Not an access control mechanism - local delivery is independent of replication mode. +//! +//! All multicast packets are encapsulated with fleet VNI 77 (`DEFAULT_MULTICAST_VNI`) +//! regardless of replication mode. The replication mode determines delivery behavior, +//! not VNI selection. +//! +//! The 2-bit encoding allows extraction in P4 programs and aligns with the +//! sidecar pipeline's tag-based routing decisions. +//! +//! [`Replication`]: crate::api::Replication +//! +//! ## Option Length Encoding +//! +//! Geneve has two length fields to consider (both measured in 4-byte words): +//! - Geneve header `opt_len` (6 bits): total size of the options area +//! (sums each option's 4-byte header + body). +//! 
- Option header `len` (5 bits): size of that option's body only. +//! +//! For Oxide options used here: +//! - External: geneve opt_len += 1; option len = 0 +//! - Multicast: geneve opt_len += 2; option len = 1 +//! - MSS: geneve opt_len += 2; option len = 1 + +use crate::api::Replication; use ingot::geneve::GeneveFlags; use ingot::geneve::GeneveRef; use ingot::geneve::ValidGeneve; @@ -84,6 +157,7 @@ impl<'a> OptionCast<'a> for ValidOxideOption<'a> { } } +/// Geneve multicast option body carrying replication information. #[derive(Debug, Clone, Ingot, Eq, PartialEq)] #[ingot(impl_default)] pub struct MulticastInfo { @@ -92,20 +166,6 @@ pub struct MulticastInfo { rsvd: u30be, } -#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Hash)] -#[repr(u8)] -pub enum Replication { - /// Replicate packets to ports set for external multicast traffic. - #[default] - External = 0x00, - /// Replicate packets to ports set for underlay multicast traffic. - Underlay, - /// Replicate packets to ports set for underlay and external multicast - /// traffic (bifurcated). - All, - Reserved, -} - impl NetworkRepr for Replication { fn to_network(self) -> u2 { self as u8 @@ -116,9 +176,9 @@ impl NetworkRepr for Replication { match val { 0 => Replication::External, 1 => Replication::Underlay, - 2 => Replication::All, + 2 => Replication::Both, 3 => Replication::Reserved, - _ => panic!("outside bounds of u2"), + _ => unreachable!("u2 value out of range: {val}"), } } } @@ -157,6 +217,40 @@ pub fn validate_options( Ok(()) } +/// Extract multicast replication info from Geneve options. +/// +/// Treats Reserved (value 3) as invalid and returns None, implementing +/// fail-closed behavior. +/// +/// This function silently skips options with parse errors (e.g., `TooSmall`). +/// Call `validate_options()` first if you want parse errors surfaced and +/// RFC 8926 critical option semantics enforced. This function assumes +/// validation has already been performed. 
+pub fn extract_multicast_replication( + pkt: &ValidGeneve, +) -> Option { + // In debug builds, verify validate_options() was called first if critical options present + debug_assert!( + !pkt.flags().contains(GeneveFlags::CRITICAL_OPTS) + || validate_options(pkt).is_ok(), + "extract_multicast_replication() called without prior validation when critical options present" + ); + + for opt in OxideOptions::from_raw(pkt) { + let Ok(opt) = opt else { continue }; + if let Some(ValidOxideOption::Multicast(mc_info)) = opt.option.known() { + let repl = mc_info.version(); + // Filter out Reserved (u2=3). This value exists in the 2-bit space + // but is not used by sidecar P4; treat as invalid. + if matches!(repl, Replication::Reserved) { + return None; + } + return Some(repl); + } + } + None +} + #[cfg(test)] pub fn valid_geneve_has_oxide_external( pkt: &ValidGeneve, @@ -177,9 +271,14 @@ pub fn valid_geneve_has_oxide_external( #[cfg(test)] mod test { use super::*; + use alloc::vec::Vec; use ingot::types::HeaderParse; use ingot::udp::ValidUdp; + /// Critical bit mask for Geneve option type field (bit 7). + /// Per RFC 8926, unknown options with this bit set must cause packet drop. + const GENEVE_OPT_TYPE_CRITICAL: u8 = 0x80; + #[test] fn parse_single_opt() { // Create a packet with one extension header. @@ -201,7 +300,6 @@ mod test { 0x65, 0x58, // vni + reserved 0x00, 0x04, 0xD2, 0x00, - // option class 0x01, 0x29, // crt + type @@ -219,6 +317,57 @@ mod test { assert!(valid_geneve_has_oxide_external(&geneve)); } + #[test] + fn parse_multicast_replication_values() { + // Build a minimal UDP+Geneve packet with one Oxide multicast option + // Body's first byte top-2 bits carry Replication. 
+ fn build_buf(rep: Replication) -> Vec { + #[rustfmt::skip] + let mut buf = vec![ + // UDP source + 0x1E, 0x61, + // UDP dest + 0x17, 0xC1, + // UDP length (8 UDP hdr + 8 Geneve hdr + 4 opt hdr + 4 opt body = 24 = 0x18) + 0x00, 0x18, + // UDP csum + 0x00, 0x00, + // Geneve: ver + opt len (2 words = 8 bytes: 4 opt hdr + 4 opt body) + 0x02, + // Geneve flags + 0x00, + // Geneve proto + 0x65, 0x58, + // Geneve vni + reserved + 0x00, 0x00, 0x00, 0x00, + // Geneve option: class 0x0129 (Oxide) + 0x01, 0x29, + // Geneve option: flags+type (non-critical, Multicast = 0x01) + 0x01, + // Geneve option: rsvd + len (1 word = 4 bytes body) + 0x01, + ]; + // Geneve option body: 4-byte body with replication in top 2 bits + buf.push((rep as u8) << 6); + buf.extend_from_slice(&[0x00, 0x00, 0x00]); + buf + } + + for (rep, expect) in [ + (Replication::External, Replication::External), + (Replication::Underlay, Replication::Underlay), + (Replication::Both, Replication::Both), + ] { + let buf = build_buf(rep); + let (.., rem) = ValidUdp::parse(&buf[..]).unwrap(); + let (geneve, ..) 
= ValidGeneve::parse(rem).unwrap(); + validate_options(&geneve).unwrap(); + + let got = extract_multicast_replication(&geneve).unwrap(); + assert_eq!(got, expect); + } + } + #[test] fn unknown_crit_option_fails() { // Create a packet with one extension header with the critical @@ -242,11 +391,10 @@ mod test { 0x65, 0x58, // vni + reserved 0x00, 0x04, 0xD2, 0x00, - // experimenter option class 0xff, 0xff, // crt + type - 0x80, + GENEVE_OPT_TYPE_CRITICAL, // rsvd + len 0x00, ]; @@ -281,11 +429,10 @@ mod test { 0x65, 0x58, // vni + reserved 0x00, 0x04, 0xD2, 0x00, - // experimenter option class 0x01, 0x29, // crt + type - 0x80, + GENEVE_OPT_TYPE_CRITICAL, // rsvd + len 0x00, ]; @@ -314,8 +461,8 @@ mod test { 0x1E, 0x61, // dest 0x17, 0xC1, - // length - 0x00, 0x1c, + // length (8 UDP hdr + 8 Geneve hdr + 20 options = 36 = 0x24) + 0x00, 0x24, // csum 0x00, 0x00, // ver + opt len @@ -326,14 +473,12 @@ mod test { 0x65, 0x58, // vni + reserved 0x00, 0x04, 0xD2, 0x00, - // option class 0x01, 0x29, // crt + type 0x00, // rsvd + len 0x00, - // experimenter option class 0xff, 0xff, // crt + type @@ -342,7 +487,6 @@ mod test { 0x01, // body 0x00, 0x00, 0x00, 0x00, - // experimenter option class 0xff, 0xff, // crt + type diff --git a/lib/oxide-vpc/src/engine/overlay.rs b/lib/oxide-vpc/src/engine/overlay.rs index 5149416a..a2c8175d 100644 --- a/lib/oxide-vpc/src/engine/overlay.rs +++ b/lib/oxide-vpc/src/engine/overlay.rs @@ -9,10 +9,12 @@ //! This implements the Oxide Network VPC Overlay. 
use super::geneve::OxideOptions; use super::router::RouterTargetInternal; +use crate::api::DEFAULT_MULTICAST_VNI; use crate::api::DumpVirt2BoundaryResp; use crate::api::DumpVirt2PhysResp; use crate::api::GuestPhysAddr; use crate::api::PhysNet; +use crate::api::Replication; use crate::api::TunnelEndpoint; use crate::api::V2bMapResp; use crate::api::VpcMapResp; @@ -30,6 +32,7 @@ use opte::api::Direction; use opte::api::Ipv4Addr; use opte::api::Ipv4Cidr; use opte::api::MacAddr; +use opte::api::MulticastUnderlay; use opte::api::OpteError; use opte::ddi::sync::KMutex; use opte::ddi::sync::KMutexGuard; @@ -69,6 +72,8 @@ use opte::engine::rule::GenHtError; use opte::engine::rule::GenHtResult; use opte::engine::rule::HdrTransform; use opte::engine::rule::MappingResource; +use opte::engine::rule::MetaAction; +use opte::engine::rule::ModMetaResult; use opte::engine::rule::Resource; use opte::engine::rule::ResourceEntry; use opte::engine::rule::Rule; @@ -81,6 +86,7 @@ pub fn setup( pb: &PortBuilder, cfg: &VpcCfg, v2p: Arc, + m2p: Arc, v2b: Arc, ft_limit: core::num::NonZeroU32, ) -> core::result::Result<(), OpteError> { @@ -89,24 +95,38 @@ pub fn setup( cfg.phys_ip, cfg.vni, v2p, + m2p, v2b, ))); // Action Index 1 let decap = Action::Static(Arc::new(DecapAction::new())); + // Action Index 2 - Multicast VNI validator + let vni_validator = + Action::Meta(Arc::new(MulticastVniValidator::new(cfg.vni))); + let actions = LayerActions { - actions: vec![encap, decap], + actions: vec![encap, decap, vni_validator], default_in: DefaultAction::Deny, default_out: DefaultAction::Deny, }; let mut layer = Layer::new(OVERLAY_LAYER_NAME, pb.name(), actions, ft_limit); + + // Outbound: encapsulation (priority 1) let encap_rule = Rule::match_any(1, layer.action(0).unwrap()); layer.add_rule(Direction::Out, encap_rule); + + // Inbound: decapsulation (priority 1 - runs first, sets ACTION_META_VNI) let decap_rule = Rule::match_any(1, layer.action(1).unwrap()); layer.add_rule(Direction::In, 
decap_rule); + + // Inbound: VNI validation (priority 2 - runs after decap) + let vni_check_rule = Rule::match_any(2, layer.action(2).unwrap()); + layer.add_rule(Direction::In, vni_check_rule); + // NOTE The First/Last positions cannot fail; perhaps I should // improve the API to avoid the unwrap(). pb.add_layer(layer, Pos::Last) @@ -183,6 +203,7 @@ pub struct EncapAction { phys_ip_src: Ipv6Addr, vni: Vni, v2p: Arc, + m2p: Arc, v2b: Arc, } @@ -191,9 +212,10 @@ impl EncapAction { phys_ip_src: Ipv6Addr, vni: Vni, v2p: Arc, + m2p: Arc, v2b: Arc, ) -> Self { - Self { phys_ip_src, vni, v2p, v2b } + Self { phys_ip_src, vni, v2p, m2p, v2b } } } @@ -213,110 +235,144 @@ impl StaticAction for EncapAction { action_meta: &mut ActionMeta, ) -> GenHtResult { let f_hash = flow_id.crc32(); - - // The router layer determines a RouterTarget and stores it in - // the meta map. We need to map this virtual target to a - // physical one. - let target_str = match action_meta.get(RouterTargetInternal::IP_KEY) { - Some(val) => val, - None => { - // This should never happen. The router should always - // write an entry. However, we currently have no way - // to enforce this in the type system, and thus must - // account for this situation. - return Err(GenHtError::Unexpected { - msg: "no RouterTarget metadata entry found".to_string(), - }); + let dst_ip = flow_id.dst_ip(); + + // Multicast traffic is detected by checking if the inner + // destination IP is a multicast address. Multicast operates at the fleet + // level (cross-VPC) and doesn't go through VPC routing, so router + // metadata is not required in that case. + let is_mcast_addr = dst_ip.is_multicast(); + + let (is_internal, phys_target, is_mcast) = if is_mcast_addr { + // Multicast traffic: use M2P mapping to get the multicast underlay address. + // Fleet-level multicast mappings are stored in the dedicated `m2p`. 
+ match self.m2p.get(&dst_ip) { + Some(underlay) => ( + true, + PhysNet { + // Outer MAC filled in by XDE + ether: MacAddr::ZERO, + ip: underlay.addr(), + vni: Vni::new(DEFAULT_MULTICAST_VNI).unwrap(), + }, + true, + ), + None => { + // No M2P mapping configured for this multicast group; deny. + return Ok(AllowOrDeny::Deny); + } } - }; + } else { + // Non-multicast traffic: process through router target. + + // The router layer determines a RouterTarget and stores it in + // the meta map. We need to map this virtual target to a + // physical one. + let target_str = match action_meta.get(RouterTargetInternal::IP_KEY) + { + Some(val) => val, + None => { + return Err(GenHtError::Unexpected { + msg: "no RouterTarget metadata entry found".to_string(), + }); + } + }; - let target = match RouterTargetInternal::from_meta(target_str) { - Ok(val) => val, - Err(e) => { - return Err(GenHtError::Unexpected { + let target = RouterTargetInternal::from_meta(target_str).map_err( + |e| GenHtError::Unexpected { msg: format!( "failed to parse metadata entry '{target_str}': {e}", ), - }); - } - }; + }, + )?; + + match target { + RouterTargetInternal::InternetGateway(_) => { + match self.v2b.get(&dst_ip) { + Some(phys) => { + // Hash the packet onto a route target. This is a very + // rudimentary mechanism. Should level-up to an ECMP + // algorithm with well known statistical properties. + let hash = f_hash as usize; + let target = + match phys.iter().nth(hash % phys.len()) { + Some(target) => target, + None => return Ok(AllowOrDeny::Deny), + }; + ( + false, + PhysNet { + ether: MacAddr::from(TUNNEL_ENDPOINT_MAC), + ip: target.ip, + vni: target.vni, + }, + false, + ) + } + None => return Ok(AllowOrDeny::Deny), + } + } - let (is_internal, phys_target) = match target { - RouterTargetInternal::InternetGateway(_) => { - match self.v2b.get(&flow_id.dst_ip()) { - Some(phys) => { - // Hash the packet onto a route target. This is a very - // rudimentary mechanism. 
Should level-up to an ECMP - // algorithm with well known statistical properties. - let hash = f_hash as usize; - let target = match phys.iter().nth(hash % phys.len()) { - Some(target) => target, - None => return Ok(AllowOrDeny::Deny), - }; - ( - false, + RouterTargetInternal::Ip(virt_ip) => { + match self.v2p.get(&virt_ip) { + Some(phys) => ( + true, PhysNet { - ether: MacAddr::from(TUNNEL_ENDPOINT_MAC), - ip: target.ip, - vni: target.vni, + ether: phys.ether, + ip: phys.ip, + vni: self.vni, }, - ) + false, + ), + + // The router target has specified a VPC IP we do not + // currently know about; this could be for two + // reasons: + // + // 1. No such IP currently exists in the guest's VPC. + // + // 2. The destination IP exists in the guest's VPC, + // but we do not yet have a mapping for it. + // + // We cannot differentiate these cases from the point + // of view of this code without more information from + // the control plane; rather we drop the packet. If we + // are dealing with scenario (2), the control plane + // should eventually provide us with a mapping. + None => return Ok(AllowOrDeny::Deny), } - None => return Ok(AllowOrDeny::Deny), } - } - - RouterTargetInternal::Ip(virt_ip) => match self.v2p.get(&virt_ip) { - Some(phys) => ( - true, - PhysNet { ether: phys.ether, ip: phys.ip, vni: self.vni }, - ), - - // The router target has specified a VPC IP we do not - // currently know about; this could be for two - // reasons: - // - // 1. No such IP currently exists in the guest's VPC. - // - // 2. The destination IP exists in the guest's VPC, - // but we do not yet have a mapping for it. - // - // We cannot differentiate these cases from the point - // of view of this code without more information from - // the control plane; rather we drop the packet. If we - // are dealing with scenario (2), the control plane - // should eventually provide us with a mapping. 
- None => return Ok(AllowOrDeny::Deny), - }, - - RouterTargetInternal::VpcSubnet(_) => { - match self.v2p.get(&flow_id.dst_ip()) { - Some(phys) => ( - true, - PhysNet { - ether: phys.ether, - ip: phys.ip, - vni: self.vni, - }, - ), - // The guest is attempting to contact a VPC IP we - // do not currently know about; this could be for - // two reasons: - // - // 1. No such IP currently exists in the guest's VPC. - // - // 2. The destination IP exists in the guest's - // VPC, but we do not yet have a mapping for - // it. - // - // We cannot differentiate these cases from the - // point of view of this code without more - // information from the control plane; rather we - // drop the packet. If we are dealing with - // scenario (2), the control plane should - // eventually provide us with a mapping. - None => return Ok(AllowOrDeny::Deny), + RouterTargetInternal::VpcSubnet(_) => { + match self.v2p.get(&flow_id.dst_ip()) { + Some(phys) => ( + true, + PhysNet { + ether: phys.ether, + ip: phys.ip, + vni: self.vni, + }, + false, + ), + + // The guest is attempting to contact a VPC IP we + // do not currently know about; this could be for + // two reasons: + // + // 1. No such IP currently exists in the guest's VPC. + // + // 2. The destination IP exists in the guest's + // VPC, but we do not yet have a mapping for + // it. + // + // We cannot differentiate these cases from the + // point of view of this code without more + // information from the control plane; rather we + // drop the packet. If we are dealing with + // scenario (2), the control plane should + // eventually provide us with a mapping. + None => return Ok(AllowOrDeny::Deny), + } } } }; @@ -330,25 +386,61 @@ impl StaticAction for EncapAction { data: Cow::Borrowed(GENEVE_MSS_SIZE_OPT_BODY), }; + // For multicast originated from this host, we seed the multicast Geneve + // option with `External` replication. 
XDE will then select the actual + // replication per next hop based on the rack-wide forwarding table + // (mcast_fwd), which tells the switch which ports to replicate to + // (external, underlay, or bifurcated). + // + // Local same-sled delivery to subscribed guests is always performed by + // OPTE, independent of the replication mode (not an access control mechanism). + // + // The first byte encodes Replication in the top 2 bits: + // External=0x00, Underlay=0x40, Both=0x80, Reserved=0xC0 + const REPLICATION_EXTERNAL_BYTE: u8 = + (Replication::External as u8) << 6; + static GENEVE_MCAST_OPT_BODY: &[u8] = &[ + REPLICATION_EXTERNAL_BYTE, // Top 2 bits encode replication strategy + 0x00, + 0x00, + 0x00, // Reserved bytes + ]; + static GENEVE_MCAST_OPT: ArbitraryGeneveOption = + ArbitraryGeneveOption { + option_class: GENEVE_OPT_CLASS_OXIDE, + option_type: OxideOptionType::Multicast as u8, + data: Cow::Borrowed(GENEVE_MCAST_OPT_BODY), + }; + + // For multicast, derive the outer MAC from the IPv6 address per RFC 2464. + // For unicast, XDE fills in the MAC via routing table lookup. + let outer_mac = if is_mcast { + phys_target.ip.unchecked_multicast_mac() + } else { + MacAddr::ZERO + }; + let tfrm = HdrTransform { name: ENCAP_NAME.to_string(), // We leave the outer src/dst up to the driver. + // In the multicast case we can, however, derive this. outer_ether: HeaderAction::Push( Valid::validated(EtherMeta { + dst: outer_mac, src: MacAddr::ZERO, - dst: MacAddr::ZERO, ether_type: EtherType::Ipv6, }) .expect("Ethernet validation is infallible"), ), - outer_ip: HeaderAction::Push(Valid::validated(IpPush::from( - Ipv6Push { + outer_ip: HeaderAction::Push({ + let ip_push = IpPush::from(Ipv6Push { src: self.phys_ip_src, dst: phys_target.ip, proto: Protocol::UDP, exts: Cow::Borrowed(&[]), - }, - ))?), + }); + Valid::validated(ip_push)? + }), // XXX Geneve uses the UDP source port as a flow label // value for the purposes of ECMP -- a hash of the // 5-tuple. 
However, when using Geneve in IPv6 one could @@ -369,30 +461,47 @@ impl StaticAction for EncapAction { EncapPush::from(GenevePush { vni: phys_target.vni, entropy: flow_id.crc32() as u16, - // Allocate space in which we can include the TCP MSS, when - // needed during MSS boosting. It's theoretically doable to - // gate this on seeing an unexpectedly high/low MSS option - // in the TCP handshake, but there are problems in doing so: - // * The MSS for the flow is negotiated, but the UFT entry - // containing this transform does not know the other side. - // * UFT invalidation means we may rerun this transform in - // the middle of a flow. - // So, emit it unconditionally for VPC-internal TCP traffic, - // which could need the original MSS to be carried when LSO - // is in use. - options: if pkt_meta.is_inner_tcp() && is_internal { - Cow::Borrowed(core::slice::from_ref( + options: match ( + pkt_meta.is_inner_tcp() && is_internal, + is_mcast, + ) { + // Allocate space in which we can include the TCP MSS, when + // needed during MSS boosting. It's theoretically doable to + // gate this on seeing an unexpectedly high/low MSS option + // in the TCP handshake, but there are problems in doing so: + // * The MSS for the flow is negotiated, but the UFT entry + // containing this transform does not know the other side. + // * UFT invalidation means we may rerun this transform in + // the middle of a flow. + // So, emit it unconditionally for VPC-internal TCP traffic, + // which could need the original MSS to be carried when LSO + // is in use. + (true, false) => Cow::Borrowed(core::slice::from_ref( &GENEVE_MSS_SIZE_OPT, - )) - } else { - Cow::Borrowed(&[]) + )), + (false, true) => Cow::Borrowed(core::slice::from_ref( + &GENEVE_MCAST_OPT, + )), + (false, false) => Cow::Borrowed(&[]), + // We do not support TCP over multicast delivery. + // Multicast replication semantics conflict with TCP's + // connection/ordering guarantees, so deny this case. 
+ (true, true) => { + return Ok(AllowOrDeny::Deny); + } }, }), )?), - inner_ether: HeaderAction::Modify(EtherMod { - dst: Some(phys_target.ether), - ..Default::default() - }), + // For multicast packets, the inner destination MAC should already + // correspond to the inner L3 destination address. + inner_ether: if is_mcast { + HeaderAction::Ignore + } else { + HeaderAction::Modify(EtherMod { + dst: Some(phys_target.ether), + ..Default::default() + }) + }, ..Default::default() }; @@ -482,11 +591,79 @@ impl StaticAction for DecapAction { } } +/// Validate VNI for inbound multicast traffic in the overlay layer. +/// +/// All outbound multicast packets are currently encapsulated with VNI 77 +/// (DEFAULT_MULTICAST_VNI) for fleet-wide delivery. See [`EncapAction::gen_ht`]. +/// +/// ## Validation Policy on Rx Path +/// This validator accepts multicast packets with either of two VNI values: +/// - **VNI 77 (DEFAULT_MULTICAST_VNI)**: Fleet-wide multicast, accepted by all +/// ports regardless of VPC. This enables rack-wide multicast delivery. +/// - **Guest's VPC VNI**: Enables per-VPC multicast isolation **in the future**. +/// +/// The validator enforces VPC isolation by rejecting multicast packets with +/// VNI values that don't match either the fleet-wide VNI or this port's VPC. 
+struct MulticastVniValidator { + my_vni: Vni, +} + +impl MulticastVniValidator { + fn new(vni: Vni) -> Self { + Self { my_vni: vni } + } +} + +impl MetaAction for MulticastVniValidator { + fn mod_meta( + &self, + flow: &InnerFlowId, + action_meta: &mut ActionMeta, + ) -> ModMetaResult { + // Only validate if this is multicast traffic + if !flow.dst_ip().is_multicast() { + return Ok(AllowOrDeny::Allow(())); + } + + // Check VNI from action metadata (set by DecapAction) + if let Some(vni_str) = action_meta.get(ACTION_META_VNI) + && let Ok(vni_val) = vni_str.parse::() + && let Ok(pkt_vni) = Vni::new(vni_val) + { + let mcast_vni = Vni::new(DEFAULT_MULTICAST_VNI).unwrap(); + // Allow if VNI matches this VPC or fleet-wide multicast VNI + if pkt_vni == self.my_vni || pkt_vni == mcast_vni { + return Ok(AllowOrDeny::Allow(())); + } + // VNI mismatch or parse error - deny + return Ok(AllowOrDeny::Deny); + } + // No VNI in metadata means external packet - allow + // (external packets don't have ACTION_META_VNI set per DecapAction logic) + Ok(AllowOrDeny::Allow(())) + } + + fn implicit_preds(&self) -> (Vec, Vec) { + (vec![], vec![]) + } +} + +impl fmt::Display for MulticastVniValidator { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "mcast-vni-validator") + } +} + pub struct VpcMappings { inner: KMutex>>, } impl VpcMappings { + /// Generate a new mapping struct. + pub fn new() -> Self { + Self { inner: KMutex::new(BTreeMap::new()) } + } + /// Add a new mapping from VIP to [`PhysNet`], returning a pointer /// to the [`Virt2Phys`] this mapping belongs to. 
pub fn add(&self, vip: IpAddr, phys: PhysNet) -> Arc { @@ -495,19 +672,10 @@ impl VpcMappings { let guest_phys = GuestPhysAddr::from(phys); let mut lock = self.inner.lock(); - match lock.get(&phys.vni) { - Some(v2p) => { - v2p.set(vip, guest_phys); - v2p.clone() - } + let v2p = lock.entry(phys.vni).or_default(); + v2p.set(vip, guest_phys); - None => { - let v2p = Arc::new(Virt2Phys::new()); - v2p.set(vip, guest_phys); - lock.insert(phys.vni, v2p.clone()); - v2p - } - } + v2p.clone() } /// Delete the mapping for the given VIP in the given VNI. @@ -556,10 +724,6 @@ impl VpcMappings { None } - - pub fn new() -> Self { - VpcMappings { inner: KMutex::new(BTreeMap::new()) } - } } impl Default for VpcMappings { @@ -568,6 +732,10 @@ impl Default for VpcMappings { } } +// XXX: Should these not be RwLocks? This is a really unfortunate degree of +// contention for multiple ports in the slowpath to block one another. +// (Not common by any means, but needless when it does occur!) + /// A mapping from virtual IPs to physical location. pub struct Virt2Phys { // XXX We need to implement some sort of invalidation mechanism @@ -606,6 +774,15 @@ pub struct Virt2Boundary { pt6: KRwLock>>, } +/// A mapping from inner multicast destination IPs to underlay multicast groups. +/// +/// Validation is enforced at the API boundary (see xde.rs set_m2p_hdlr) to ensure +/// only valid admin-local IPv6 multicast addresses (ff04::/16) are stored. +pub struct Mcast2Phys { + ip4: KMutex>, + ip6: KMutex>, +} + pub const TUNNEL_ENDPOINT_MAC: [u8; 6] = [0xA8, 0x40, 0x25, 0x77, 0x77, 0x77]; impl Virt2Boundary { @@ -828,3 +1005,65 @@ impl MappingResource for Virt2Phys { } } } + +impl Mcast2Phys { + /// Create a new empty multicast-to-physical mapping table. + pub fn new() -> Self { + Self { + ip4: KMutex::new(BTreeMap::new()), + ip6: KMutex::new(BTreeMap::new()), + } + } + + /// Dump all IPv4 overlay multicast group to underlay IPv6 multicast mappings. 
+ pub fn dump_ip4(&self) -> Vec<(Ipv4Addr, Ipv6Addr)> { + self.ip4 + .lock() + .iter() + .map(|(vip, mcast)| (*vip, mcast.addr())) + .collect() + } + + /// Dump all IPv6 overlay multicast group to underlay IPv6 multicast mappings. + pub fn dump_ip6(&self) -> Vec<(Ipv6Addr, Ipv6Addr)> { + self.ip6 + .lock() + .iter() + .map(|(vip, mcast)| (*vip, mcast.addr())) + .collect() + } +} + +impl Default for Mcast2Phys { + fn default() -> Self { + Self::new() + } +} + +impl Resource for Mcast2Phys {} + +impl MappingResource for Mcast2Phys { + type Key = IpAddr; + type Entry = MulticastUnderlay; + + fn get(&self, vip: &Self::Key) -> Option { + match vip { + IpAddr::Ip4(ip4) => self.ip4.lock().get(ip4).cloned(), + IpAddr::Ip6(ip6) => self.ip6.lock().get(ip6).cloned(), + } + } + + fn remove(&self, vip: &Self::Key) -> Option { + match vip { + IpAddr::Ip4(ip4) => self.ip4.lock().remove(ip4), + IpAddr::Ip6(ip6) => self.ip6.lock().remove(ip6), + } + } + + fn set(&self, vip: Self::Key, mcast: Self::Entry) -> Option { + match vip { + IpAddr::Ip4(ip4) => self.ip4.lock().insert(ip4, mcast), + IpAddr::Ip6(ip6) => self.ip6.lock().insert(ip6, mcast), + } + } +} diff --git a/lib/oxide-vpc/src/engine/router.rs b/lib/oxide-vpc/src/engine/router.rs index cabe96e5..6f03f892 100644 --- a/lib/oxide-vpc/src/engine/router.rs +++ b/lib/oxide-vpc/src/engine/router.rs @@ -267,7 +267,19 @@ pub fn setup( default_out: DefaultAction::Deny, }; - let layer = Layer::new(ROUTER_LAYER_NAME, pb.name(), actions, ft_limit); + let mut layer = Layer::new(ROUTER_LAYER_NAME, pb.name(), actions, ft_limit); + + // Allow multicast traffic (IPv4 224.0.0.0/4 and IPv6 ff00::/8) to bypass route lookup. + // Multicast operates fleet-wide via M2P mappings, not through VPC routing. + // The overlay addresses use any valid multicast prefix; underlay restriction + // to ff04::/16 is enforced by M2P mapping validation. 
+ let mut mcast_out = Rule::new(0, Action::Allow); + mcast_out.add_predicate(Predicate::Any(vec![ + Predicate::InnerDstIp4(vec![Ipv4AddrMatch::Prefix(Ipv4Cidr::MCAST)]), + Predicate::InnerDstIp6(vec![Ipv6AddrMatch::Prefix(Ipv6Cidr::MCAST)]), + ])); + layer.add_rule(Direction::Out, mcast_out.finalize()); + pb.add_layer(layer, Pos::After(fw::FW_LAYER_NAME)) } @@ -294,6 +306,22 @@ fn make_rule( target: RouterTarget, class: RouterClass, ) -> Result, OpteError> { + // Reject router entries with multicast destination CIDRs. + // Multicast operates fleet-wide via M2P mappings and subscriptions, + // not through VPC routing. Router layer allows multicast through + // unconditionally without route lookup. + let is_mcast_dst = match dest { + IpCidr::Ip4(cidr) => cidr.ip().is_multicast(), + IpCidr::Ip6(cidr) => cidr.ip().is_multicast(), + }; + if is_mcast_dst { + return Err(OpteError::InvalidRouterEntry { + dest, + target: "multicast destinations not allowed in router entries" + .to_string(), + }); + } + if !valid_router_dest_target_pair(&dest, &target) { return Err(OpteError::InvalidRouterEntry { dest, diff --git a/lib/oxide-vpc/src/print.rs b/lib/oxide-vpc/src/print.rs index c6a46ef3..5a014702 100644 --- a/lib/oxide-vpc/src/print.rs +++ b/lib/oxide-vpc/src/print.rs @@ -9,6 +9,8 @@ //! This is mostly just a place to hang printing routines so that they //! can be used by both opteadm and integration tests. +use crate::api::DumpMcastForwardingResp; +use crate::api::DumpMcastSubscriptionsResp; use crate::api::DumpVirt2BoundaryResp; use crate::api::DumpVirt2PhysResp; use crate::api::GuestPhysAddr; @@ -135,3 +137,70 @@ fn print_v2p_ip6( std::net::Ipv6Addr::from(phys.ip.bytes()), ) } + +/// Print the header for the [`print_mcast_fwd()`] output. +fn print_mcast_fwd_header(t: &mut impl Write) -> std::io::Result<()> { + writeln!(t, "GROUP IP\tUNDERLAY IP\tVNI\tREPLICATION") +} + +/// Print a [`DumpMcastForwardingResp`]. 
+pub fn print_mcast_fwd(resp: &DumpMcastForwardingResp) -> std::io::Result<()> { + print_mcast_fwd_into(&mut std::io::stdout(), resp) +} + +/// Print a [`DumpMcastForwardingResp`] into a given writer. +pub fn print_mcast_fwd_into( + writer: &mut impl Write, + resp: &DumpMcastForwardingResp, +) -> std::io::Result<()> { + let mut t = TabWriter::new(writer); + writeln!(t, "Multicast Forwarding Table")?; + write_hrb(&mut t)?; + writeln!(t)?; + print_mcast_fwd_header(&mut t)?; + write_hr(&mut t)?; + + for entry in &resp.entries { + for (next_hop, replication) in &entry.next_hops { + writeln!( + t, + "{}\t{}\t{}\t{replication:?}", + entry.underlay, next_hop.addr, next_hop.vni + )?; + } + } + writeln!(t)?; + t.flush() +} + +/// Print the header for the [`print_mcast_subs()`] output. +fn print_mcast_subs_header(t: &mut impl Write) -> std::io::Result<()> { + writeln!(t, "UNDERLAY GROUP\tSUBSCRIBED PORTS") +} + +/// Print a [`DumpMcastSubscriptionsResp`]. +pub fn print_mcast_subs( + resp: &DumpMcastSubscriptionsResp, +) -> std::io::Result<()> { + print_mcast_subs_into(&mut std::io::stdout(), resp) +} + +/// Print a [`DumpMcastSubscriptionsResp`] into a given writer. 
+pub fn print_mcast_subs_into( + writer: &mut impl Write, + resp: &DumpMcastSubscriptionsResp, +) -> std::io::Result<()> { + let mut t = TabWriter::new(writer); + writeln!(t, "Multicast Subscriptions")?; + write_hrb(&mut t)?; + writeln!(t)?; + print_mcast_subs_header(&mut t)?; + write_hr(&mut t)?; + + for entry in &resp.entries { + let ports = entry.ports.join(", "); + writeln!(t, "{}\t{ports}", entry.underlay)?; + } + writeln!(t)?; + t.flush() +} diff --git a/lib/oxide-vpc/tests/integration_tests.rs b/lib/oxide-vpc/tests/integration_tests.rs index fe3454d6..4c1a8e66 100644 --- a/lib/oxide-vpc/tests/integration_tests.rs +++ b/lib/oxide-vpc/tests/integration_tests.rs @@ -36,6 +36,7 @@ use opte::engine::ip::v4::Ipv4Addr; use opte::engine::ip::v4::Ipv4Ref; use opte::engine::ip::v4::ValidIpv4; use opte::engine::ip::v6::Ipv6; +use opte::engine::ip::v6::Ipv6Addr; use opte::engine::ip::v6::Ipv6Ref; use opte::engine::ip::v6::ValidIpv6; use opte::engine::packet::InnerFlowId; @@ -43,10 +44,15 @@ use opte::engine::packet::MblkFullParsed; use opte::engine::packet::MismatchError; use opte::engine::packet::Packet; use opte::engine::parse::ValidUlp; +use opte::engine::port::DropReason; use opte::engine::port::ProcessError; +use opte::engine::port::ProcessResult; +use opte::engine::rule::MappingResource; use opte::engine::tcp::TIME_WAIT_EXPIRE_SECS; +use opte::ingot::ethernet::Ethertype; use opte::ingot::geneve::GeneveRef; use opte::ingot::icmp::IcmpV6Ref; +use opte::ingot::ip::IpProtocol; use opte::ingot::tcp::TcpRef; use opte::ingot::types::Emit; use opte::ingot::types::HeaderLen; @@ -59,6 +65,7 @@ use oxide_vpc::api::ExternalIpCfg; use oxide_vpc::api::FirewallRule; use oxide_vpc::api::RouterClass; use oxide_vpc::api::VpcCfg; +use oxide_vpc::engine::geneve; use pcap::*; use smoltcp::phy::ChecksumCapabilities as CsumCapab; use smoltcp::wire::Icmpv4Packet; @@ -492,7 +499,7 @@ fn guest_to_guest_no_route() { g1.vpc_map.add(g2_cfg.ipv4().private_ip.into(), g2_cfg.phys_addr()); 
g1.port.start(); set!(g1, "port_state=running"); - // Make sure the router is configured to drop all packets. + // Make sure the router is configured to drop all packets except multicast. router::del_entry( &g1.port, IpCidr::Ip4(g1_cfg.ipv4().vpc_subnet), @@ -500,7 +507,7 @@ fn guest_to_guest_no_route() { RouterClass::System, ) .unwrap(); - update!(g1, ["incr:epoch", "set:router.rules.out=0"]); + update!(g1, ["incr:epoch", "set:router.rules.out=1"]); let mut pkt1_m = http_syn(&g1_cfg, &g2_cfg); let pkt1 = parse_outbound(&mut pkt1_m, VpcParser {}).unwrap(); let res = g1.port.process(Out, pkt1); @@ -2537,8 +2544,8 @@ fn test_gateway_neighbor_advert_reply() { .unwrap_or_else(|| String::from("Drop")); panic!( "Generated unexpected packet from NS: {}\n\ - Result: {:?}\nExpected: {}", - d.ns, res, na, + Result: {res:?}\nExpected: {na}", + d.ns ); } }; @@ -4678,7 +4685,7 @@ fn icmp_inner_has_nat_applied() { header: smoltcp::wire::Ipv4Repr { src_addr: remote_addr.into(), dst_addr: g1_cfg.ipv4().private_ip.into(), - next_header: IpProtocol::Udp, + next_header: smoltcp::wire::IpProtocol::Udp, payload_len: 256, hop_limit: 0, }, @@ -4747,7 +4754,7 @@ fn icmpv6_inner_has_nat_applied() { header: smoltcp::wire::Ipv6Repr { src_addr: eph_ip.into(), dst_addr: remote_addr.into(), - next_header: IpProtocol::Udp, + next_header: smoltcp::wire::IpProtocol::Udp, // Unimportant -- header is truncated. payload_len: 256, hop_limit: 255, @@ -4811,3 +4818,328 @@ fn icmpv6_inner_has_nat_applied() { let (v6, ..) 
= ValidIpv6::parse(body).unwrap(); assert_eq!(v6.source(), g1_cfg.ipv6().private_ip); } + +// Test that IPv6 multicast packets get encapsulated with Geneve +#[test] +fn test_ipv6_multicast_encapsulation() { + let g1_cfg = g1_cfg(); + let mut g1 = oxide_net_setup("g1_port", &g1_cfg, None, None); + + // Create an IPv6 multicast packet (ff04::1:3 - admin-local multicast) + let mcast_dst = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x03, + ]); + + // Create a multicast underlay address (must be multicast for forwarding) + let mcast_underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0xff, 0xff, + ]); + + // Add multicast forwarding entry BEFORE starting the port + g1.m2p.set( + mcast_dst.into(), + opte::api::MulticastUnderlay::new(mcast_underlay) + .expect("ff04::/16 is admin-scoped multicast"), + ); + + g1.port.start(); + set!(g1, "port_state=running"); + + // Multicast traffic is now detected automatically by checking if the destination + // IP is a multicast address. No router entries are needed for multicast since it + // operates at the fleet level (cross-VPC) rather than within VPC routing. 
+ + // Build a UDP packet to the multicast address + let eth = Ethernet { + destination: MacAddr::from([0x33, 0x33, 0x00, 0x01, 0x00, 0x03]), + source: g1_cfg.guest_mac, + ethertype: Ethertype::IPV6, + }; + let ip = Ipv6 { + source: g1_cfg.ipv6().private_ip, + destination: mcast_dst, + next_header: IpProtocol::UDP, + payload_len: (Udp::MINIMUM_LENGTH) as u16, + hop_limit: 64, + ..Default::default() + }; + let udp = Udp { + source: 12345, + destination: 5353, // mDNS port as an example multicast UDP service + length: Udp::MINIMUM_LENGTH as u16, + ..Default::default() + }; + let mut pkt_m = ulp_pkt(eth, ip, udp, &[]); + + let pkt = parse_outbound(&mut pkt_m, GenericUlp {}).unwrap(); + let res = g1.port.process(Out, pkt).expect("process should succeed"); + + // Verify packet was encapsulated + let Modified(spec) = res else { + panic!("Expected Modified result, got {res:?}"); + }; + let mut pkt_m = spec.apply(pkt_m); + + // Parse the encapsulated packet as inbound (it's now on the wire with Geneve) + let parsed = Packet::parse_inbound(pkt_m.iter_mut(), VpcParser {}).unwrap(); + let meta = parsed.meta(); + + // Verify the outer IPv6 destination is the multicast underlay address + assert_eq!( + meta.outer_v6.destination(), + mcast_underlay, + "Outer IPv6 destination should be multicast underlay address" + ); + + // Verify the outer IPv6 source is the physical IP of the guest + assert_eq!( + meta.outer_v6.source(), + g1_cfg.phys_ip, + "Outer IPv6 source should be the physical IP" + ); + + // Verify the outer Ethernet destination MAC is the IPv6 multicast MAC + // For IPv6 multicast, MAC is 33:33:xx:xx:xx:xx where xx:xx:xx:xx are the + // last 4 bytes of the IPv6 address + let expected_outer_mac = mcast_underlay.multicast_mac().unwrap(); + assert_eq!( + meta.outer_eth.destination(), + expected_outer_mac, + "Outer Ethernet MAC should be IPv6 multicast MAC" + ); + + // Verify we have Geneve encapsulation with the correct VNI (fleet multicast VNI) + assert_eq!( + 
meta.outer_encap.vni(), + Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI).unwrap(), + "Geneve VNI should match DEFAULT_MULTICAST_VNI" + ); + + // Verify the Geneve multicast option is present with External replication + let replication = geneve::extract_multicast_replication(&meta.outer_encap) + .expect("Geneve packet should have multicast option"); + assert_eq!( + replication, + oxide_vpc::api::Replication::External, + "Multicast option should have External replication" + ); +} + +// Test that TCP + multicast packets are denied (TCP is incompatible with multicast) +#[test] +fn test_tcp_multicast_denied() { + let g1_cfg = g1_cfg(); + let mut g1 = oxide_net_setup("g1_port", &g1_cfg, None, None); + + // Create an IPv6 multicast address + let mcast_dst = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x03, + ]); + + let mcast_underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0xff, 0xff, + ]); + + g1.m2p.set( + mcast_dst.into(), + opte::api::MulticastUnderlay::new(mcast_underlay) + .expect("ff04::/16 is admin-scoped multicast"), + ); + + g1.port.start(); + set!(g1, "port_state=running"); + + // Build a TCP packet to the multicast address (should be denied) + let mut pkt_m = http_syn3( + g1_cfg.guest_mac, + g1_cfg.ipv6().private_ip, + MacAddr::from([0x33, 0x33, 0x00, 0x01, 0x00, 0x03]), + mcast_dst, + 12345, + 80, + ); + + let pkt = parse_outbound(&mut pkt_m, GenericUlp {}).unwrap(); + let res = g1.port.process(Out, pkt); + + // Verify packet was denied (TCP + multicast is incompatible) + assert!( + matches!( + res, + Ok(ProcessResult::Drop { reason: DropReason::Layer { .. } }) + ), + "Expected Drop with Layer reason, got: {res:?}" + ); +} + +// Ensure packets with unknown critical Geneve options are rejected during +// option validation (fail-closed on unrecognised critical options). 
+#[test] +fn test_drop_on_unknown_critical_option() { + // Build Ethernet + IPv6 (with no extensions) + UDP + Geneve header + // carrying a single unknown critical option (class=0xffff, type=0x80, len=0). + // Minimal inner Ethernet + IPv4 + UDP follows to satisfy the parser. + let mut buf: Vec = Vec::new(); + + // Ethernet (14B) + buf.extend_from_slice(&[ + 0x33, 0x33, 0x00, 0x00, 0x00, 0x01, // dst + 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, // src + 0x86, 0xdd, // ethertype IPv6 + ]); + + // IPv6 header (40B) + // ver/tc/fl, payload_len, next_header=UDP(17), hop_limit + // payload_len = UDP length (we'll compute) + let ip6_hdr_pos = buf.len(); + buf.extend_from_slice(&[ + 0x60, 0x00, 0x00, 0x00, // ver+tc+fl + 0x00, 0x00, // payload length (placeholder) + 0x11, // next header UDP + 0x40, // hop limit + // src + 0xfd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, // dst + 0xff, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x03, + ]); + + // UDP header (8B) + let udp_pos = buf.len(); + buf.extend_from_slice(&[ + 0x1e, 0x61, // source port + 0x17, 0xc1, // dest 6081 + 0x00, 0x00, // length (placeholder) + 0x00, 0x00, // checksum + ]); + + // Geneve header (8B): ver+optlen=1 (4B option header), flags=critical opts + buf.extend_from_slice(&[ + 0x01, // ver=0, optlen=1 word (4B option header) + 0x40, // flags: critical options present + 0x65, 0x58, // protocol type 0x6558 + 0x00, 0x00, 0x00, 0x00, // VNI=0, reserved + ]); + // Unknown critical option: class=0xffff, type=0x80 (critical), len=0 + buf.extend_from_slice(&[ + 0xff, 0xff, // class + 0x80, // critical + type + 0x00, // rsvd+len=0 + ]); + // No body (len=0) + + // Minimal inner Ethernet + IPv4 + UDP (to satisfy inner parse) + buf.extend_from_slice(&[ + // inner Ethernet + 0x00, 0x16, 0x3e, 0x00, 0x00, 0x02, 0x00, 0x16, 0x3e, 0x00, 0x00, 0x01, + 0x08, 0x00, // IPv4 + // inner IPv4 (20B) + 0x45, 0x00, 0x00, 0x1c, 0x00, 0x01, 0x00, 
0x00, 0x11, 0x00, 0x0a, 0x00, + 0x00, 0x01, 0x0a, 0x00, 0x00, 0x02, // src=10.0.0.1, dst=10.0.0.2 + // inner UDP (8B) + 0x12, 0x34, 0x13, 0x37, 0x00, 0x08, 0x00, 0x00, + ]); + + // Compute UDP length and IPv6 payload length + let udp_len = (buf.len() - udp_pos) as u16; + buf[udp_pos + 4] = (udp_len >> 8) as u8; + buf[udp_pos + 5] = (udp_len & 0xff) as u8; + + let ip6_payload_len = (buf.len() - (ip6_hdr_pos + 40)) as u16; + buf[ip6_hdr_pos + 4] = (ip6_payload_len >> 8) as u8; + buf[ip6_hdr_pos + 5] = (ip6_payload_len & 0xff) as u8; + + // Parse Geneve directly from the UDP payload (skip L2/L3) and validate options + let geneve_offset = 14 /*eth*/ + 40 /*ipv6*/ + 8 /*udp*/; + let (geneve, _, _) = + opte::ingot::geneve::ValidGeneve::parse(&buf[geneve_offset..]) + .expect("parse geneve header"); + assert!(matches!( + geneve::validate_options(&geneve), + Err(opte::engine::packet::ParseError::UnrecognisedTunnelOpt { .. }) + )); +} + +// Ensure Geneve parsing works correctly when an IPv6 extension header is present +// before UDP (e.g., Hop-by-Hop). Verifies that option walking is positioned at +// the correct Geneve offset. 
+#[test] +fn test_v6_ext_hdr_geneve_offset_ok() { + let mut buf: Vec = Vec::new(); + + // Ethernet + buf.extend_from_slice(&[ + 0x33, 0x33, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, + 0x86, 0xdd, + ]); + + // IPv6 header (Next Header = Hop-by-Hop (0)) + let ip6_hdr_pos = buf.len(); + buf.extend_from_slice(&[ + 0x60, 0x00, 0x00, 0x00, 0x00, + 0x00, // payload length (placeholder) + 0x00, // next header: Hop-by-Hop + 0x40, // hop limit + // src + 0xfd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, // dst + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0xff, 0xff, + ]); + + // Hop-by-Hop extension header (8B) -> next header UDP (17), hdr ext len=0 + buf.extend_from_slice(&[0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]); + + // UDP header (8B) + let udp_pos = buf.len(); + buf.extend_from_slice(&[ + 0x1e, 0x61, // source + 0x17, 0xc1, // dest 6081 + 0x00, 0x00, // length (placeholder) + 0x00, 0x00, // checksum + ]); + + // Geneve header (8B): ver+optlen=2 (8B option area), flags=0 + buf.extend_from_slice(&[0x02, 0x00, 0x65, 0x58, 0x00, 0x00, 0x00, 0x00]); + // Multicast option: class=0x0129, type=0x01, len=1; body=4B with External + buf.extend_from_slice(&[ + 0x01, + 0x29, + 0x01, + 0x01, // class, type, rsvd+len + (oxide_vpc::api::Replication::External as u8) << 6, + 0x00, + 0x00, + 0x00, + ]); + + // Minimal inner Ethernet + IPv4 + UDP + buf.extend_from_slice(&[ + 0x00, 0x16, 0x3e, 0x00, 0x00, 0x02, 0x00, 0x16, 0x3e, 0x00, 0x00, 0x01, + 0x08, 0x00, 0x45, 0x00, 0x00, 0x1c, 0x00, 0x01, 0x00, 0x00, 0x11, 0x00, + 0x0a, 0x00, 0x00, 0x01, 0x0a, 0x00, 0x00, 0x02, 0x12, 0x34, 0x13, 0x37, + 0x00, 0x08, 0x00, 0x00, + ]); + + // Set UDP and IPv6 payload lengths + let udp_len = (buf.len() - udp_pos) as u16; + buf[udp_pos + 4] = (udp_len >> 8) as u8; + buf[udp_pos + 5] = (udp_len & 0xff) as u8; + + let ip6_payload_len = (buf.len() - (ip6_hdr_pos + 40)) as u16; + 
buf[ip6_hdr_pos + 4] = (ip6_payload_len >> 8) as u8; + buf[ip6_hdr_pos + 5] = (ip6_payload_len & 0xff) as u8; + + // Parse Geneve directly after IPv6 ext header and UDP, then check multicast option + let geneve_offset = 14 /*eth*/ + 40 /*ipv6*/ + 8 /*hop-by-hop*/ + 8 /*udp*/; + let (geneve, _, _) = + opte::ingot::geneve::ValidGeneve::parse(&buf[geneve_offset..]) + .expect("parse geneve header after ext hdr"); + let repl = geneve::extract_multicast_replication(&geneve) + .expect("multicast option present"); + assert_eq!(repl, oxide_vpc::api::Replication::External); +} diff --git a/rustfmt.toml b/rustfmt.toml index f1d3d2fc..d5d9e9ef 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -4,3 +4,4 @@ max_width = 80 use_small_heuristics = "max" imports_granularity = "Item" style_edition = "2024" +edition = "2024" diff --git a/xde-tests/Cargo.toml b/xde-tests/Cargo.toml index 84e0d5bd..6ca3dc3a 100644 --- a/xde-tests/Cargo.toml +++ b/xde-tests/Cargo.toml @@ -8,6 +8,7 @@ repository.workspace = true [dependencies] opte-ioctl.workspace = true +opte-test-utils.workspace = true oxide-vpc.workspace = true anyhow.workspace = true diff --git a/xde-tests/src/lib.rs b/xde-tests/src/lib.rs index 2fd8a634..3c9307a3 100644 --- a/xde-tests/src/lib.rs +++ b/xde-tests/src/lib.rs @@ -5,10 +5,15 @@ // Copyright 2025 Oxide Computer Company use anyhow::Result; +use anyhow::anyhow; +use anyhow::bail; use opte_ioctl::OpteHdl; use oxide_vpc::api::AddFwRuleReq; use oxide_vpc::api::AddRouterEntryReq; use oxide_vpc::api::Address; +use oxide_vpc::api::ClearMcast2PhysReq; +use oxide_vpc::api::ClearMcastForwardingReq; +use oxide_vpc::api::DEFAULT_MULTICAST_VNI; use oxide_vpc::api::DhcpCfg; use oxide_vpc::api::Direction; use oxide_vpc::api::ExternalIpCfg; @@ -21,27 +26,83 @@ use oxide_vpc::api::IpCidr; use oxide_vpc::api::Ipv4Addr; use oxide_vpc::api::Ipv4Cfg; use oxide_vpc::api::Ipv6Addr; +use oxide_vpc::api::Ipv6Cfg; use oxide_vpc::api::MacAddr; +use oxide_vpc::api::McastSubscribeReq; +use 
oxide_vpc::api::McastUnsubscribeReq; +use oxide_vpc::api::MulticastUnderlay; use oxide_vpc::api::PhysNet; use oxide_vpc::api::Ports; use oxide_vpc::api::RouterClass; use oxide_vpc::api::RouterTarget; use oxide_vpc::api::SNat4Cfg; +use oxide_vpc::api::SNat6Cfg; +use oxide_vpc::api::SetMcast2PhysReq; +use oxide_vpc::api::SetMcastForwardingReq; use oxide_vpc::api::SetVirt2PhysReq; use oxide_vpc::api::Vni; use oxide_vpc::api::VpcCfg; use rand::Rng; +use std::cell::RefCell; use std::collections::HashSet; +use std::process::Child; use std::process::Command; +use std::process::Stdio; use std::sync::Arc; use std::time::Duration; +use std::time::Instant; use zone::Zlogin; pub use ztest::*; -/// The overlay network used in all tests. +/// Ensure a zone with the given name is not present. +/// +/// Best-effort: attempt halt and uninstall, then poll until the zone +/// disappears from `zoneadm list -cv` (bounded timeout). +fn ensure_zone_absent(name: &str) -> Result<()> { + // Try to halt if running; ignore failures and suppress stderr + let _ = Command::new("pfexec") + .arg("zoneadm") + .args(["-z", name, "halt"]) + .stderr(Stdio::null()) + .status(); + + // Try to uninstall; ignore failures and suppress stderr + let _ = Command::new("pfexec") + .arg("zoneadm") + .args(["-z", name, "uninstall", "-F"]) + .stderr(Stdio::null()) + .status(); + + // Poll for disappearance up to 10 seconds + let deadline = Instant::now() + Duration::from_secs(10); + loop { + let out = Command::new("pfexec") + .arg("zoneadm") + .args(["list", "-cv"]) + .output()?; + let stdout = String::from_utf8_lossy(&out.stdout).to_string(); + if !stdout.contains(name) { + break; + } + if Instant::now() >= deadline { + bail!( + "zone '{name}' still present after uninstall attempts; stdout: {stdout}" + ); + } + std::thread::sleep(Duration::from_millis(100)); + } + + Ok(()) +} + +/// The IPv4 overlay network used in all tests. 
pub const OVERLAY_NET: &str = "10.0.0.0/24"; -/// The overlay OPTE gateway used in all tests. +/// The IPv4 overlay OPTE gateway used in all tests. pub const OVERLAY_GW: &str = "10.0.0.254"; +/// The IPv6 overlay network used in all tests. +pub const OVERLAY_NET_V6: &str = "fd00::/64"; +/// The IPv6 overlay OPTE gateway used in all tests. +pub const OVERLAY_GW_V6: &str = "fd00::254"; /// This is a wrapper around the ztest::Zone object that encapsulates common /// logic needed for running the OPTE tests zones used in this test suite. @@ -54,19 +115,95 @@ impl OpteZone { /// of interfaces. In illumos parlance, the interfaces are data link /// devices. fn new(name: &str, zfs: &Zfs, ifx: &[&str], brand: &str) -> Result { + // Ensure any prior zone with this name is fully removed before creating + // a new one, to avoid flakes from leftover state. + let _ = ensure_zone_absent(name); let zone = Zone::new(name, brand, zfs, ifx, &[])?; Ok(Self { zone }) } - /// Wait for the network to come up, then set up the overlay network. + /// Wait for the network to come up, then set up the IPv4 overlay network. fn setup(&self, devname: &str, addr: String) -> Result<()> { self.zone.wait_for_network()?; + // Configure IPv4 with static address (immediate, no DHCP wait) + self.zone.zexec(&format!( + "ipadm create-addr -t -T static -a {addr}/24 {devname}/test" + ))?; + + self.zone.zexec(&format!("route add -iface {OVERLAY_GW} {addr}"))?; + self.zone.zexec(&format!("route add {OVERLAY_NET} {OVERLAY_GW}"))?; + // Add multicast route so multicast traffic goes through the OPTE gateway + self.zone.zexec(&format!("route add 224.0.0.0/4 {OVERLAY_GW}"))?; + Ok(()) + } + + /// Wait for the network to come up, then set up dual-stack (IPv4 + IPv6) overlay network. 
+ fn setup_dualstack( + &self, + devname: &str, + ipv4_addr: String, + ipv6_addr: String, + ) -> Result<()> { + self.zone.wait_for_network()?; + // Configure IPv4 with static address (immediate, no DHCP wait) + self.zone.zexec(&format!( + "ipadm create-addr -t -T static -a {ipv4_addr}/24 {devname}/testv4" + ))?; self.zone - .zexec(&format!("ipadm create-addr -t -T dhcp {}/test", devname))?; - self.zone - .zexec(&format!("route add -iface {} {}", OVERLAY_GW, addr))?; + .zexec(&format!("route add -iface {OVERLAY_GW} {ipv4_addr}"))?; + self.zone.zexec(&format!("route add {OVERLAY_NET} {OVERLAY_GW}"))?; + + // Configure IPv6 with static address + // Use addrconf first to enable IPv6 on the interface, then add static address + self.zone.zexec(&format!( + "ipadm create-addr -t -T addrconf {devname}/addrconf" + ))?; + // Small delay to let addrconf initialize + std::thread::sleep(Duration::from_millis(500)); + self.zone.zexec(&format!( + "ipadm create-addr -t -T static -a {ipv6_addr}/64 {devname}/testv6" + ))?; + self.zone.zexec(&format!( + "route add -inet6 -iface {OVERLAY_GW_V6} {ipv6_addr}" + ))?; + self.zone.zexec(&format!( + "route add -inet6 {OVERLAY_NET_V6} {OVERLAY_GW_V6}" + ))?; + // Add multicast routes so multicast traffic goes through the OPTE gateway + self.zone.zexec(&format!("route add 224.0.0.0/4 {OVERLAY_GW}"))?; self.zone - .zexec(&format!("route add {} {}", OVERLAY_NET, OVERLAY_GW))?; + .zexec(&format!("route add -inet6 ff04::/16 {OVERLAY_GW_V6}"))?; + Ok(()) + } + + /// Send a single UDP packet (IPv4) from this zone using netcat. + /// Pins the source address with `-s` for deterministic egress selection. + pub fn send_udp_v4( + &self, + src_ip: &str, + dst_ip: &str, + port: u16, + payload: &str, + ) -> Result<()> { + let cmd = + format!("echo '{payload}' | nc -u -s {src_ip} -w1 {dst_ip} {port}"); + self.zone.zexec(&cmd)?; + Ok(()) + } + + /// Send a single UDP packet (IPv6) from this zone using netcat. 
+ /// Uses `-s` with the IPv6 source for deterministic egress. + /// Avoids `-6` for illumos netcat compatibility (destination selects family). + pub fn send_udp_v6( + &self, + src_ip: &str, + dst_ip: &str, + port: u16, + payload: &str, + ) -> Result<()> { + let cmd = + format!("echo '{payload}' | nc -u -s {src_ip} -w1 {dst_ip} {port}"); + self.zone.zexec(&cmd)?; Ok(()) } } @@ -77,6 +214,7 @@ impl OpteZone { pub struct OptePort { name: String, cfg: VpcCfg, + mcast_subscriptions: RefCell>, } impl OptePort { @@ -106,12 +244,67 @@ impl OptePort { }), guest_mac: guest_mac.parse().unwrap(), gateway_mac: "a8:40:25:00:00:01".parse().unwrap(), - vni: Vni::new(1701u32).unwrap(), + vni: Vni::new(DEFAULT_MULTICAST_VNI).unwrap(), + phys_ip: phys_ip.parse().unwrap(), + }; + let adm = OpteHdl::open()?; + adm.create_xde(name, cfg.clone(), DhcpCfg::default(), false)?; + Ok(OptePort { + name: name.into(), + cfg, + mcast_subscriptions: RefCell::new(Vec::new()), + }) + } + + /// Create a new OPTE port with dual-stack (IPv4 + IPv6) support. 
+ pub fn new_dualstack( + name: &str, + private_ip_v4: &str, + private_ip_v6: &str, + guest_mac: &str, + phys_ip: &str, + ) -> Result { + let cfg = VpcCfg { + ip_cfg: IpCfg::DualStack { + ipv4: Ipv4Cfg { + vpc_subnet: OVERLAY_NET.parse().unwrap(), + private_ip: private_ip_v4.parse().unwrap(), + gateway_ip: OVERLAY_GW.parse().unwrap(), + external_ips: ExternalIpCfg { + snat: Some(SNat4Cfg { + external_ip: "1.2.3.4".parse().unwrap(), + ports: 1000..=2000, + }), + ephemeral_ip: None, + floating_ips: vec![], + }, + }, + ipv6: Ipv6Cfg { + vpc_subnet: OVERLAY_NET_V6.parse().unwrap(), + private_ip: private_ip_v6.parse().unwrap(), + gateway_ip: OVERLAY_GW_V6.parse().unwrap(), + external_ips: ExternalIpCfg { + snat: Some(SNat6Cfg { + external_ip: "2001:db8::1".parse().unwrap(), + ports: 4097..=8192, + }), + ephemeral_ip: None, + floating_ips: vec![], + }, + }, + }, + guest_mac: guest_mac.parse().unwrap(), + gateway_mac: "a8:40:25:00:00:01".parse().unwrap(), + vni: Vni::new(DEFAULT_MULTICAST_VNI).unwrap(), phys_ip: phys_ip.parse().unwrap(), }; let adm = OpteHdl::open()?; adm.create_xde(name, cfg.clone(), DhcpCfg::default(), false)?; - Ok(OptePort { name: name.into(), cfg }) + Ok(OptePort { + name: name.into(), + cfg, + mcast_subscriptions: RefCell::new(Vec::new()), + }) } /// Add an overlay routing entry to this port. @@ -119,7 +312,7 @@ impl OptePort { let adm = OpteHdl::open()?; adm.add_router_entry(&AddRouterEntryReq { port_name: self.name.clone(), - dest: IpCidr::Ip4(format!("{}/32", dest).parse().unwrap()), + dest: IpCidr::Ip4(format!("{dest}/32").parse().unwrap()), target: RouterTarget::Ip(dest.parse().unwrap()), class: RouterClass::System, })?; @@ -150,11 +343,20 @@ impl OptePort { self.cfg.guest_mac.bytes() } - /// Return the guest IP address as a string. + /// Return the guest IPv4 address as a string. 
pub fn ip(&self) -> String { match &self.cfg.ip_cfg { IpCfg::Ipv4(cfg) => cfg.private_ip.to_string(), - _ => panic!("expected ipv4 guest"), + IpCfg::DualStack { ipv4, .. } => ipv4.private_ip.to_string(), + _ => panic!("expected ipv4 or dualstack guest"), + } + } + + /// Return the guest IPv6 address as a string (for dual-stack ports). + pub fn ipv6(&self) -> Option { + match &self.cfg.ip_cfg { + IpCfg::DualStack { ipv6, .. } => Some(ipv6.private_ip.to_string()), + _ => None, } } @@ -162,6 +364,52 @@ impl OptePort { pub fn underlay_ip(&self) -> std::net::Ipv6Addr { self.cfg.phys_ip.into() } + + /// Return the port name. + pub fn name(&self) -> &str { + &self.name + } + + /// Subscribe this port to a multicast group. + /// Automatically tracks the subscription for cleanup on drop. + pub fn subscribe_multicast(&self, group: IpAddr) -> Result<()> { + let adm = OpteHdl::open()?; + adm.mcast_subscribe(&McastSubscribeReq { + port_name: self.name.clone(), + group, + })?; + self.mcast_subscriptions.borrow_mut().push(group); + Ok(()) + } + + /// Unsubscribe this port from a multicast group. + pub fn unsubscribe_multicast(&self, group: IpAddr) -> Result<()> { + let adm = OpteHdl::open()?; + adm.mcast_unsubscribe(&McastUnsubscribeReq { + port_name: self.name.clone(), + group, + })?; + self.mcast_subscriptions.borrow_mut().retain(|g| *g != group); + Ok(()) + } + + /// Allow multicast CIDR traffic for this port. + /// + /// Multicast is handled automatically by the gateway layer, so we just + /// need to allow the CIDR through the firewall in both directions. + pub fn add_multicast_router_entry(&self, cidr: IpCidr) -> Result<()> { + // Allow multicast traffic in both directions + self.allow_cidr(cidr, Direction::In)?; + self.allow_cidr(cidr, Direction::Out)?; + Ok(()) + } + + /// Allow multicast CIDR through the overlay firewall for the given direction. 
+ pub fn allow_cidr(&self, cidr: IpCidr, direction: Direction) -> Result<()> { + let adm = OpteHdl::open()?; + adm.allow_cidr(&self.name, cidr, direction)?; + Ok(()) + } } impl Drop for OptePort { @@ -170,12 +418,29 @@ impl Drop for OptePort { let adm = match OpteHdl::open() { Ok(adm) => adm, Err(e) => { - eprintln!("failed to open xde device on drop: {}", e); + eprintln!("failed to open xde device on drop: {e}"); return; } }; + + // Clean up multicast subscriptions + // Note: unsubscribe is now idempotent with respect to M2P mappings, + // so we only need to handle actual errors (e.g., port doesn't exist) + let subscriptions = self.mcast_subscriptions.borrow().clone(); + for group in subscriptions { + if let Err(e) = adm.mcast_unsubscribe(&McastUnsubscribeReq { + port_name: self.name.clone(), + group, + }) { + let name = &self.name; + eprintln!( + "failed to unsubscribe {name} from multicast group {group}: {e}" + ); + } + } + if let Err(e) = adm.delete_xde(&self.name) { - eprintln!("failed to delete xde on drop: {}", e); + eprintln!("failed to delete xde on drop: {e}"); } } } @@ -202,26 +467,172 @@ impl Xde { phys: PhysNet { ether: ether.parse().unwrap(), ip: ip.parse().unwrap(), - vni: Vni::new(1701u32).unwrap(), + vni: Vni::new(DEFAULT_MULTICAST_VNI).unwrap(), }, })?; Ok(()) } } impl Drop for Xde { - /// When this object is dropped, remove the xde kernel module from the - /// underlying system. fn drop(&mut self) { - // The module can no longer be successfully removed until the underlay - // has been cleared. This may not have been done, so this is fallible. + // Clear underlay to release references to simnet/vnic devices, + // allowing their cleanup to proceed. Driver remains loaded. 
if let Ok(adm) = OpteHdl::open() { - let _ = adm.clear_xde_underlay(); + if let Err(e) = adm.clear_xde_underlay() { + eprintln!("failed to clear xde underlay: {e}"); + } } + } +} - let mut cmd = Command::new("pfexec"); - cmd.args(["rem_drv", "xde"]); - if let Err(e) = cmd.output() { - eprintln!("failed to remove xde driver: {}", e); +/// Helper to run `snoop` and ensure it doesn't outlive the test. +/// +/// This avoids leaked `snoop` processes pinning DLPI devices (causing EBUSY) +/// when tests time out. +pub struct SnoopGuard { + child: Option, +} + +impl SnoopGuard { + /// Start a `snoop` capture on `dev_name` with the provided packet `filter`. + /// Filter syntax matches snoop conventions (e.g., "udp and port 5353"). + /// Captures a single packet (`-c 1`) and dumps hex output (`-x0`). + /// Uses `-r` to disable name resolution for deterministic numeric output. + pub fn start(dev_name: &str, filter: &str) -> anyhow::Result { + let child = Command::new("pfexec") + .args([ + "snoop", "-r", "-d", dev_name, "-c", "1", "-P", "-x0", filter, + ]) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn()?; + Ok(Self { child: Some(child) }) + } + + /// Wait for completion with a timeout. Returns stdout if successful. + pub fn wait_with_timeout( + &mut self, + timeout: Duration, + ) -> anyhow::Result { + let deadline = Instant::now() + timeout; + + loop { + let child = self.child.as_mut().expect("child already taken"); + match child.try_wait()? { + Some(_status) => { + // Child exited; collect output. + let child = self.child.take().expect("child already taken"); + return Ok(child.wait_with_output()?); + } + None => { + if Instant::now() >= deadline { + // Timed out; kill snoop so it doesn't hold interfaces open. 
+ let _ = child.kill(); + let _ = child.wait(); + bail!("snoop capture timed out"); + } + std::thread::sleep(Duration::from_millis(50)); + } + } + } + } +} + +impl Drop for SnoopGuard { + fn drop(&mut self) { + if let Some(child) = &mut self.child + && let Ok(None) = child.try_wait() + { + let _ = child.kill(); + let _ = child.wait(); + } + } +} + +/// Ensure the host has an IPv6 multicast route for admin-local scope +/// (ff04::/16) pointing to the provided interface. This helps the underlay +/// forwarding tests route multicast packets deterministically. +/// +/// Returns Ok even if the route already exists or if the command fails at +/// runtime; logs a warning on non-successful route add attempts. +pub fn ensure_underlay_admin_scoped_route_v6(interface: &str) -> Result<()> { + let out = std::process::Command::new("pfexec") + .args(["route", "add", "-inet6", "ff04::/16", "-iface", interface]) + .output()?; + + if !out.status.success() { + let stderr = String::from_utf8_lossy(&out.stderr); + // Treat "File exists" as benign; otherwise, just warn and continue. + if !stderr.to_lowercase().contains("file exists") { + eprintln!( + "Warning: failed to add IPv6 multicast route ff04::/16 on {interface}: {stderr}" + ); + } + } + Ok(()) +} + +/// Global multicast group state that cleans up M2P mappings and forwarding +/// entries on drop. Port-specific subscriptions are handled automatically by +/// [`OptePort::drop()`]. +/// +/// Use this to set up multicast groups in tests. Port subscriptions should use +/// `port.subscribe_multicast(group)` which tracks cleanup automatically. +/// +/// All multicast groups use DEFAULT_MULTICAST_VNI (77) for fleet-wide multicast. 
+pub struct MulticastGroup { + pub group: IpAddr, + pub underlay: MulticastUnderlay, +} + +impl MulticastGroup { + pub fn new(group: IpAddr, underlay: MulticastUnderlay) -> Result { + let hdl = OpteHdl::open()?; + hdl.set_m2p(&SetMcast2PhysReq { group, underlay })?; + Ok(Self { group, underlay }) + } + + /// Set multicast forwarding entries for this group. + pub fn set_forwarding( + &self, + next_hops: Vec<( + oxide_vpc::api::NextHopV6, + oxide_vpc::api::Replication, + )>, + ) -> Result<()> { + let hdl = OpteHdl::open()?; + hdl.set_mcast_fwd(&SetMcastForwardingReq { + underlay: self.underlay, + next_hops, + })?; + Ok(()) + } +} + +impl Drop for MulticastGroup { + fn drop(&mut self) { + let Ok(hdl) = OpteHdl::open() else { + eprintln!("failed to open xde device for multicast cleanup"); + return; + }; + + // Clear forwarding entry + let underlay = self.underlay; + if let Err(e) = hdl.clear_mcast_fwd(&ClearMcastForwardingReq { + underlay: self.underlay, + }) { + eprintln!( + "failed to clear multicast forwarding for {underlay}: {e}" + ); + } + + // Clear M2P mapping + let group = self.group; + if let Err(e) = hdl.clear_m2p(&ClearMcast2PhysReq { + group: self.group, + underlay: self.underlay, + }) { + eprintln!("failed to clear M2P mapping for {group}: {e}"); } } } @@ -244,6 +655,9 @@ impl TestNode { /// A topology of local zones interconnected with simlinks over /// an OPTE dataplane. // Note: these fields have a *very* sensitive drop order. +// Rust drops fields in declaration order. Zones must drop FIRST (to release +// references to network devices), then network infrastructure can clean up. +// Drop order: nodes -> null_ports -> v6_routes -> xde -> lls -> vnics -> simnet -> zfs pub struct Topology { pub nodes: Vec, pub null_ports: Vec, @@ -288,6 +702,14 @@ pub struct Topology { /// sanity checker to make sure basic opte/xde functionality is working - and /// that we're not hitting things like debug asserts in the OS. 
pub fn two_node_topology(brand: &str) -> Result { + two_node_topology_named(brand, "a", "b") +} + +pub fn two_node_topology_named( + brand: &str, + zone_a_name: &str, + zone_b_name: &str, +) -> Result { // Create the "underlay loopback". With simnet device pairs, any packet that // goes in one is forwarded to the other. In the topology depicted above, // this means that anything vopte0 sends, will be encapsulated onto the @@ -318,11 +740,11 @@ pub fn two_node_topology(brand: &str) -> Result { opte0.fw_allow_all()?; // Add a host route to the underlay address of opte0, through the link local - // address of sim0 as a nexthop through sim1. This is facilitating the flow + // address of sim0 as a next hop through sim1. This is facilitating the flow // of traffic from opte1 to opte0. When a packet enters opte1 (from vopte1) // destined for 10.0.0.1, opte will look up the v2p mapping which points to // fd44::1. That is the underlay address of opte0. The route below says: - // that address is reachable through the sim1 interface, with a nexthop of + // that address is reachable through the sim1 interface, with a next hop of // the sim0 interface. In the diagram above, that is the "upward" direction // of our simnet underlay loopback. The xde device uses the kernel's routing // tables to determine which underlay device to use. With this route in @@ -349,29 +771,198 @@ pub fn two_node_topology(brand: &str) -> Result { let zfs = Arc::new(Zfs::new("opte2node")?); // Create a pair of zones to simulate our VM instances. 
- println!("start zone a"); - let a = OpteZone::new("a", &zfs, &[&opte0.name], brand)?; - println!("start zone b"); - let b = OpteZone::new("b", &zfs, &[&opte1.name], brand)?; + println!("start zone {zone_a_name}"); + let a = OpteZone::new(zone_a_name, &zfs, &[&opte0.name], brand)?; + println!("start zone {zone_b_name}"); + let b = OpteZone::new(zone_b_name, &zfs, &[&opte1.name], brand)?; - println!("setup zone a"); + println!("setup zone {zone_a_name}"); a.setup(&opte0.name, opte0.ip())?; - println!("setup zone b"); + println!("setup zone {zone_b_name}"); b.setup(&opte1.name, opte1.ip())?; Ok(Topology { + nodes: vec![ + TestNode { zone: a, port: opte0 }, + TestNode { zone: b, port: opte1 }, + ], + null_ports: vec![], + v6_routes: vec![r0, r1], xde, lls: vec![ll0, ll1], vnics: vec![vn0, vn1], simnet: Some(sim), + zfs, + }) +} + +pub fn two_node_topology_dualstack(brand: &str) -> Result { + two_node_topology_dualstack_named(brand, "a", "b") +} + +pub fn two_node_topology_dualstack_named( + brand: &str, + zone_a_name: &str, + zone_b_name: &str, +) -> Result { + let sim = SimnetLink::new("xde_test_sim0", "xde_test_sim1")?; + let vn0 = Vnic::new("xde_test_vnic0", &sim.end_a)?; + let vn1 = Vnic::new("xde_test_vnic1", &sim.end_b)?; + let ll0 = LinkLocal::new(&vn0.name, "ll")?; + let ll1 = LinkLocal::new(&vn1.name, "ll")?; + + Xde::set_xde_underlay(&vn0.name, &vn1.name)?; + let xde = Xde {}; + + // Set up v2p mappings (same as IPv4-only version) + Xde::set_v2p("10.0.0.1", "a8:40:25:ff:00:01", "fd44::1")?; + Xde::set_v2p("10.0.0.2", "a8:40:25:ff:00:02", "fd77::1")?; + + // Create dual-stack OPTE ports + let opte0 = OptePort::new_dualstack( + "opte0", + "10.0.0.1", + "fd00::1", + "a8:40:25:ff:00:01", + "fd44::1", + )?; + opte0.add_router_entry("10.0.0.2")?; + opte0.fw_allow_all()?; + + println!("adding underlay route 0"); + let r0 = + RouteV6::new(opte0.underlay_ip(), 64, ll0.ip, Some(vn1.name.clone()))?; + + let opte1 = OptePort::new_dualstack( + "opte1", + "10.0.0.2", + 
"fd00::2", + "a8:40:25:ff:00:02", + "fd77::1", + )?; + opte1.add_router_entry("10.0.0.1")?; + opte1.fw_allow_all()?; + + println!("adding underlay route 1"); + let r1 = + RouteV6::new(opte1.underlay_ip(), 64, ll1.ip, Some(vn0.name.clone()))?; + + let zfs = Arc::new(Zfs::new("opte2node")?); + + println!("start zone {zone_a_name}"); + let a = OpteZone::new(zone_a_name, &zfs, &[&opte0.name], brand)?; + println!("start zone {zone_b_name}"); + let b = OpteZone::new(zone_b_name, &zfs, &[&opte1.name], brand)?; + + println!("setup zone {zone_a_name}"); + a.setup_dualstack(&opte0.name, opte0.ip(), "fd00::1".to_string())?; + + println!("setup zone {zone_b_name}"); + b.setup_dualstack(&opte1.name, opte1.ip(), "fd00::2".to_string())?; + + Ok(Topology { nodes: vec![ TestNode { zone: a, port: opte0 }, TestNode { zone: b, port: opte1 }, ], + null_ports: vec![], v6_routes: vec![r0, r1], + xde, + lls: vec![ll0, ll1], + vnics: vec![vn0, vn1], + simnet: Some(sim), zfs, + }) +} + +pub fn three_node_topology(brand: &str) -> Result { + three_node_topology_named(brand, "a", "b", "c") +} + +pub fn three_node_topology_named( + brand: &str, + zone_a_name: &str, + zone_b_name: &str, + zone_c_name: &str, +) -> Result { + // Create three-node topology for testing multicast fanout + let sim = SimnetLink::new("xde_test_sim0", "xde_test_sim1")?; + let vn0 = Vnic::new("xde_test_vnic0", &sim.end_a)?; + let vn1 = Vnic::new("xde_test_vnic1", &sim.end_b)?; + let ll0 = LinkLocal::new(&vn0.name, "ll")?; + let ll1 = LinkLocal::new(&vn1.name, "ll")?; + + Xde::set_xde_underlay(&vn0.name, &vn1.name)?; + let xde = Xde {}; + + // Set up V2P mappings for three nodes + Xde::set_v2p("10.0.0.1", "a8:40:25:ff:00:01", "fd44::1")?; + Xde::set_v2p("10.0.0.2", "a8:40:25:ff:00:02", "fd77::1")?; + Xde::set_v2p("10.0.0.3", "a8:40:25:ff:00:03", "fd88::1")?; + + // Create three OPTE ports + let opte0 = + OptePort::new("opte0", "10.0.0.1", "a8:40:25:ff:00:01", "fd44::1")?; + opte0.add_router_entry("10.0.0.2")?; + 
opte0.add_router_entry("10.0.0.3")?; + opte0.fw_allow_all()?; + + let opte1 = + OptePort::new("opte1", "10.0.0.2", "a8:40:25:ff:00:02", "fd77::1")?; + opte1.add_router_entry("10.0.0.1")?; + opte1.add_router_entry("10.0.0.3")?; + opte1.fw_allow_all()?; + + let opte2 = + OptePort::new("opte2", "10.0.0.3", "a8:40:25:ff:00:03", "fd88::1")?; + opte2.add_router_entry("10.0.0.1")?; + opte2.add_router_entry("10.0.0.2")?; + opte2.fw_allow_all()?; + + println!("adding underlay route 0"); + let r0 = + RouteV6::new(opte0.underlay_ip(), 64, ll0.ip, Some(vn1.name.clone()))?; + + println!("adding underlay route 1"); + let r1 = + RouteV6::new(opte1.underlay_ip(), 64, ll1.ip, Some(vn0.name.clone()))?; + + println!("adding underlay route 2"); + let r2 = + RouteV6::new(opte2.underlay_ip(), 64, ll1.ip, Some(vn0.name.clone()))?; + + let zfs = Arc::new(Zfs::new("opte3node")?); + + println!("start zone {zone_a_name}"); + let a = OpteZone::new(zone_a_name, &zfs, &[&opte0.name], brand)?; + println!("start zone {zone_b_name}"); + let b = OpteZone::new(zone_b_name, &zfs, &[&opte1.name], brand)?; + println!("start zone {zone_c_name}"); + let c = OpteZone::new(zone_c_name, &zfs, &[&opte2.name], brand)?; + + println!("setup zone {zone_a_name}"); + a.setup(&opte0.name, opte0.ip())?; + + println!("setup zone {zone_b_name}"); + b.setup(&opte1.name, opte1.ip())?; + + println!("setup zone {zone_c_name}"); + c.setup(&opte2.name, opte2.ip())?; + + Ok(Topology { + nodes: vec![ + TestNode { zone: a, port: opte0 }, + TestNode { zone: b, port: opte1 }, + TestNode { zone: c, port: opte2 }, + ], null_ports: vec![], + v6_routes: vec![r0, r1, r2], + xde, + lls: vec![ll0, ll1], + vnics: vec![vn0, vn1], + simnet: Some(sim), + zfs, }) } @@ -416,10 +1007,10 @@ pub fn get_linklocal_addr(link_name: &str) -> Result { let mut maybe_addr = text .lines() .nth(1) - .ok_or(anyhow::anyhow!("expected to find entry line for IP"))? + .ok_or(anyhow!("expected to find entry line for IP"))? 
.split_whitespace() .last() - .ok_or(anyhow::anyhow!("expected to find column for IP"))?; + .ok_or(anyhow!("expected to find column for IP"))?; // remove iface qualifier on link-local addr. if maybe_addr.contains('%') { @@ -454,7 +1045,7 @@ pub fn single_node_over_real_nic( // This is an absurd preallocation (~6MiB?) -- but it is deterministic, // and if we want to test A Lot of ports then we can. let forbidden_macs: HashSet<_> = - (&[my_info]).iter().chain(peers).map(|v| v.mac).collect(); + [my_info].iter().chain(peers).map(|v| v.mac).collect(); let mut usable_macs: Vec = (0..(1 << 20)) .filter_map(|n: u32| { let raw = n.to_be_bytes(); @@ -482,7 +1073,7 @@ pub fn single_node_over_real_nic( // VIP reuse is not an issue, we aren't using these ports for communication. null_ports.push(OptePort::new( &format!("opte{}", null_ports.len()), - &"172.20.0.1", + "172.20.0.1", &taken_mac, &underlay_addr, )?); @@ -522,13 +1113,13 @@ pub fn single_node_over_real_nic( a.setup(&opte.name, opte.ip())?; Ok(Topology { + nodes: vec![TestNode { zone: a, port: opte }], + null_ports, + v6_routes, xde, lls: vec![], vnics: vec![], simnet: None, - nodes: vec![TestNode { zone: a, port: opte }], - null_ports, - v6_routes, zfs, }) } diff --git a/xde-tests/tests/loopback.rs b/xde-tests/tests/loopback.rs index c64990a8..4ceb8b52 100644 --- a/xde-tests/tests/loopback.rs +++ b/xde-tests/tests/loopback.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2024 Oxide Computer Company +// Copyright 2025 Oxide Computer Company use anyhow::Result; diff --git a/xde-tests/tests/multicast_multi_sub.rs b/xde-tests/tests/multicast_multi_sub.rs new file mode 100644 index 00000000..ab3a0086 --- /dev/null +++ b/xde-tests/tests/multicast_multi_sub.rs @@ -0,0 +1,609 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. 
If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2025 Oxide Computer Company + +//! XDE multicast multiple subscriber tests. +//! +//! These validate Tx fanout and forwarding semantics across replication modes: +//! - Same-sled delivery (DELIVER action) is based purely on subscriptions and +//! independent of Replication mode set for Tx. +//! - External replication sends Geneve to the multicast underlay address for +//! delivery to the boundary switch, which then replicates to front-panel ports. +//! - Underlay replication sends Geneve to ff04::/16 multicast address for +//! sled-to-sled delivery; receiving sleds perform same-sled delivery based on +//! local subscriptions. +//! - "Both" replication instructs Tx to set bifurcated replication flags +//! (External + Underlay) in the Geneve header for switch-side handling, while +//! same-sled delivery still occurs independently based on subscriptions. + +use anyhow::Result; +use opte_ioctl::OpteHdl; +use opte_test_utils::geneve_verify; +use oxide_vpc::api::DEFAULT_MULTICAST_VNI; +use oxide_vpc::api::IpCidr; +use oxide_vpc::api::Ipv4Addr; +use oxide_vpc::api::Ipv6Addr; +use oxide_vpc::api::MulticastUnderlay; +use oxide_vpc::api::NextHopV6; +use oxide_vpc::api::Replication; +use oxide_vpc::api::Vni; +use std::time::Duration; +use xde_tests::MulticastGroup; +use xde_tests::SnoopGuard; + +#[test] +fn test_multicast_multiple_local_subscribers() -> Result<()> { + // Create 3-node topology to test local fanout + let topol = xde_tests::three_node_topology_named( + "omicron1", "mlsa", "mlsb", "mlsc", + )?; + + // IPv4 multicast group: 224.1.2.3 + let mcast_group = Ipv4Addr::from([224, 1, 2, 3]); + const MCAST_PORT: u16 = 9999; + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + + // M2P mapping - use admin-scoped IPv6 multicast per Omicron constraints + let mcast_underlay = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 3, + ])) + .unwrap(); + + // Set up multicast state with automatic cleanup on drop + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; + + // Use node B's underlay address as the switch unicast address for routing. + // + // Note: This is a single-sled test - all nodes share one underlay. + // In production, XDE would route toward this switch address to determine the + // underlay port/MAC, but the packet dst would be the multicast address. + // This test validates packet formatting, not actual multi-sled routing. + let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); + + // Set up Tx forwarding with External replication mode. + // Tx behavior: packet sent to underlay with Replication::External flag. + // In production, switch receives this flag and replicates to front-panel ports. + // Rx behavior: same-sled delivery is controlled by subscriptions, independent + // of the Replication mode. + mcast.set_forwarding(vec![( + NextHopV6::new(fake_switch_addr, vni), + Replication::External, + )])?; + + // Allow IPv4 multicast traffic via Multicast target and subscribe to the group + let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + node.port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe should succeed"); + } + + // Assert subscription table reflects all three subscribers + let hdl = OpteHdl::open()?; + let subs = hdl.dump_mcast_subs()?; + let s_entry = subs + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("missing multicast subscription entry for underlay group"); + let p0 = topol.nodes[0].port.name().to_string(); + let p1 = topol.nodes[1].port.name().to_string(); + let p2 = topol.nodes[2].port.name().to_string(); + assert!( + s_entry.ports.contains(&p0) + && s_entry.ports.contains(&p1) + && s_entry.ports.contains(&p2), + "expected {p0}, {p1}, {p2} to be 
subscribed; got {:?}", + s_entry.ports + ); + + // Start snoops on nodes B and C using SnoopGuard + let dev_name_b = topol.nodes[1].port.name().to_string(); + let dev_name_c = topol.nodes[2].port.name().to_string(); + let filter = format!("udp and ip dst {mcast_group} and port {MCAST_PORT}"); + + let mut snoop_b = SnoopGuard::start(&dev_name_b, &filter)?; + let mut snoop_c = SnoopGuard::start(&dev_name_c, &filter)?; + + // Also snoop underlay to verify unicast Geneve Tx to boundary + let underlay_dev = "xde_test_sim1"; + let mut snoop_underlay = + SnoopGuard::start(underlay_dev, "ip6 and udp port 6081")?; + + // Send multicast packet from node A + let payload = "fanout test"; + let sender_v4 = topol.nodes[0].port.ip(); + topol.nodes[0].zone.send_udp_v4( + &sender_v4, + &mcast_group.to_string(), + MCAST_PORT, + payload, + )?; + + // Wait for both snoops to capture packets + let snoop_output_b = snoop_b.wait_with_timeout(Duration::from_secs(5))?; + let snoop_output_c = snoop_c.wait_with_timeout(Duration::from_secs(5))?; + + let stdout_b = String::from_utf8_lossy(&snoop_output_b.stdout); + assert!( + snoop_output_b.status.success() && stdout_b.contains("UDP"), + "Expected to capture multicast UDP packet on node B, snoop output:\n{stdout_b}" + ); + + let stdout_c = String::from_utf8_lossy(&snoop_output_c.stdout); + assert!( + snoop_output_c.status.success() && stdout_c.contains("UDP"), + "Expected to capture multicast UDP packet on node C, snoop output:\n{stdout_c}" + ); + + // Verify underlay multicast forwarding (External mode) + // Parse the captured Geneve packet and assert: + // - VNI == DEFAULT_MULTICAST_VNI + // - Outer IPv6 dst == mcast_underlay (multicast group) + // - Replication == External + // Note: In production, the switch would see this External tag and replicate + // to front panel. This test verifies the Geneve header is correctly formed. 
+ let snoop_underlay_out = + snoop_underlay.wait_with_timeout(Duration::from_secs(5))?; + let stdout_underlay = String::from_utf8_lossy(&snoop_underlay_out.stdout); + assert!( + snoop_underlay_out.status.success() && stdout_underlay.contains("UDP"), + "Expected to capture Geneve packet on underlay for External replication, output:\n{stdout_underlay}" + ); + + let hex_str = geneve_verify::extract_snoop_hex(&stdout_underlay) + .expect("Failed to extract hex from snoop output"); + let packet_bytes = geneve_verify::parse_snoop_hex(&hex_str) + .expect("Failed to parse hex string"); + let geneve_info = geneve_verify::parse_geneve_packet(&packet_bytes) + .expect("Failed to parse Geneve packet"); + + assert_eq!( + geneve_info.vni, vni, + "Geneve VNI should be DEFAULT_MULTICAST_VNI ({})", + DEFAULT_MULTICAST_VNI + ); + assert_eq!( + geneve_info.outer_ipv6_dst, + Ipv6Addr::from(mcast_underlay), + "External replication should use multicast address (outer IPv6 dst)" + ); + assert_eq!( + geneve_info.replication, + Some(Replication::External), + "Geneve replication mode should be External" + ); + + Ok(()) +} + +#[test] +fn test_multicast_underlay_replication() -> Result<()> { + // Create 2-node topology to test Underlay replication mode + let topol = xde_tests::two_node_topology_named("omicron1", "ura", "urb")?; + + // IPv4 multicast group + let mcast_group = Ipv4Addr::from([224, 1, 2, 4]); + const MCAST_PORT: u16 = 9999; + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + + // M2P mapping - use admin-scoped IPv6 multicast per Omicron constraints + let mcast_underlay = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 4, + ])) + .unwrap(); + + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; + + let hdl = OpteHdl::open()?; + + // Use node B's underlay address as the switch unicast address for routing. 
+ let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); + + // Set up Tx forwarding with Underlay replication mode. + // Tx behavior: forward to underlay with multicast encapsulation. + // Rx behavior: same-sled delivery to subscribers (none in this test). + mcast.set_forwarding(vec![( + NextHopV6::new(fake_switch_addr, vni), + Replication::Underlay, + )])?; + + // Allow IPv4 multicast traffic via Multicast target + // + // Note: We deliberately do NOT subscribe any nodes. This tests Tx forwarding + // with zero local subscribers (Rx delivery is based on subscriptions, not + // Replication) + let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + } + + // Assert there are no local subscribers for this group + let subs = hdl.dump_mcast_subs()?; + assert!( + !subs.entries.iter().any(|e| e.underlay == mcast_underlay), + "expected no local subscribers for {mcast_underlay}, got: {:?}", + subs.entries + ); + + // Add IPv6 multicast route for admin-scoped multicast (ff04::/16) + // This tells the kernel to route multicast packets through the underlay interface + xde_tests::ensure_underlay_admin_scoped_route_v6("xde_test_vnic0")?; + + // Start snoop on the UNDERLAY simnet device (not the OPTE port) + // to verify the packet is forwarded to the underlay + let underlay_dev = "xde_test_sim1"; // Underlay device + let mut snoop_underlay = + SnoopGuard::start(underlay_dev, "ip6 and udp port 6081")?; // Geneve port + + // Also snoop node B's OPTE port to verify NO local delivery with Underlay mode + let dev_name_b = topol.nodes[1].port.name().to_string(); + let filter_local = + format!("udp and ip dst {mcast_group} and port {MCAST_PORT}"); + let mut snoop_local = SnoopGuard::start(&dev_name_b, &filter_local)?; + + // Clear UFT right before sending to ensure fresh flow computation + hdl.clear_uft(topol.nodes[0].port.name())?; + + // Send multicast packet from node A + let payload 
= "underlay test"; + let sender_v4 = topol.nodes[0].port.ip(); + topol.nodes[0].zone.send_udp_v4( + &sender_v4, + &mcast_group.to_string(), + MCAST_PORT, + payload, + )?; + + // Wait for snoop to capture the underlay packet (one send expected) + let snoop_output_underlay = + snoop_underlay.wait_with_timeout(Duration::from_secs(5))?; + + let stdout_underlay = + String::from_utf8_lossy(&snoop_output_underlay.stdout); + + assert!( + snoop_output_underlay.status.success() + && stdout_underlay.contains("UDP"), + "Expected to capture Geneve packet on underlay, snoop output:\n{stdout_underlay}" + ); + + // Verify Geneve header fields (VNI, outer IPv6 dst, replication mode) + let hex_str = geneve_verify::extract_snoop_hex(&stdout_underlay) + .expect("Failed to extract hex from snoop output"); + + let packet_bytes = geneve_verify::parse_snoop_hex(&hex_str) + .expect("Failed to parse hex string"); + + let geneve_info = geneve_verify::parse_geneve_packet(&packet_bytes) + .expect("Failed to parse Geneve packet"); + + assert_eq!( + geneve_info.vni, vni, + "Geneve VNI should be DEFAULT_MULTICAST_VNI ({})", + DEFAULT_MULTICAST_VNI + ); + assert_eq!( + geneve_info.outer_ipv6_dst, + Ipv6Addr::from(mcast_underlay), + "Outer IPv6 dst should be multicast underlay address" + ); + assert_eq!( + geneve_info.replication, + Some(Replication::Underlay), + "Geneve replication mode should be Underlay" + ); + + // Verify NO same-sled delivery (no subscribers = no delivery) + // Note: Rx delivery is independent of Replication mode - it's based on subscriptions + if let Ok(output) = snoop_local.wait_with_timeout(Duration::from_secs(2)) { + let stdout = String::from_utf8_lossy(&output.stdout); + panic!( + "Expected no same-sled delivery (zero subscribers), but captured:\n{stdout}" + ); + } + + // Leaf-only Rx assertion: start a second underlay snoop and ensure there + // is no additional multicast re-relay after Rx. We expect only the single + // Tx underlay packet captured above. 
+ let mut snoop_underlay_2 = + SnoopGuard::start(underlay_dev, "ip6 and udp port 6081")?; + if let Ok(out) = snoop_underlay_2.wait_with_timeout(Duration::from_secs(2)) + { + let stdout = String::from_utf8_lossy(&out.stdout); + panic!( + "Expected leaf-only Rx (no further underlay relay), got:\n{stdout}" + ); + } + + Ok(()) +} + +#[test] +fn test_multicast_both_replication() -> Result<()> { + // Test "Both" replication mode: validates that egress Tx (External + Underlay) + // and local same-sled delivery both occur. + let topol = + xde_tests::three_node_topology_named("omicron1", "ara", "arb", "arc")?; + + // IPv4 multicast group + let mcast_group = Ipv4Addr::from([224, 1, 2, 5]); + const MCAST_PORT: u16 = 9999; + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + + // M2P mapping - use admin-scoped IPv6 multicast per Omicron constraints + let mcast_underlay = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 5, + ])) + .unwrap(); + + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; + + // Use node B's underlay address as the switch unicast address for routing. + let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); + + // Set up Tx forwarding with "Both" replication (drives egress encapsulation only) + // Tx behavior: packet sent to underlay with Replication::Both flag set. + // In production, switch receives this and bifurcates: External (to front panel) + // + Underlay (sled-to-sled multicast). + // Rx behavior: same-sled local delivery occurs independently, driven purely by + // port subscriptions (not the replication mode). 
+ mcast.set_forwarding(vec![( + NextHopV6::new(fake_switch_addr, vni), + Replication::Both, + )])?; + + // Allow IPv4 multicast traffic via Multicast target and subscribe to the group + let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + node.port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe should succeed"); + } + + // Assert subscription table reflects all three subscribers + let hdl = OpteHdl::open()?; + let subs = hdl.dump_mcast_subs()?; + let s_entry = subs + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("missing multicast subscription entry for underlay group"); + let p0 = topol.nodes[0].port.name().to_string(); + let p1 = topol.nodes[1].port.name().to_string(); + let p2 = topol.nodes[2].port.name().to_string(); + assert!( + s_entry.ports.contains(&p0) + && s_entry.ports.contains(&p1) + && s_entry.ports.contains(&p2), + "expected {p0}, {p1}, {p2} to be subscribed; got {:?}", + s_entry.ports + ); + + // Start snoop on node B (local delivery) and underlay (underlay forwarding) + let dev_name_b = topol.nodes[1].port.name().to_string(); + let filter_local = + format!("udp and ip dst {mcast_group} and port {MCAST_PORT}"); + let mut snoop_local = SnoopGuard::start(&dev_name_b, &filter_local)?; + + let underlay_dev = "xde_test_sim1"; + let mut snoop_underlay = + SnoopGuard::start(underlay_dev, "ip6 and udp port 6081")?; + + // Send multicast packet from node A + let payload = "all replication test"; + let sender_v4 = topol.nodes[0].port.ip(); + topol.nodes[0].zone.send_udp_v4( + &sender_v4, + &mcast_group.to_string(), + MCAST_PORT, + payload, + )?; + + // Wait for both snoops to capture packets + let snoop_output_local = + snoop_local.wait_with_timeout(Duration::from_secs(5))?; + let snoop_output_underlay = + snoop_underlay.wait_with_timeout(Duration::from_secs(5))?; + + // Verify same-sled local delivery (DELIVER action based on 
subscription) + let stdout_local = String::from_utf8_lossy(&snoop_output_local.stdout); + assert!( + snoop_output_local.status.success() && stdout_local.contains("UDP"), + "Expected same-sled delivery to subscribed node B, snoop output:\n{stdout_local}" + ); + + // Verify egress underlay forwarding with "Both" replication flag + let stdout_underlay = + String::from_utf8_lossy(&snoop_output_underlay.stdout); + assert!( + snoop_output_underlay.status.success() + && stdout_underlay.contains("UDP"), + "Expected egress underlay packet with 'Both' replication, snoop output:\n{stdout_underlay}" + ); + + // Parse the Geneve packet and verify the "Both" replication flag is set + let hex_str = geneve_verify::extract_snoop_hex(&stdout_underlay) + .expect("Failed to extract hex from snoop output"); + let packet_bytes = geneve_verify::parse_snoop_hex(&hex_str) + .expect("Failed to parse hex string"); + let geneve_info = geneve_verify::parse_geneve_packet(&packet_bytes) + .expect("Failed to parse Geneve packet"); + + assert_eq!( + geneve_info.vni, vni, + "Geneve VNI should be DEFAULT_MULTICAST_VNI ({})", + DEFAULT_MULTICAST_VNI + ); + assert_eq!( + geneve_info.outer_ipv6_dst, + Ipv6Addr::from(mcast_underlay), + "Outer IPv6 dst should be multicast underlay address" + ); + assert_eq!( + geneve_info.replication, + Some(Replication::Both), + "Geneve replication mode should be Both" + ); + + Ok(()) +} + +#[test] +fn test_partial_unsubscribe() -> Result<()> { + // Test selective unsubscribe: subscribe 3 nodes, unsubscribe 1, verify + // only the remaining 2 receive packets while forwarding state is unchanged. 
+ let topol = + xde_tests::three_node_topology_named("omicron1", "pua", "pub", "puc")?; + + let mcast_group = Ipv4Addr::from([224, 1, 2, 6]); + const MCAST_PORT: u16 = 9999; + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + + let mcast_underlay = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 6, + ])) + .unwrap(); + + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; + + // Use node B's underlay address as the switch unicast address for routing. + let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); + + mcast.set_forwarding(vec![( + NextHopV6::new(fake_switch_addr, vni), + Replication::External, + )])?; + + let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + node.port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe should succeed"); + } + + let hdl = OpteHdl::open()?; + let p0 = topol.nodes[0].port.name().to_string(); + let p1 = topol.nodes[1].port.name().to_string(); + let p2 = topol.nodes[2].port.name().to_string(); + + let subs = hdl.dump_mcast_subs()?; + let s_entry = subs + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("missing multicast subscription entry"); + assert!( + s_entry.ports.contains(&p0) + && s_entry.ports.contains(&p1) + && s_entry.ports.contains(&p2), + "expected all 3 ports subscribed initially; got {:?}", + s_entry.ports + ); + + // Send packet and verify B and C receive (A is sender, won't receive its own) + let dev_name_b = topol.nodes[1].port.name().to_string(); + let dev_name_c = topol.nodes[2].port.name().to_string(); + let filter = format!("udp and ip dst {mcast_group} and port {MCAST_PORT}"); + + let mut snoop_b = SnoopGuard::start(&dev_name_b, &filter)?; + let mut snoop_c = SnoopGuard::start(&dev_name_c, &filter)?; + + let payload = "all three"; + let sender_v4 = topol.nodes[0].port.ip(); + 
topol.nodes[0].zone.send_udp_v4( + &sender_v4, + &mcast_group.to_string(), + MCAST_PORT, + payload, + )?; + + // B and C should receive (A is sender, won't see its own packet) + let snoop_b_out = snoop_b.wait_with_timeout(Duration::from_secs(5))?; + let snoop_c_out = snoop_c.wait_with_timeout(Duration::from_secs(5))?; + + assert!( + String::from_utf8_lossy(&snoop_b_out.stdout).contains("UDP"), + "Node B should receive first packet" + ); + assert!( + String::from_utf8_lossy(&snoop_c_out.stdout).contains("UDP"), + "Node C should receive first packet" + ); + + // Unsubscribe node B (middle node) + topol.nodes[1] + .port + .unsubscribe_multicast(mcast_group.into()) + .expect("unsubscribe should succeed"); + + // Verify subscription table now shows only A and C + let subs2 = hdl.dump_mcast_subs()?; + let s_entry2 = subs2 + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("subscription entry should still exist"); + assert!( + s_entry2.ports.contains(&p0) && s_entry2.ports.contains(&p2), + "expected p0 and p2 to remain subscribed; got {:?}", + s_entry2.ports + ); + assert!( + !s_entry2.ports.contains(&p1), + "expected p1 to be unsubscribed; got {:?}", + s_entry2.ports + ); + + // Verify forwarding table unchanged (forwarding is independent of local subs) + let fwd = hdl.dump_mcast_fwd()?; + let fwd_entry = fwd + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("forwarding entry should still exist"); + assert!( + fwd_entry.next_hops.iter().any(|(nexthop, rep)| { + *rep == Replication::External + && nexthop.addr == fake_switch_addr + && nexthop.vni == vni + }), + "forwarding table should be unchanged" + ); + + // Send another packet - only C should receive (A is sender, B unsubscribed) + let mut snoop_b2 = SnoopGuard::start(&dev_name_b, &filter)?; + let mut snoop_c2 = SnoopGuard::start(&dev_name_c, &filter)?; + + let payload2 = "only two"; + topol.nodes[0].zone.send_udp_v4( + &sender_v4, + &mcast_group.to_string(), + MCAST_PORT, + 
payload2, + )?; + + // C should receive + let snoop_c2_out = snoop_c2.wait_with_timeout(Duration::from_secs(5))?; + assert!( + String::from_utf8_lossy(&snoop_c2_out.stdout).contains("UDP"), + "Node C should receive second packet" + ); + + // B should NOT receive (timeout expected) + if let Ok(out) = snoop_b2.wait_with_timeout(Duration::from_millis(800)) { + let stdout = String::from_utf8_lossy(&out.stdout); + panic!("Node B should not receive after unsubscribe; got:\n{stdout}"); + } + + Ok(()) +} diff --git a/xde-tests/tests/multicast_rx.rs b/xde-tests/tests/multicast_rx.rs new file mode 100644 index 00000000..13fd59bb --- /dev/null +++ b/xde-tests/tests/multicast_rx.rs @@ -0,0 +1,395 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2025 Oxide Computer Company + +//! XDE multicast Rx-path tests. +//! +//! These validate that: +//! - Control-plane config (M2P map + forwarding) drives Tx encapsulation only. +//! - Same-sled delivery is based purely on subscriptions and is independent of +//! the Replication mode set for Tx. +//! - Underlay multicast uses admin-local IPv6 (ff04::/16) and routes via the +//! host underlay interface. +//! - Packets received from the underlay are delivered to subscribed ports and +//! include the expected protocol and payload characteristics. 
+ +use anyhow::Result; +use opte_ioctl::OpteHdl; +use oxide_vpc::api::IpCidr; +use oxide_vpc::api::Ipv4Addr; +use oxide_vpc::api::Ipv6Addr; +use oxide_vpc::api::MulticastUnderlay; +use oxide_vpc::api::NextHopV6; +use oxide_vpc::api::Replication; +use oxide_vpc::api::Vni; +use std::time::Duration; +use xde_tests::MulticastGroup; +use xde_tests::SnoopGuard; + +#[test] +fn test_xde_multicast_rx_ipv4() -> Result<()> { + // Create 2-node topology (IPv4 overlay: 10.0.0.0/24) + let topol = xde_tests::two_node_topology_named("omicron1", "rx4a", "rx4b")?; + + // IPv4 multicast group: 224.0.0.251 + let mcast_group = Ipv4Addr::from([224, 0, 0, 251]); + const MCAST_PORT: u16 = 9999; + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + // M2P mapping: overlay layer needs IPv6 multicast underlay address + // Use admin-scoped IPv6 multicast per Omicron's map_external_to_underlay_ip() + // Maps IPv4 multicast to ff04::/16 (admin-local scope) + IPv4 address + let mcast_underlay = + MulticastUnderlay::new("ff04::e000:fb".parse().unwrap()).unwrap(); + + // Set up multicast group with automatic cleanup on drop + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; + + // Use node B's underlay address as the switch unicast address for routing. + // OPTE uses this address to determine the underlay port (via DDM routing), + // but the actual packet destination is the multicast underlay address. + // Note: This is a single-sled test; all nodes share one underlay network. + let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); + + // Set up Tx forwarding with Underlay replication to test underlay Rx path. + // This causes packets to be sent to the underlay multicast address, then + // received back via the underlay Rx path for same-sled delivery. 
+ mcast.set_forwarding(vec![( + NextHopV6::new(fake_switch_addr, vni), + Replication::Underlay, + )])?; + + // Add IPv6 multicast route so underlay packets can be routed + xde_tests::ensure_underlay_admin_scoped_route_v6("xde_test_vnic0")?; + + // Allow IPv4 multicast traffic (224.0.0.0/4) via Multicast target. + let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); + + // Add router entries for multicast (allows both In and Out directions) + topol.nodes[0].port.add_multicast_router_entry(mcast_cidr)?; + topol.nodes[1].port.add_multicast_router_entry(mcast_cidr)?; + + // Subscribe both ports to the multicast group + topol.nodes[0] + .port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe port 0 should succeed"); + topol.nodes[1] + .port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe port 1 should succeed"); + + // Assert subscription state via ioctl dump before sending + let hdl = OpteHdl::open()?; + let subs = hdl.dump_mcast_subs()?; + let s_entry = subs + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("missing multicast subscription entry for underlay group"); + let p0 = topol.nodes[0].port.name().to_string(); + let p1 = topol.nodes[1].port.name().to_string(); + assert!( + s_entry.ports.contains(&p0) && s_entry.ports.contains(&p1), + "expected both {p0} and {p1} to be subscribed; got {:?}", + s_entry.ports + ); + + // Assert forwarding table contains expected next hop + replication + let mfwd = hdl.dump_mcast_fwd()?; + let entry = mfwd + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("missing multicast forwarding entry for underlay group"); + assert!( + entry.next_hops.iter().any(|(nexthop, rep)| { + *rep == Replication::Underlay + && nexthop.addr == fake_switch_addr + && nexthop.vni == vni + }), + "expected Underlay replication to {fake_switch_addr:?} in forwarding table; got: {:?}", + entry.next_hops + ); + + // Start snoop on Rx side (matches IPv6 test pattern) + let 
dev_name_b = topol.nodes[1].port.name().to_string(); + let filter = format!("udp and ip dst {mcast_group} and port {MCAST_PORT}"); + let mut snoop_rx = SnoopGuard::start(&dev_name_b, &filter)?; + + // Send UDP packet from zone A using helper (pins source for deterministic egress) + let payload = "multicast test"; + let sender_v4 = topol.nodes[0].port.ip(); + topol.nodes[0].zone.send_udp_v4( + &sender_v4, + &mcast_group.to_string(), + MCAST_PORT, + payload, + )?; + + // Wait for Rx snoop to capture the packet (or timeout) + let snoop_rx_output = snoop_rx.wait_with_timeout(Duration::from_secs(5))?; + + let stdout = String::from_utf8_lossy(&snoop_rx_output.stdout); + assert!( + snoop_rx_output.status.success() && !stdout.is_empty(), + "Expected to capture multicast packet on {dev_name_b}, snoop output:\n{stdout}" + ); + // Protocol summary present + assert!( + stdout.contains("UDP"), + "expected UDP summary in snoop output:\n{stdout}" + ); + // Verify destination address appears in snoop output + // SnoopGuard uses -r flag, so we always get numeric addresses + assert!( + stdout.contains("224.0.0.251"), + "expected destination 224.0.0.251 in snoop output:\n{stdout}" + ); + // Payload present - check for substring in ASCII representation + assert!( + stdout.contains("test"), + "expected payload substring 'test' in ASCII portion of snoop output:\n{stdout}" + ); + // L2 dest: with current XDE/gateway pipeline, multicast Rx to guests + // is delivered with broadcast dest MAC. snoop shows 16-bit grouped hex. 
+ assert!( + stdout.to_ascii_lowercase().contains("ffff ffff ffff"), + "expected L2 broadcast MAC 'ffff ffff ffff' in snoop output; got:\n{stdout}" + ); + + // Unsubscribe receiver and verify no further same-sled delivery + topol.nodes[1] + .port + .unsubscribe_multicast(mcast_group.into()) + .expect("unsubscribe should succeed"); + + // Assert subscription table reflects unsubscribe + let subs2 = hdl.dump_mcast_subs()?; + let s_entry2 = subs2 + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("missing multicast subscription entry after unsubscribe"); + assert!( + !s_entry2.ports.contains(&p1), + "expected {p1} to be unsubscribed; got {:?}", + s_entry2.ports + ); + + let mut snoop2 = SnoopGuard::start(&dev_name_b, &filter)?; + topol.nodes[0].zone.send_udp_v4( + &sender_v4, + &mcast_group.to_string(), + MCAST_PORT, + payload, + )?; + if let Ok(out) = snoop2.wait_with_timeout(Duration::from_millis(800)) { + let stdout = String::from_utf8_lossy(&out.stdout); + panic!( + "expected no same-sled delivery after unsubscribe; snoop output:\n{stdout}" + ); + } + Ok(()) +} + +#[test] +fn test_xde_multicast_rx_ipv6() -> Result<()> { + // Create 2-node topology with dual-stack (IPv4 + IPv6) + let topol = xde_tests::two_node_topology_dualstack_named( + "omicron1", "rx6a", "rx6b", + )?; + + // IPv6 multicast group: ff04::1:3 (admin-local scope) + let mcast_group: Ipv6Addr = "ff04::1:3".parse().unwrap(); + const MCAST_PORT: u16 = 9999; + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + // M2P mapping: Use same admin-local address for underlay + let mcast_underlay = + MulticastUnderlay::new("ff04::1:3".parse().unwrap()).unwrap(); + + // Set up multicast group with automatic cleanup on drop + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; + + // Use node B's underlay address as the switch unicast address for routing. 
+ // OPTE uses this address to determine the underlay port (via DDM routing), + // but the actual packet destination is the multicast underlay address. + // Note: This is a single-sled test; all nodes share one underlay network. + let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); + + // Set up Tx forwarding with Underlay replication to test underlay Rx path. + // This causes packets to be sent to the underlay multicast address, then + // received back via the underlay Rx path for same-sled delivery. + mcast.set_forwarding(vec![( + NextHopV6::new(fake_switch_addr, vni), + Replication::Underlay, + )])?; + + // Add IPv6 multicast route so underlay packets can be routed + xde_tests::ensure_underlay_admin_scoped_route_v6("xde_test_vnic0")?; + + // Allow IPv6 multicast traffic (ff04::/16 admin-local) via Multicast target + let mcast_cidr = IpCidr::Ip6("ff04::/16".parse().unwrap()); + + // Add router entries for multicast (allows both In and Out directions) + topol.nodes[0].port.add_multicast_router_entry(mcast_cidr)?; + topol.nodes[1].port.add_multicast_router_entry(mcast_cidr)?; + + // Subscribe both ports to the multicast group + topol.nodes[0] + .port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe port 0 should succeed"); + topol.nodes[1] + .port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe port 1 should succeed"); + + // Get the device names for snoop + let dev_name_b = topol.nodes[1].port.name().to_string(); + + // Start snoop using SnoopGuard to ensure cleanup + let filter = format!("udp and ip6 dst {mcast_group} and port {MCAST_PORT}"); + let mut snoop = SnoopGuard::start(&dev_name_b, &filter)?; + + // Send UDP packet to the multicast address from zone A using netcat + // nc -6 -u: IPv6 UDP mode + // -w1: timeout after 1 second + let payload = "multicast test v6"; + let sender_v6 = topol.nodes[0] + .port + .ipv6() + .expect("dualstack port must have IPv6 address"); + topol.nodes[0].zone.send_udp_v6( + &sender_v6, 
+ &mcast_group.to_string(), + MCAST_PORT, + payload, + )?; + + // Wait for snoop to capture the packet (or timeout) + let snoop_output = snoop.wait_with_timeout(Duration::from_secs(5))?; + + let stdout = String::from_utf8_lossy(&snoop_output.stdout); + assert!( + snoop_output.status.success() && !stdout.is_empty(), + "Expected to capture IPv6 multicast packet on {dev_name_b}, snoop output:\n{stdout}" + ); + + Ok(()) +} + +#[test] +fn test_reject_link_local_underlay_ff02() -> Result<()> { + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); + + let link_local_underlay: Ipv6Addr = "ff02::e001:263".parse().unwrap(); + let underlay = MulticastUnderlay::new_unchecked(link_local_underlay); + let result = hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { + group: mcast_group.into(), + underlay, + }); + assert!( + result.is_err(), + "Expected link-local underlay (ff02::) to be rejected" + ); + + Ok(()) +} + +#[test] +fn test_reject_global_underlay_ff0e() -> Result<()> { + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); + + let global_underlay: Ipv6Addr = "ff0e::e001:263".parse().unwrap(); + let underlay = MulticastUnderlay::new_unchecked(global_underlay); + let result = hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { + group: mcast_group.into(), + underlay, + }); + assert!( + result.is_err(), + "Expected global underlay (ff0e::) to be rejected" + ); + + Ok(()) +} + +#[test] +fn test_accept_admin_local_underlay_ff04() -> Result<()> { + let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); + let admin_local = + MulticastUnderlay::new("ff04::e001:263".parse().unwrap()).unwrap(); + + // MulticastGroup::new calls set_m2p internally and cleans up on drop. + // This test verifies that admin-local (ff04::/16) addresses are accepted, + // in contrast to link-local (ff02::) and global (ff0e::) which are rejected. 
+ let result = MulticastGroup::new(mcast_group.into(), admin_local); + assert!( + result.is_ok(), + "Expected admin-local (ff04::) underlay to be accepted" + ); + + Ok(()) +} + +#[test] +fn test_multicast_config_no_spurious_traffic() -> Result<()> { + // Test that multicast configuration (subscriptions + forwarding entries) + // doesn't spontaneously generate traffic on the underlay when no packets + // are actually being sent. + + let topol = xde_tests::two_node_topology_named("omicron1", "lpa", "lpb")?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 200]); + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + let mcast_underlay = + MulticastUnderlay::new("ff04::e001:2c8".parse().unwrap()).unwrap(); + + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; + + // Use node B's underlay address as the switch unicast address for routing. + let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); + + // Set up forwarding with Underlay replication + mcast.set_forwarding(vec![( + NextHopV6::new(fake_switch_addr, vni), + Replication::Underlay, + )])?; + + let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + node.port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe should succeed"); + } + + // Snoop the underlay to verify NO spurious traffic without sending + let underlay_dev = "xde_test_sim1"; + let mut snoop_underlay = + SnoopGuard::start(underlay_dev, "ip6 and udp port 6081")?; + + // Verify NO spurious underlay traffic (we're not sending any packets) + let snoop_result = snoop_underlay.wait_with_timeout(Duration::from_secs(2)); + + match snoop_result { + Ok(output) => { + let stdout = String::from_utf8_lossy(&output.stdout); + assert!( + stdout.is_empty(), + "No multicast traffic should appear on underlay without a sender:\n{stdout}" + ); + } + Err(_) => { + // Timeout is expected - no packets should appear + } + } + + Ok(()) 
+} diff --git a/xde-tests/tests/multicast_validation.rs b/xde-tests/tests/multicast_validation.rs new file mode 100644 index 00000000..68393059 --- /dev/null +++ b/xde-tests/tests/multicast_validation.rs @@ -0,0 +1,606 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2025 Oxide Computer Company + +//! Validation tests covering multicast operations. +//! +//! These cover control‑plane validation and idempotence: +//! - Subscribing requires an M2P map unless the group is already a ff04::/16 +//! underlay address. +//! - Subscribing with non‑multicast addresses is rejected. +//! - Double subscribe is idempotent and does not duplicate delivery. +//! - Unsubscribe is idempotent and safe when not previously subscribed. + +use anyhow::Result; +use opte_ioctl::OpteHdl; +use oxide_vpc::api::ClearMcast2PhysReq; +use oxide_vpc::api::IpCidr; +use oxide_vpc::api::Ipv4Addr; +use oxide_vpc::api::Ipv6Addr; +use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastUnsubscribeAllReq; +use oxide_vpc::api::McastUnsubscribeReq; +use oxide_vpc::api::MulticastUnderlay; +use oxide_vpc::api::NextHopV6; +use oxide_vpc::api::Replication; +use oxide_vpc::api::Vni; +use std::time::Duration; +use xde_tests::MulticastGroup; +use xde_tests::SnoopGuard; + +#[test] +fn test_subscribe_without_m2p_mapping() -> Result<()> { + let topol = + xde_tests::two_node_topology_named("omicron1", "nm2pa", "nm2pb")?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); + + let res = topol.nodes[0].port.subscribe_multicast(mcast_group.into()); + + assert!( + res.is_err(), + "Expected error when subscribing without M2P mapping, got Ok" + ); + + Ok(()) +} + +#[test] +fn test_subscribe_ff04_direct_without_m2p() -> Result<()> { + let topol = + xde_tests::two_node_topology_named("omicron1", "ff04a", "ff04b")?; + + // IPv6 admin-scoped 
multicast (ff04::/16) - already an underlay address + let underlay_mcast = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 99, + ])) + .unwrap(); + + let res = topol.nodes[0] + .port + .subscribe_multicast(Ipv6Addr::from(underlay_mcast).into()); + + assert!( + res.is_ok(), + "Expected ff04::/16 subscription to succeed without M2P, got error: {res:?}" + ); + + // Assert subscription present + let hdl = OpteHdl::open()?; + let subs = hdl.dump_mcast_subs()?; + let entry = subs + .entries + .iter() + .find(|e| e.underlay == underlay_mcast) + .expect("missing multicast subscription entry for ff04 group"); + let p0 = topol.nodes[0].port.name().to_string(); + assert!( + entry.ports.contains(&p0), + "expected {p0} to be subscribed; got {:?}", + entry.ports + ); + + Ok(()) +} + +#[test] +fn test_subscribe_nonexistent_port() -> Result<()> { + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 100]); + + let res = hdl.mcast_subscribe(&McastSubscribeReq { + port_name: "this_port_does_not_exist_anywhere".to_string(), + group: mcast_group.into(), + }); + + assert!( + res.is_err(), + "Expected error when subscribing non-existent port, got Ok" + ); + + Ok(()) +} + +#[test] +fn test_subscribe_unicast_ip_as_group() -> Result<()> { + let topol = xde_tests::two_node_topology_named("omicron1", "unia", "unib")?; + let hdl = OpteHdl::open()?; + + let unicast_ip = Ipv4Addr::from([10, 0, 0, 1]); + let res = hdl.mcast_subscribe(&McastSubscribeReq { + port_name: topol.nodes[0].port.name().to_string(), + group: unicast_ip.into(), + }); + + let err = res.expect_err("Expected error when subscribing to unicast IP"); + assert!( + format!("{err:?}").contains("not a multicast address"), + "Expected 'not a multicast address' error, got: {err:?}", + ); + + Ok(()) +} + +#[test] +fn test_double_subscribe() -> Result<()> { + let topol = xde_tests::two_node_topology_named("omicron1", "dsa", "dsb")?; + 
let mcast_group = Ipv4Addr::from([224, 1, 2, 101]); + const MCAST_PORT: u16 = 9999; // Avoid mDNS port 5353 + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + let underlay = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 101, + ])) + .unwrap(); + + let mcast = MulticastGroup::new(mcast_group.into(), underlay)?; + + // Use node B's underlay address as the switch unicast address for routing. + let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); + + mcast.set_forwarding(vec![( + NextHopV6::new(fake_switch_addr, vni), + Replication::External, + )])?; + + let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + } + + topol.nodes[1] + .port + .subscribe_multicast(mcast_group.into()) + .expect("first subscribe should succeed"); + + let res = topol.nodes[1].port.subscribe_multicast(mcast_group.into()); + + assert!( + res.is_ok(), + "Double subscribe should be idempotent, got error: {res:?}" + ); + + let subs = OpteHdl::open()?.dump_mcast_subs()?; + let entry = subs + .entries + .iter() + .find(|e| e.underlay == underlay) + .expect("missing multicast subscription entry for group"); + let p1 = topol.nodes[1].port.name().to_string(); + assert!( + entry.ports.contains(&p1), + "expected {p1} to be subscribed; got {:?}", + entry.ports + ); + + let filter = format!("udp and ip dst {mcast_group} and port {MCAST_PORT}"); + let mut snoop = SnoopGuard::start(topol.nodes[1].port.name(), &filter)?; + + let sender_v4 = topol.nodes[0].port.ip(); + topol.nodes[0].zone.send_udp_v4( + &sender_v4, + &mcast_group.to_string(), + MCAST_PORT, + "test", + )?; + + let output = snoop.wait_with_timeout(Duration::from_secs(5))?; + + let stdout = String::from_utf8_lossy(&output.stdout); + + assert!( + output.status.success() && stdout.contains("UDP"), + "Should receive multicast after double 
subscribe:\n{stdout}" + ); + + let count = stdout.matches("UDP").count(); + assert!( + count == 1, + "Packet should be delivered once, not duplicated. Found {count} deliveries" + ); + + Ok(()) +} + +#[test] +fn test_unsubscribe_never_subscribed() -> Result<()> { + let topol = xde_tests::two_node_topology_named("omicron1", "usa", "usb")?; + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 102]); + + let res = hdl.mcast_unsubscribe(&McastUnsubscribeReq { + port_name: topol.nodes[0].port.name().to_string(), + group: mcast_group.into(), + }); + + assert!(res.is_ok(), "Unsubscribe should be a no-op (Ok), got: {res:?}"); + + Ok(()) +} + +#[test] +fn test_subscribe_then_clear_m2p() -> Result<()> { + let topol = xde_tests::two_node_topology_named("omicron1", "sca", "scb")?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 103]); + const MCAST_PORT: u16 = 9999; // Avoid mDNS port 5353 + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + let underlay = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 103, + ])) + .unwrap(); + + let mcast = MulticastGroup::new(mcast_group.into(), underlay)?; + + // Use node B's underlay address as the switch unicast address for routing. 
+ let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); + + mcast.set_forwarding(vec![( + NextHopV6::new(fake_switch_addr, vni), + Replication::External, + )])?; + + let mcast_cidr = IpCidr::Ip4("224.0.0.0/4".parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + } + + topol.nodes[1] + .port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe should succeed"); + + let hdl = OpteHdl::open()?; + hdl.clear_m2p(&ClearMcast2PhysReq { group: mcast_group.into(), underlay }) + .expect("clear_m2p should succeed"); + + let dev_name_b = topol.nodes[1].port.name().to_string(); + let filter_local = + format!("udp and ip dst {mcast_group} and port {MCAST_PORT}"); + let mut snoop_local = SnoopGuard::start(&dev_name_b, &filter_local)?; + + let underlay_dev = "xde_test_sim1"; + let mut snoop_underlay = + SnoopGuard::start(underlay_dev, "ip6 and udp port 6081")?; + + let sender_v4 = topol.nodes[0].port.ip(); + let res = topol.nodes[0].zone.send_udp_v4( + &sender_v4, + &mcast_group.to_string(), + MCAST_PORT, + "test", + ); + + assert!(res.is_ok(), "Send after M2P clear should succeed: {res:?}"); + + if let Ok(out) = snoop_local.wait_with_timeout(Duration::from_secs(2)) { + let stdout = String::from_utf8_lossy(&out.stdout); + panic!("No local delivery expected; got:\n{stdout}"); + } + + if let Ok(out) = snoop_underlay.wait_with_timeout(Duration::from_secs(2)) { + let stdout = String::from_utf8_lossy(&out.stdout); + panic!( + "No underlay forwarding expected after M2P clear; got:\n{stdout}" + ); + } + + Ok(()) +} + +#[test] +fn test_set_mcast_fwd_rejects_non_default_vni() -> Result<()> { + let topol = xde_tests::two_node_topology_named("omicron1", "vnix", "vniy")?; + let hdl = OpteHdl::open()?; + + let mcast_group = Ipv4Addr::from([224, 1, 2, 200]); + let underlay = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 200, + ])) + .unwrap(); + 
+ let _mcast = MulticastGroup::new(mcast_group.into(), underlay)?; + + // Use a non-default VNI and multicast next hop address checks separately + let bad_vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI + 1)?; + let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); + + let res = hdl.set_mcast_fwd(&oxide_vpc::api::SetMcastForwardingReq { + underlay, + next_hops: vec![( + NextHopV6::new(fake_switch_addr, bad_vni), + Replication::External, + )], + }); + + assert!(res.is_err(), "set_mcast_fwd should reject non-default VNI"); + Ok(()) +} + +#[test] +fn test_set_mcast_fwd_rejects_multicast_next_hop() -> Result<()> { + let _topol = + xde_tests::two_node_topology_named("omicron1", "mnhx", "mnhy")?; + let hdl = OpteHdl::open()?; + + let mcast_group = Ipv4Addr::from([224, 1, 2, 201]); + let underlay = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 201, + ])) + .unwrap(); + + let _mcast = MulticastGroup::new(mcast_group.into(), underlay)?; + + // Use a multicast address for next hop (invalid) + let bad_next_hop: Ipv6Addr = "ff04::1".parse().unwrap(); + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + let res = hdl.set_mcast_fwd(&oxide_vpc::api::SetMcastForwardingReq { + underlay, + next_hops: vec![( + NextHopV6::new(bad_next_hop, vni), + Replication::External, + )], + }); + + assert!(res.is_err(), "set_mcast_fwd should reject multicast next hop"); + Ok(()) +} + +#[test] +fn test_unsubscribe_ipv6_non_underlay_scopes() -> Result<()> { + let topol = xde_tests::two_node_topology_dualstack_named( + "omicron1", "unsv6a", "unsv6b", + )?; + let hdl = OpteHdl::open()?; + + // ff02::/16 (link-local) and ff0e::/16 (global) are rejected by set_m2p, + // so no M2P mapping can exist for these scopes. Unsubscribe should be + // idempotent and return Ok. 
+ + let link_local: Ipv6Addr = "ff02::1:3".parse().unwrap(); + let global: Ipv6Addr = "ff0e::1:3".parse().unwrap(); + + let res_ff02 = hdl.mcast_unsubscribe(&McastUnsubscribeReq { + port_name: topol.nodes[0].port.name().to_string(), + group: link_local.into(), + }); + + assert!( + res_ff02.is_ok(), + "Unsubscribe ff02:: should be idempotent (Ok), got: {res_ff02:?}" + ); + + let res_ff0e = hdl.mcast_unsubscribe(&McastUnsubscribeReq { + port_name: topol.nodes[0].port.name().to_string(), + group: global.into(), + }); + + assert!( + res_ff0e.is_ok(), + "Unsubscribe ff0e:: should be idempotent (Ok), got: {res_ff0e:?}" + ); + + Ok(()) +} + +#[test] +fn test_multiple_nexthops_accumulate() -> Result<()> { + // Test that set_forwarding accumulates next hops like `swadm route add`: + // - Same underlay + different next hop → add + // - Same underlay + same next hop → replace replication mode + let topol = xde_tests::two_node_topology_named("omicron1", "mnha", "mnhb")?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 104]); + let vni = Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI)?; + + let underlay = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 104, + ])) + .unwrap(); + + let mcast = MulticastGroup::new(mcast_group.into(), underlay)?; + + let switch_a = topol.nodes[0].port.underlay_ip().into(); + let switch_b = topol.nodes[1].port.underlay_ip().into(); + + mcast.set_forwarding(vec![( + NextHopV6::new(switch_a, vni), + Replication::External, + )])?; + + let hdl = OpteHdl::open()?; + let fwd = hdl.dump_mcast_fwd()?; + let entry = fwd + .entries + .iter() + .find(|e| e.underlay == underlay) + .expect("missing forwarding entry"); + assert_eq!(entry.next_hops.len(), 1, "Expected 1 next hop after first set"); + assert_eq!(entry.next_hops[0].0.addr, switch_a); + assert_eq!(entry.next_hops[0].1, Replication::External); + + mcast.set_forwarding(vec![( + NextHopV6::new(switch_b, vni), + 
Replication::Underlay, + )])?; + + let fwd = hdl.dump_mcast_fwd()?; + let entry = fwd + .entries + .iter() + .find(|e| e.underlay == underlay) + .expect("missing forwarding entry"); + assert_eq!( + entry.next_hops.len(), + 2, + "Expected 2 next hops after second set" + ); + + let nexthop_a = entry + .next_hops + .iter() + .find(|(nexthop, _)| nexthop.addr == switch_a) + .expect("switch_a not found"); + let nexthop_b = entry + .next_hops + .iter() + .find(|(nexthop, _)| nexthop.addr == switch_b) + .expect("switch_b not found"); + + assert_eq!( + nexthop_a.1, + Replication::External, + "switch_a should have External" + ); + assert_eq!( + nexthop_b.1, + Replication::Underlay, + "switch_b should have Underlay" + ); + + mcast.set_forwarding(vec![( + NextHopV6::new(switch_a, vni), + Replication::Both, + )])?; + + let fwd = hdl.dump_mcast_fwd()?; + let entry = fwd + .entries + .iter() + .find(|e| e.underlay == underlay) + .expect("missing forwarding entry"); + assert_eq!( + entry.next_hops.len(), + 2, + "Expected 2 next hops after updating switch_a" + ); + + let nexthop_a = entry + .next_hops + .iter() + .find(|(nexthop, _)| nexthop.addr == switch_a) + .expect("switch_a not found"); + let nexthop_b = entry + .next_hops + .iter() + .find(|(nexthop, _)| nexthop.addr == switch_b) + .expect("switch_b not found"); + + assert_eq!( + nexthop_a.1, + Replication::Both, + "switch_a should now have Both (updated)" + ); + assert_eq!( + nexthop_b.1, + Replication::Underlay, + "switch_b should still have Underlay" + ); + + Ok(()) +} + +#[test] +fn test_unsubscribe_all() -> Result<()> { + let topol = + xde_tests::two_node_topology_named("omicron1", "ualla", "uallb")?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 105]); + + let underlay = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 105, + ])) + .unwrap(); + + let _mcast = MulticastGroup::new(mcast_group.into(), underlay)?; + + // Subscribe both ports + 
topol.nodes[0] + .port + .subscribe_multicast(mcast_group.into()) + .expect("port 0 subscribe should succeed"); + + topol.nodes[1] + .port + .subscribe_multicast(mcast_group.into()) + .expect("port 1 subscribe should succeed"); + + // Verify both ports are subscribed + let hdl = OpteHdl::open()?; + let subs = hdl.dump_mcast_subs()?; + let entry = subs + .entries + .iter() + .find(|e| e.underlay == underlay) + .expect("missing multicast subscription entry for group"); + + let p0 = topol.nodes[0].port.name().to_string(); + let p1 = topol.nodes[1].port.name().to_string(); + assert_eq!( + entry.ports.len(), + 2, + "Expected 2 ports subscribed before unsubscribe_all" + ); + assert!( + entry.ports.contains(&p0), + "expected {p0} to be subscribed; got {:?}", + entry.ports + ); + assert!( + entry.ports.contains(&p1), + "expected {p1} to be subscribed; got {:?}", + entry.ports + ); + + // Unsubscribe all ports from the group + let res = hdl.mcast_unsubscribe_all(&McastUnsubscribeAllReq { + group: mcast_group.into(), + }); + assert!(res.is_ok(), "mcast_unsubscribe_all should succeed, got: {res:?}"); + + // Verify no ports are subscribed + let subs = hdl.dump_mcast_subs()?; + let entry = subs.entries.iter().find(|e| e.underlay == underlay); + assert!( + entry.is_none(), + "Expected no subscription entry after unsubscribe_all, found: {entry:?}" + ); + + // Verify idempotence: calling again should succeed + let res = hdl.mcast_unsubscribe_all(&McastUnsubscribeAllReq { + group: mcast_group.into(), + }); + assert!( + res.is_ok(), + "mcast_unsubscribe_all should be idempotent, got: {res:?}" + ); + + Ok(()) +} + +#[test] +fn test_unsubscribe_all_without_m2p() -> Result<()> { + let _topol = + xde_tests::two_node_topology_named("omicron1", "uanm2pa", "uanm2pb")?; + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 106]); + + // Without M2P mapping, unsubscribe_all should be idempotent and succeed + let res = hdl.mcast_unsubscribe_all(&McastUnsubscribeAllReq { 
+ group: mcast_group.into(), + }); + + assert!( + res.is_ok(), + "mcast_unsubscribe_all without M2P should succeed (idempotent), got: {res:?}" + ); + + Ok(()) +} diff --git a/xde/src/dev_map.rs b/xde/src/dev_map.rs index 599f1767..01d3727f 100644 --- a/xde/src/dev_map.rs +++ b/xde/src/dev_map.rs @@ -7,9 +7,14 @@ use crate::postbox::Postbox; use crate::xde::XdeDev; use alloc::collections::btree_map::BTreeMap; +use alloc::collections::btree_map::Entry; +use alloc::collections::btree_set::BTreeSet; use alloc::string::String; use alloc::sync::Arc; +use alloc::vec::Vec; use opte::api::MacAddr; +use opte::api::MulticastUnderlay; +use opte::api::OpteError; use opte::api::Vni; use opte::ddi::sync::KRwLock; use opte::ddi::sync::KRwLockReadGuard; @@ -27,8 +32,19 @@ impl VniMac { pub fn new(vni: Vni, mac: MacAddr) -> Self { VniMac(vni.as_u32(), mac_to_u64(mac)) } + + #[inline] + pub fn vni(&self) -> Vni { + Vni::new(self.0).expect("VniMac contains valid VNI") + } } +/// Shared ownership of an XDE port. +/// +/// `Arc` provides shared ownership within a `DevMap`. Safety during +/// concurrent operations comes from callers holding read locks on the `DevMap` +/// for the duration of packet processing, which prevents port removal from +/// completing while any handler is active. type Dev = Arc; /// `BTreeMap`-accelerated lookup of XDE ports. @@ -37,10 +53,23 @@ type Dev = Arc; /// pair. The former is used mostly by the control plane, and the latter by the /// data plane -- thus, querying by address provides a direct lookup. Any other /// lookups (e.g., multicast listeners) should return `FastKey`s or `&[FastKey]`s. +/// +/// Multicast subscriptions in `mcast_groups` are port-local and sled-local: +/// ports subscribe to underlay IPv6 multicast groups (ff04::/16) to receive +/// packets for overlay multicast groups. Subscriptions are independent of the +/// forwarding table and are automatically cleaned up when ports are removed. 
#[derive(Clone)] pub struct DevMap { devs: BTreeMap, names: BTreeMap, + /// Subscriptions keyed by underlay IPv6 multicast group (admin-scoped ff04::/16). + /// This table is sled-local and independent of any per-VPC VNI. VNI validation + /// and VPC isolation are enforced during inbound overlay decapsulation on the + /// destination port, not here. + /// + /// Rationale: multicast groups are fleet-wide; ports opt-in to receive a given + /// underlay group, and the overlay layer subsequently filters by VNI as appropriate. + mcast_groups: BTreeMap>, } impl Default for DevMap { @@ -51,7 +80,11 @@ impl Default for DevMap { impl DevMap { pub const fn new() -> Self { - Self { devs: BTreeMap::new(), names: BTreeMap::new() } + Self { + devs: BTreeMap::new(), + names: BTreeMap::new(), + mcast_groups: BTreeMap::new(), + } } /// Insert an `XdeDev`. @@ -64,11 +97,88 @@ impl DevMap { } /// Remove an `XdeDev` using its name. + /// + /// This also cleans up all multicast subscriptions for the removed port. pub fn remove(&mut self, name: &str) -> Option { let key = get_key(&self.names.remove(name)?); + + self.mcast_groups.retain(|_group, subscribers| { + subscribers.remove(&key); + !subscribers.is_empty() + }); + self.devs.remove(&key) } + /// Allow a port to receive on a given multicast group. + /// + /// This takes the underlay IPv6 multicast group address (ff04::/16). + /// Callers at the ioctl boundary may pass an overlay group; the handler + /// translates overlay→underlay via the M2P table before calling here. + pub fn mcast_subscribe( + &mut self, + name: &str, + mcast_underlay: MulticastUnderlay, + ) -> Result<(), OpteError> { + let port = self + .names + .get(name) + .ok_or_else(|| OpteError::PortNotFound(name.into()))?; + let key = get_key(port); + + self.mcast_groups.entry(mcast_underlay).or_default().insert(key); + + Ok(()) + } + + /// Rescind a port's ability to receive on a given multicast group. + /// + /// Fails with `OpteError::PortNotFound` if `name` is not a known port; + /// unsubscribing a port which is not subscribed is a no-op. A group whose + /// last subscriber leaves is dropped from the table (as in `remove()`), + /// so `has_mcast_subscribers()` never counts empty groups.
+ pub fn mcast_unsubscribe( + &mut self, + name: &str, + mcast_underlay: MulticastUnderlay, + ) -> Result<(), OpteError> { + let port = self + .names + .get(name) + .ok_or_else(|| OpteError::PortNotFound(name.into()))?; + let key = get_key(port); + + if let Entry::Occupied(mut subscribers) = + self.mcast_groups.entry(mcast_underlay) + { + subscribers.get_mut().remove(&key); + if subscribers.get().is_empty() { + subscribers.remove(); + } + } + + Ok(()) + } + + /// Unsubscribe all ports from a given underlay multicast group. + pub fn mcast_unsubscribe_all(&mut self, mcast_underlay: MulticastUnderlay) { + self.mcast_groups.remove(&mcast_underlay); + } + + /// Find the keys for all ports who want to receive a given multicast packet. + pub fn mcast_listeners( + &self, + mcast_underlay: &MulticastUnderlay, + ) -> Option> { + self.mcast_groups.get(mcast_underlay).map(|v| v.iter()) + } + + /// Returns true if any multicast subscribers exist on this sled. + #[inline] + pub fn has_mcast_subscribers(&self) -> bool { + !self.mcast_groups.is_empty() + } + /// Return a reference to an `XdeDev` using its address. #[inline] #[must_use] @@ -102,6 +212,12 @@ impl DevMap { /// them to a matching XDE port. /// /// Any chains without a matching port are dropped. + /// + /// Safety: Callers must hold a read lock on this `DevMap` for the duration + /// of delivery. This prevents port removal from tearing down DLS/MAC + /// resources while delivery is in progress—management operations attempting + /// to remove a port will block when trying to acquire the write lock to + /// update the map. #[inline] pub fn deliver_all(&self, postbox: Postbox) { for (k, v) in postbox.drain() { @@ -110,6 +226,22 @@ impl DevMap { } } } + + /// Dump all multicast subscriptions as a vector of (group, ports) pairs.
+ pub fn dump_mcast_subscriptions( + &self, + ) -> Vec<(MulticastUnderlay, Vec)> { + let mut out = Vec::new(); + for (group, subs) in self.mcast_groups.iter() { + let ports: Vec = subs + .iter() + .filter_map(|vm| self.devs.get(vm)) + .map(|d| d.devname.clone()) + .collect(); + out.push((*group, ports)); + } + out + } } #[inline(always)] diff --git a/xde/src/postbox.rs b/xde/src/postbox.rs index fa011d89..ec142a92 100644 --- a/xde/src/postbox.rs +++ b/xde/src/postbox.rs @@ -62,6 +62,12 @@ impl Postbox { pub fn drain(self) -> impl Iterator { self.boxes.into_iter() } + + /// Returns true if there are no queued deliveries. + #[inline] + pub fn is_empty(&self) -> bool { + matches!(self.boxes, Boxes::None) + } } // SAFETY: The only `!Send`/`!Sync` element in here is the `NonNull<...>`. diff --git a/xde/src/stats.rs b/xde/src/stats.rs index 53a57076..ffed3f32 100644 --- a/xde/src/stats.rs +++ b/xde/src/stats.rs @@ -55,9 +55,80 @@ pub struct XdeStats { out_drop_misc: KStatU64, // NOTE: tun_opt is not relevant to outbound packets -- no encapsulation // is in use. + /// The number of multicast packets delivered to local guest instances + /// on this sled (cloned packets to same-sled OPTE ports via guest_loopback). + mcast_tx_local: KStatU64, + /// The number of multicast packets forwarded to underlay multicast group + /// (encapsulated Geneve packets to other sleds). + mcast_tx_underlay: KStatU64, + /// The number of multicast packets forwarded for external replication + /// (unicast to boundary service for front panel egress). + mcast_tx_external: KStatU64, + /// The number of times a stale multicast listener was encountered + /// during local same-sled delivery (Tx path). + mcast_tx_stale_local: KStatU64, + /// The number of multicast packets sent with no forwarding entry + /// in the mcast_fwd table (Tx path). 
+ mcast_tx_no_fwd_entry: KStatU64, + + /// The number of multicast packets received and delivered to local guest + /// instances on this sled (decapsulated packets to same-sled OPTE ports). + mcast_rx_local: KStatU64, + /// The number of times a stale multicast listener was encountered + /// during local same-sled delivery (Rx path). + mcast_rx_stale_local: KStatU64, + /// The number of multicast packets received with no local subscribers + /// (no matching same-sled listeners for the multicast group). + mcast_rx_no_subscribers: KStatU64, + /// The number of times a pullup operation failed during multicast Tx + /// (packet replication), causing a packet to be dropped. + mcast_tx_pullup_fail: KStatU64, + /// The number of times a pullup operation failed during multicast Rx + /// (packet delivery/relay), causing a packet to be dropped. + mcast_rx_pullup_fail: KStatU64, } impl XdeStats { + pub fn mcast_tx_local(&self) -> &KStatU64 { + &self.mcast_tx_local + } + + pub fn mcast_tx_underlay(&self) -> &KStatU64 { + &self.mcast_tx_underlay + } + + pub fn mcast_tx_external(&self) -> &KStatU64 { + &self.mcast_tx_external + } + + pub fn mcast_tx_stale_local(&self) -> &KStatU64 { + &self.mcast_tx_stale_local + } + + pub fn mcast_tx_no_fwd_entry(&self) -> &KStatU64 { + &self.mcast_tx_no_fwd_entry + } + + pub fn mcast_rx_local(&self) -> &KStatU64 { + &self.mcast_rx_local + } + + pub fn mcast_rx_stale_local(&self) -> &KStatU64 { + &self.mcast_rx_stale_local + } + + pub fn mcast_rx_no_subscribers(&self) -> &KStatU64 { + &self.mcast_rx_no_subscribers + } + + pub fn mcast_tx_pullup_fail(&self) -> &KStatU64 { + &self.mcast_tx_pullup_fail + } + + pub fn mcast_rx_pullup_fail(&self) -> &KStatU64 { + &self.mcast_rx_pullup_fail + } + pub fn parse_error(&self, dir: Direction, err: &ParseError) { use Direction::*; (match (dir, err) { diff --git a/xde/src/xde.rs b/xde/src/xde.rs index b753484a..26ccf5b2 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -56,66 +56,85 @@ //! 
`TokenLock` to control write access. //! //! Once we have a port, things become fairly simple. Today, each port has a -//! central RWLock -- reads/writes are only held for the duration of packet +//! central RWLock, as reads/writes are only held for the duration of packet //! processing, or as long as is required to insert new rules. //! -//! ### `DevMap` views +//! ### [`DevMap`] views //! Ideally, we want the above interactions to have minimal impact on one another //! (e.g., insertion of a port should not lock out all use of the datapath). //! For this reason, we provide the datapath entrypoints with read-only shared -//! copies of the central `DevMap`. +//! copies of the central [`DevMap`]. //! * For Rx entrypoints, we allocate a `Vec>>`. Each CPU -//! on the system has its own slot within this `Vec`, such that there should -//! never be lock contention unless a port is being added/removed. The CPU ID -//! is then used as an index into this table, and the lock is held until all -//! packets are delivered (as all packet deliveries require a live `XdeDev`). -//! * For Tx entrypoints, each `XdeDev` holds an RWLock around its copy of the -//! `DevMap`. When needed for delivery, the Rx pathway acquires the read lock. -//! We prefer an RwLock here over a Mutex[] given that we can be called from -//! multiple threads, and our callers are not expected to bound to a given CPU. -//! Most packet deliveries should go via the underlay. +//! on the system has its own slot within this `Vec`, such that lock +//! contention only occurs when a port is being added/removed. The CPU ID is +//! used as an index into this table, the lock is acquired, and held for the +//! duration of packet processing (including delivery via +//! [`deliver_all()`](DevMap::deliver_all)), as all packet deliveries require +//! a live `XdeDev`. This prevents port removal from completing while any Rx +//! handler is active. +//! * For Tx entrypoints, each `XdeDev` holds a per-port `KRwLock>`. +//! 
- Unicast to remote host: No `DevMap` needed, packets go directly to +//! underlay. +//! - Hairpin (same-host unicast): Hold per-port `DevMap` read lock for +//! local delivery. +//! - Multicast: Hold per-port `mcast_fwd` and `DevMap` read locks for the +//! duration of Tx processing (replication + local delivery). +//! We prefer an RwLock here over a Mutex given that we can be called from +//! multiple threads, and our callers are not expected to be bound to a +//! given CPU. //! -//! Holding the lock in both cases (rather than cloning out the `Arc`) has an -//! inherent risk associated, but this is necessary to ensure that no Rx/Tx -//! contexts will attempt to send a packet to a port which has been (or is being!) -//! removed. Holding a read/lock on the `DevMap` in use ensures that any found -//! port remains alive until any in-progress packet processing is complete. +//! Read locks are held for the duration of packet processing to prevent +//! use-after-free. Management operations attempting to remove a port will block +//! when acquiring the write lock to update the map, ensuring no Rx/Tx context +//! can hold references to a port while its DLS/MAC datapath is being torn down. +//! The lock hold time is bounded to packet processing duration. //! //! In the Rx case, loopback delivery or MAC->CPU oversubscription present some //! risk of contention. These are not expected paths in the product, but using //! them does not impact correctness. //! -//! The remaining locking risks are double-locking a given Rx Mutex by the same -//! thread, and re-entrant reads on a Tx RwLock without readers-starve-writers -//! configured. The first such case results in a panic, but can only happen if -//! we transit the NIC's Rx path twice in the same stack (i.e. Rx on NIC -> -//! mac_rx on the OPTE port -> ... -> loopback delivery to underlay device). -//! This should be impossible, given that any packet sent upstack by XDE must -//! have a MAC address belonging to the OPTE port.
+//! The remaining locking risk is double-locking a given Rx Mutex by the same +//! thread during packet processing. This results in a panic, but can only +//! happen if we transit the NIC's Rx path twice in the same stack (i.e. Rx on +//! NIC -> mac_rx on the OPTE port -> ... -> loopback delivery to underlay +//! device). This should be impossible, given that any packet sent upstack by +//! XDE must have a MAC address belonging to the OPTE port. //! -//! The second exposes us to a deadlock if the ordering `read[xde_mc_tx] -> -//! write[ioctl] -> read[xde_mc_tx]` occurs on one lock -- the latter read -//! acquisition will block indefinitely. This is a possibility we need to -//! consciously work around. Hairpin exchanges (e.g., ARP -> ICMP ping, DHCP) -//! can lead to fairly deep stacks of the form `(ip) -> xde_mc_tx -> (ip) -> -//! xde_mc_tx -> ...` when used with zones (this is not an issue with viona, -//! which returns once packets are communicated to the guest). Thus, we *must* -//! drop the read before delivering any hairpin packets. +//! Note: +//! - We cannot afford to take the management lock ([`TokenLock`]) during any +//! dataplane operation. If a dataplane path ever needs to consult the +//! central source of truth directly, the minimally acceptable pattern is a +//! read of `state.devs.read()` (never the management token itself). In +//! practice, to further reduce contention on reader counters we avoid even +//! this by using per-CPU cached `Arc` snapshots for Rx and per-port +//! `Arc` snapshots for Tx. Both are updated by `refresh_maps()` +//! whenever the canonical map changes. +//! - Multicast forwarding state (`mcast_fwd`) follows the same model: a copy +//! is kept per-port, updated by `refresh_maps()` whenever the canonical +//! forwarding table changes. //! -//! ### `TokenLock` and `DevMap` updates +//! ### [`TokenLock`] and [`DevMap`] updates //! The `TokenLock` primitive provides us with logical mutual exclusion around -//! 
the underlay and the ability to modify the canonical `DevMap` -- without +//! the underlay and the ability to modify the canonical [`DevMap`] -- without //! holding a `KMutex`. Management operations made by OPTE *will* upcall -- we //! must resolve link names to IDs, and add/remove link information from DLS. //! Doing so makes an ioctl thread vulnerable to receiving signals, so other //! threads trying to take the management lock must be able to take, e.g., //! a SIGSTOP. //! -//! Whenever the central `DevMap` is modified, we iterate through each reachable -//! `XdeDev` and underlay port, and for every instance of the cloned `DevMap` we -//! write()/lock() that entry, replace it with the new contents, and drop the -//! lock. This ensures that port removal cannot fully proceed until the port is -//! no longer usable from any Tx/Rx context. +//! Whenever the central [`DevMap`] is modified, we call [`refresh_maps()`] +//! which iterates through each reachable [`XdeDev`] and underlay port. For +//! every instance of the [`DevMap`] Arc, we acquire the write lock (blocking if +//! Tx/Rx holds a read lock), swap the Arc, and release the write lock. This +//! ensures that port removal cannot fully proceed until no Tx/Rx context holds +//! references to the port. +//! +//! ### Teardown +//! When `clear_xde_underlay()` is called (after all ports have been removed), +//! all per-CPU and per-port [`DevMap`] snapshots contain no ports (updated by +//! the final `refresh_maps()` calls during port deletion). The management lock +//! ensures no concurrent modifications, allowing underlay port Arcs to be +//! safely unwrapped. 
use crate::dev_map::DevMap; use crate::dev_map::ReadOnlyDevMap; @@ -124,6 +143,8 @@ use crate::dls; use crate::dls::DlsStream; use crate::dls::LinkId; use crate::ioctl::IoctlEnvelope; +use crate::ip::AF_INET; +use crate::ip::AF_INET6; use crate::mac; use crate::mac::ChecksumOffloadCapabs; use crate::mac::MacClient; @@ -153,6 +174,7 @@ use crate::sys::ncpus; use crate::warn; use alloc::borrow::ToOwned; use alloc::boxed::Box; +use alloc::collections::BTreeMap; use alloc::ffi::CString; use alloc::string::String; use alloc::string::ToString; @@ -160,6 +182,7 @@ use alloc::sync::Arc; use alloc::vec::Vec; use core::ffi::CStr; use core::num::NonZeroU32; +use core::num::NonZeroUsize; use core::ptr; use core::ptr::NonNull; use core::ptr::addr_of; @@ -171,7 +194,9 @@ use illumos_sys_hdrs::*; use ingot::geneve::Geneve; use ingot::geneve::GeneveOpt; use ingot::geneve::GeneveRef; +use ingot::geneve::ValidGeneve; use ingot::types::HeaderLen; +use ingot::types::HeaderParse; use opte::ExecCtx; use opte::api::ClearLftReq; use opte::api::ClearUftReq; @@ -185,6 +210,8 @@ use opte::api::DumpUftReq; use opte::api::DumpUftResp; use opte::api::ListLayersReq; use opte::api::ListLayersResp; +use opte::api::MacAddr; +use opte::api::MulticastUnderlay; use opte::api::NoResp; use opte::api::OpteCmd; use opte::api::OpteCmdIoctl; @@ -206,12 +233,16 @@ use opte::ddi::sync::TokenLock; use opte::ddi::time::Interval; use opte::ddi::time::Periodic; use opte::engine::NetworkImpl; +use opte::engine::ether::EtherAddr; use opte::engine::ether::Ethernet; use opte::engine::ether::EthernetRef; use opte::engine::geneve::Vni; use opte::engine::geneve::WalkOptions; use opte::engine::headers::IpAddr; +use opte::engine::ip::ValidL3; +use opte::engine::ip::v4::Ipv4Ref; use opte::engine::ip::v6::Ipv6Addr; +use opte::engine::ip::v6::Ipv6Ref; use opte::engine::packet::InnerFlowId; use opte::engine::packet::Packet; use opte::engine::packet::ParseError; @@ -219,23 +250,38 @@ use opte::engine::parse::ValidUlp; use 
opte::engine::port::Port; use opte::engine::port::PortBuilder; use opte::engine::port::ProcessResult; +use opte::engine::rule::MappingResource; use oxide_vpc::api::AddFwRuleReq; use oxide_vpc::api::AddRouterEntryReq; +use oxide_vpc::api::ClearMcast2PhysReq; +use oxide_vpc::api::ClearMcastForwardingReq; use oxide_vpc::api::ClearVirt2BoundaryReq; use oxide_vpc::api::ClearVirt2PhysReq; use oxide_vpc::api::CreateXdeReq; +use oxide_vpc::api::DEFAULT_MULTICAST_VNI; use oxide_vpc::api::DelRouterEntryReq; use oxide_vpc::api::DelRouterEntryResp; use oxide_vpc::api::DeleteXdeReq; use oxide_vpc::api::DhcpCfg; +use oxide_vpc::api::DumpMcastForwardingResp; +use oxide_vpc::api::DumpMcastSubscriptionsResp; use oxide_vpc::api::DumpVirt2BoundaryResp; use oxide_vpc::api::DumpVirt2PhysResp; use oxide_vpc::api::ListPortsResp; +use oxide_vpc::api::McastForwardingEntry; +use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastSubscriptionEntry; +use oxide_vpc::api::McastUnsubscribeAllReq; +use oxide_vpc::api::McastUnsubscribeReq; +use oxide_vpc::api::NextHopV6; use oxide_vpc::api::PhysNet; use oxide_vpc::api::PortInfo; use oxide_vpc::api::RemFwRuleReq; use oxide_vpc::api::RemoveCidrResp; +use oxide_vpc::api::Replication; use oxide_vpc::api::SetFwRulesReq; +use oxide_vpc::api::SetMcast2PhysReq; +use oxide_vpc::api::SetMcastForwardingReq; use oxide_vpc::api::SetVirt2BoundaryReq; use oxide_vpc::api::SetVirt2PhysReq; use oxide_vpc::cfg::IpCfg; @@ -245,6 +291,7 @@ use oxide_vpc::engine::VpcParser; use oxide_vpc::engine::firewall; use oxide_vpc::engine::gateway; use oxide_vpc::engine::geneve::MssInfoRef; +use oxide_vpc::engine::geneve::OxideOptions; use oxide_vpc::engine::geneve::ValidOxideOption; use oxide_vpc::engine::nat; use oxide_vpc::engine::overlay; @@ -252,6 +299,11 @@ use oxide_vpc::engine::router; const ETHERNET_MTU: u16 = 1500; +// Type alias for multicast forwarding table: +// Maps IPv6 destination addresses to their next hop replication entries. 
+type McastForwardingTable = + BTreeMap>; + // Entry limits for the various flow tables. const FW_FT_LIMIT: NonZeroU32 = NonZeroU32::new(8096).unwrap(); const FT_LIMIT_ONE: NonZeroU32 = NonZeroU32::new(1).unwrap(); @@ -285,6 +337,82 @@ unsafe extern "C" { dst_port: uintptr_t, ); pub safe fn __dtrace_probe_hdlr__resp(resp_str: uintptr_t); + pub safe fn __dtrace_probe_mcast__tx( + af: uintptr_t, // AF_INET or AF_INET6 + inner_dst: uintptr_t, // *const Ipv4Addr or *const Ipv6Addr + vni: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__rx( + af: uintptr_t, + inner_dst: uintptr_t, + vni: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__local__delivery( + af: uintptr_t, + inner_dst: uintptr_t, + vni: uintptr_t, + port: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__underlay__fwd( + af: uintptr_t, + inner_dst: uintptr_t, + vni: uintptr_t, + next_hop: *const oxide_vpc::api::Ipv6Addr, + ); + pub safe fn __dtrace_probe_mcast__external__fwd( + af: uintptr_t, + inner_dst: uintptr_t, + vni: uintptr_t, + next_hop: *const oxide_vpc::api::Ipv6Addr, + ); + + // Multicast control-plane probes + pub safe fn __dtrace_probe_mcast__map__set( + af: uintptr_t, + group: uintptr_t, + underlay: *const oxide_vpc::api::Ipv6Addr, + vni: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__map__clear( + af: uintptr_t, + group: uintptr_t, + underlay: *const oxide_vpc::api::Ipv6Addr, + vni: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__fwd__set( + underlay: *const oxide_vpc::api::Ipv6Addr, + count: uintptr_t, + vni: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__fwd__clear( + underlay: *const oxide_vpc::api::Ipv6Addr, + vni: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__subscribe( + port: uintptr_t, + af: uintptr_t, + group: uintptr_t, + vni: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__unsubscribe( + port: uintptr_t, + af: uintptr_t, + group: uintptr_t, + vni: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__unsubscribe__all( + af: uintptr_t, + group: 
uintptr_t, + vni: uintptr_t, + ); + + // Multicast dataplane problem probes + pub safe fn __dtrace_probe_mcast__tx__pullup__fail(len: uintptr_t); + pub safe fn __dtrace_probe_mcast__rx__pullup__fail(len: uintptr_t); + pub safe fn __dtrace_probe_mcast__no__fwd__entry( + underlay: *const oxide_vpc::api::Ipv6Addr, + vni: uintptr_t, + ); } fn bad_packet_parse_probe( @@ -361,6 +489,7 @@ struct XdeState { management_lock: TokenLock, ectx: Arc, vpc_map: Arc, + m2p: Arc, v2b: Arc, devs: ReadOnlyDevMap, stats: KStatNamed, @@ -377,6 +506,10 @@ struct XdeState { struct XdeMgmt { devs: Arc>, underlay: Option, + /// XDE-wide multicast forwarding table mapping underlay multicast addresses + /// to their physical next hops with replication information. + /// Maps: Ipv6Addr (underlay multicast address) -> BTreeMap + mcast_fwd: Arc>, } #[derive(Clone)] @@ -408,10 +541,12 @@ impl XdeState { management_lock: TokenLock::new(XdeMgmt { devs: dev_map, underlay: None, + mcast_fwd: Arc::new(KRwLock::new(BTreeMap::new())), }), devs, ectx, vpc_map: Arc::new(overlay::VpcMappings::new()), + m2p: Arc::new(overlay::Mcast2Phys::new()), v2b: Arc::new(overlay::Virt2Boundary::new()), stats: KStatNamed::new("xde", "xde", XdeStats::new()) .expect("Name is well-constructed (len, no NUL bytes)"), @@ -467,6 +602,11 @@ pub struct XdeDev { // This is kept under an RwLock because we need to deliver // from potentially one or more threads unbound to a particular CPU. port_map: KRwLock>, + + // Each port has its own copy of the multicast forwarding table. + // Used in Tx path (which is not CPU-pinned), so stored per-port rather + // than per-CPU. 
+ mcast_fwd: KRwLock>, } impl XdeDev { @@ -868,6 +1008,51 @@ unsafe extern "C" fn xde_ioc_opte_cmd(karg: *mut c_void, mode: c_int) -> c_int { let resp = remove_cidr_hdlr(&mut env); hdlr_resp(&mut env, resp) } + + OpteCmd::SetMcastForwarding => { + let resp = set_mcast_forwarding_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } + + OpteCmd::ClearMcastForwarding => { + let resp = clear_mcast_forwarding_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } + + OpteCmd::DumpMcastForwarding => { + let resp = dump_mcast_forwarding_hdlr(); + hdlr_resp(&mut env, resp) + } + + OpteCmd::DumpMcastSubscriptions => { + let resp = dump_mcast_subscriptions_hdlr(); + hdlr_resp(&mut env, resp) + } + + OpteCmd::McastSubscribe => { + let resp = mcast_subscribe_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } + + OpteCmd::McastUnsubscribe => { + let resp = mcast_unsubscribe_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } + + OpteCmd::McastUnsubscribeAll => { + let resp = mcast_unsubscribe_all_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } + + OpteCmd::SetMcast2Phys => { + let resp = set_m2p_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } + + OpteCmd::ClearMcast2Phys => { + let resp = clear_m2p_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } } } @@ -956,6 +1141,7 @@ fn create_xde(req: &CreateXdeReq) -> Result { req.xde_devname.clone(), &cfg, state.vpc_map.clone(), + state.m2p.clone(), port_v2p.clone(), state.v2b.clone(), state.ectx.clone(), @@ -970,6 +1156,7 @@ fn create_xde(req: &CreateXdeReq) -> Result { underlay_capab, routes: RouteCache::default(), port_map: KRwLock::new(Default::default()), + mcast_fwd: KRwLock::new(Arc::new(BTreeMap::new())), }); let xde_ref = Arc::get_mut(&mut xde).expect("only one instance of XDE exists"); @@ -1051,6 +1238,7 @@ fn create_xde(req: &CreateXdeReq) -> Result { token.underlay.as_ref().expect( "bailed out above if no underlay, and protected by token", ), + &token.mcast_fwd, ); } @@ -1077,15 +1265,20 @@ fn delete_xde(req: &DeleteXdeReq) -> Result { .underlay 
.as_ref() .expect("underlay must exist while ports exist"), + &token.mcast_fwd, ); xde }; - // Clear the port's devmap to break any cycles. + // Break potential self-reference cycles before dropping this `XdeDev` by + // resetting its per-port `DevMap` snapshot to an empty map. Otherwise, the + // `Arc` inside `port_map` may still contain an Arc back to this + // same XdeDev, keeping it (and its underlay Arc clones) alive beyond + // deletion. { - let mut pmap = xde.port_map.write(); - *pmap = Default::default(); + let mut port_map = xde.port_map.write(); + *port_map = Arc::new(DevMap::new()); } let return_port = |token: &TokenGuard<'_, XdeMgmt>, port| { @@ -1097,6 +1290,7 @@ fn delete_xde(req: &DeleteXdeReq) -> Result { .underlay .as_ref() .expect("underlay must exist while ports exist"), + &token.mcast_fwd, ); }; @@ -1159,23 +1353,37 @@ fn delete_xde(req: &DeleteXdeReq) -> Result { Ok(NoResp::default()) } -/// Rebuild each entrypoint's view of the central `DevMap`. -fn refresh_maps(devs: KRwLockWriteGuard, underlay: &UnderlayState) { +/// Rebuild each entrypoint's view of the central [`DevMap`] and multicast +/// forwarding table `McastForwardingTable`. +fn refresh_maps( + devs: KRwLockWriteGuard, + underlay: &UnderlayState, + mcast_fwd: &Arc>, +) { let new_map = Arc::new(devs.clone()); - - // Update all ports' maps. - for port in devs.iter() { - let mut map = port.port_map.write(); - *map = Arc::clone(&new_map); + let new_mcast_fwd = Arc::new(mcast_fwd.read().clone()); + + // Update both underlay ports' per-CPU caches (u1 and u2). + // Each underlay port has a Vec with one entry per CPU. + let underlay_ports = + [&underlay.u1.stream.ports_map, &underlay.u2.stream.ports_map]; + for per_cpu_map in underlay_ports { + for entry in per_cpu_map { + let mut map = entry.devs.lock(); + *map = Arc::clone(&new_map); + } } - // Update all underlays' maps. 
- let ports = [&underlay.u1.stream.ports_map, &underlay.u2.stream.ports_map]; - for port in ports { - for map in port { - let mut map = map.devs.lock(); + // Update all ports' per-port maps and multicast state. + for port in new_map.iter() { + { + let mut map = port.port_map.write(); *map = Arc::clone(&new_map); } + { + let mut mcast = port.mcast_fwd.write(); + *mcast = Arc::clone(&new_mcast_fwd); + } } } @@ -1236,9 +1444,12 @@ fn clear_xde_underlay() -> Result { }); } + // Clear multicast forwarding table + token.mcast_fwd.write().clear(); + if let Some(underlay) = token.underlay.take() { // If the underlay references have leaked/spread beyond `XdeDev`s and not - // been cleaned up, we committed have a fatal programming error. + // been cleaned up, we have a fatal programming error. // We aren't using `Weak` references to these types either, so no strong // references could be created. // @@ -1273,7 +1484,7 @@ fn clear_xde_underlay() -> Result { // 2. Close the open stream handle. // The only other hold on this `DlsStream` is via `u.siphon`, which - // we just dropped. The `expect` asserts that we have consumed them + // we just dropped. The `unwrap_or_else` asserts that we have consumed them // in the correct order. Arc::into_inner(u.stream).unwrap_or_else(|| { panic!( @@ -1772,20 +1983,18 @@ fn guest_loopback_probe( fn guest_loopback( src_dev: &XdeDev, - entry_state: &DevMap, + dst_dev: &XdeDev, + port_key: VniMac, mut pkt: MsgBlk, - vni: Vni, postbox: &mut TxPostbox, ) { use Direction::*; let mblk_addr = pkt.mblk_addr(); - // Loopback now requires a reparse on loopback to account for UFT fastpath. - // When viona serves us larger packets, we needn't worry about allocing - // the encap on. - // We might be able to do better in the interim, but that costs us time. - + // Loopback requires a reparse to account for UFT fastpath. 
+ // We might be able to do better, but the logistics in passing around + // the emitspec in lieu of "full" metadata might be a little troublesome. let parsed_pkt = match Packet::parse_inbound(pkt.iter_mut(), VpcParser {}) { Ok(pkt) => pkt, Err(e) => { @@ -1810,81 +2019,497 @@ fn guest_loopback( let flow = parsed_pkt.flow(); - let ether_dst = parsed_pkt.meta().inner_eth.destination(); - let port_key = VniMac::new(vni, ether_dst); - let maybe_dest_dev = entry_state.get_by_key(port_key); - - match maybe_dest_dev { - Some(dest_dev) => { - guest_loopback_probe(mblk_addr, &flow, src_dev, dest_dev); - - // We have found a matching Port on this host; "loop back" - // the packet into the inbound processing path of the - // destination Port. - match dest_dev.port.process(In, parsed_pkt) { - Ok(ProcessResult::Modified(emit_spec)) => { - let mut pkt = emit_spec.apply(pkt); - if let Err(e) = pkt.fill_parse_info(&ulp_meoi, None) { - opte::engine::err!("failed to set offload info: {}", e); - } + guest_loopback_probe(mblk_addr, &flow, src_dev, dst_dev); - // Having advertised offloads to our guest, looped back - // packets are liable to have zero-checksums. Fill these - // if necessary. - let pkt = if pkt - .offload_flags() - .flags - .intersects(MblkOffloadFlags::HCK_TX_FLAGS) - { - // We have only asked for cksum emulation, so we - // will either have: - // * 0 pkts (checksum could not be emulated, - // packet dropped) - // * 1 pkt. - mac_hw_emul(pkt, MacEmul::HWCKSUM_EMUL) - .and_then(|mut v| v.pop_front()) - } else { - Some(pkt) - }; + match dst_dev.port.process(In, parsed_pkt) { + Ok(ProcessResult::Modified(emit_spec)) => { + let mut pkt = emit_spec.apply(pkt); + if let Err(e) = pkt.fill_parse_info(&ulp_meoi, None) { + opte::engine::err!("failed to set offload info: {}", e); + } - if let Some(pkt) = pkt { - postbox.post_local(port_key, pkt); - } - } + // Having advertised offloads to our guest, looped back + // packets are liable to have zero-checksums. 
Fill these + // if necessary. + let pkt = if pkt + .offload_flags() + .flags + .intersects(MblkOffloadFlags::HCK_TX_FLAGS) + { + // We have only asked for cksum emulation, so we + // will either have: + // * 0 pkts (checksum could not be emulated, + // packet dropped) + // * 1 pkt. + mac_hw_emul(pkt, MacEmul::HWCKSUM_EMUL) + .and_then(|mut v| v.pop_front()) + } else { + Some(pkt) + }; - Ok(ProcessResult::Drop { reason }) => { - opte::engine::dbg!("loopback rx drop: {:?}", reason); - } + if let Some(pkt) = pkt { + postbox.post_local(port_key, pkt); + } + } - Ok(ProcessResult::Hairpin(_hppkt)) => { - // There should be no reason for an loopback - // inbound packet to generate a hairpin response - // from the destination port. - opte::engine::dbg!("unexpected loopback rx hairpin"); - } + Ok(ProcessResult::Drop { reason }) => { + opte::engine::dbg!("loopback rx drop: {:?}", reason); + } + + Ok(ProcessResult::Hairpin(_hppkt)) => { + // There should be no reason for a loopback + // inbound packet to generate a hairpin response + // from the destination port. + opte::engine::dbg!("unexpected loopback rx hairpin"); + } + + Err(e) => { + opte::engine::dbg!( + "loopback port process error: {} -> {} {:?}", + src_dev.port.name(), + dst_dev.port.name(), + e + ); + } + } +} + +/// Locate the Oxide Multicast Geneve option and return the offset to its body. +/// +/// Walks through Geneve options starting at `geneve_offset + 8` to find the +/// Oxide Multicast option (class=0x0129, type=0x01). Returns the offset to the +/// option body (after the 4-byte option header) if found. +/// +/// Safety: This function validates option headers as it walks to avoid reading +/// beyond packet boundaries. Returns `None` if the option is not found or if +/// validation fails. 
+/// +/// # Geneve Option Format +/// Each option consists of: +/// - 2 bytes: Option class +/// - 1 byte: Flags (bit 7=critical) + Type (bits 0-6) +/// - 1 byte: Reserved (3 bits) + Length in 4-byte words (5 bits) +/// - N bytes: Option data (N = length field * 4) +fn find_mcast_option_offset( + pkt: &MsgBlk, + geneve_offset: usize, +) -> Option { + let geneve_slice = pkt.get(geneve_offset..)?; + let (geneve_hdr, ..) = ValidGeneve::parse(geneve_slice).ok()?; + + let mut cursor = geneve_offset + Geneve::MINIMUM_LENGTH; + + for opt in OxideOptions::from_raw(&geneve_hdr) { + let Ok(opt) = opt else { break }; + if let Some(ValidOxideOption::Multicast(_)) = opt.option.known() { + return Some(cursor + GeneveOpt::MINIMUM_LENGTH); + } + cursor += opt.packet_length(); + } + + None +} - Err(e) => { +/// Update the Oxide Multicast Geneve option's Tx-only replication field. +/// +/// Locates the multicast option and rewrites the Tx-only replication instruction +/// in the first byte of the option body (top 2 bits encode the replication mode). +/// +/// Returns `true` if the option was found and updated, `false` otherwise. 
+/// +/// # Replication Encoding (Tx-only) +/// The replication field uses the top 2 bits of the first byte: +/// - `External` (0): 0x00 +/// - `Underlay` (1): 0x40 +/// - `Both` (2): 0x80 +/// - `Reserved` (3): 0xC0 +#[inline] +fn update_mcast_replication( + pkt: &mut MsgBlk, + geneve_offset: usize, + replication: Replication, +) -> bool { + let Some(mcast_body_off) = find_mcast_option_offset(pkt, geneve_offset) + else { + return false; + }; + + let Some(rep_byte) = pkt.get_mut(mcast_body_off..mcast_body_off + 1) else { + return false; + }; + + // Encode replication in top 2 bits, preserve bottom 6 bits + let repl_bits = (replication as u8) << 6; + rep_byte[0] = (rep_byte[0] & 0x3F) | repl_bits; + true +} + +struct MulticastTxContext<'a> { + inner_dst: oxide_vpc::api::IpAddr, // Inner/overlay destination IP (for subscriptions) + underlay_dst: Ipv6Addr, // Outer/underlay destination IP (for forwarding lookup) + vni: Vni, + out_pkt: &'a MsgBlk, + encap_len: u32, + inner_eth_len: usize, + non_eth_payl_bytes: u32, + tun_meoi: &'a illumos_sys_hdrs::mac::mac_ether_offload_info_t, + l4_hash: u32, +} + +struct MulticastRxContext<'a> { + inner_dst: oxide_vpc::api::IpAddr, // Inner/overlay destination IP (for subscriptions) + underlay_dst: Ipv6Addr, // Outer/underlay destination IP (for forwarding lookup) + vni: Vni, + pkt: &'a MsgBlk, + pullup_len: usize, +} + +/// Handle multicast packet forwarding for same-sled delivery and underlay +/// replication based on the XDE-wide multicast forwarding table. +/// +/// Always delivers to local same-sled subscribers regardless of replication mode. +/// Routes to next hop unicast addresses for ALL replication modes to determine +/// reachability and underlay port/MAC. Packet destination is always the multicast +/// address with multicast MAC. The [`Replication`] type is a Tx-only instruction +/// telling the switch which port groups to replicate to: External (front panel), +/// Underlay (other sleds), or Both. 
+/// +/// `cpu_devs` may be None if the fast-path check indicated no local subscribers exist. +/// +/// [`Replication`]: oxide_vpc::api::Replication +fn handle_mcast_tx<'a>( + ctx: MulticastTxContext, + src_dev: &'a XdeDev, + postbox: &mut TxPostbox, + cpu_devs: Option<&'a DevMap>, + cpu_mcast_fwd: &'a McastForwardingTable, +) { + // DTrace probe: multicast Tx entry + let (af, addr_ptr) = match &ctx.inner_dst { + oxide_vpc::api::IpAddr::Ip4(v4) => { + (AF_INET as usize, AsRef::<[u8]>::as_ref(v4).as_ptr() as uintptr_t) + } + oxide_vpc::api::IpAddr::Ip6(v6) => { + (AF_INET6 as usize, AsRef::<[u8]>::as_ref(v6).as_ptr() as uintptr_t) + } + }; + __dtrace_probe_mcast__tx(af, addr_ptr, ctx.vni.as_u32() as uintptr_t); + + // Compute packet offsets once (used for both local delivery and next hop forwarding) + let pullup_len = (ctx.encap_len as usize) + + (ctx.non_eth_payl_bytes as usize) + + ctx.inner_eth_len; + let geneve_offset = usize::from(ctx.tun_meoi.meoi_l2hlen) + + usize::from(ctx.tun_meoi.meoi_l3hlen) + + usize::from(ctx.tun_meoi.meoi_l4hlen); + + // Local same-sled delivery: always deliver to subscribers on this sled, + // independent of the Tx-only Replication instruction (not an access control mechanism). + // The Replication type only affects how switches handle the packet on Tx. + // Subscription is keyed by underlay (outer) IPv6 multicast address. + // If cpu_devs is None, we know from the fast-path check that no subscribers exist. 
+ if let Some(devs) = cpu_devs { + let underlay_addr = + oxide_vpc::api::Ipv6Addr::from(ctx.underlay_dst.bytes()); + let group_key = MulticastUnderlay::new_unchecked(underlay_addr); + if let Some(others) = devs.mcast_listeners(&group_key) { + let my_key = VniMac::new(ctx.vni, src_dev.port.mac_addr()); + for el in others { + // Skip delivering to self + if my_key == *el { + continue; + } + let Ok(my_pkt) = + ctx.out_pkt.pullup(NonZeroUsize::new(pullup_len)) + else { opte::engine::dbg!( - "loopback port process error: {} -> {} {:?}", - src_dev.port.name(), - dest_dev.port.name(), - e + "mcast Tx external pullup failed: requested {} bytes", + pullup_len + ); + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_pullup_fail().incr(1); + __dtrace_probe_mcast__tx__pullup__fail( + pullup_len as uintptr_t, ); + continue; + }; + match devs.get_by_key(*el) { + Some(dev) => { + // DTrace probe: local delivery + let (af, addr_ptr) = match &ctx.inner_dst { + oxide_vpc::api::IpAddr::Ip4(v4) => ( + AF_INET as usize, + AsRef::<[u8]>::as_ref(v4).as_ptr() as uintptr_t, + ), + oxide_vpc::api::IpAddr::Ip6(v6) => ( + AF_INET6 as usize, + AsRef::<[u8]>::as_ref(v6).as_ptr() as uintptr_t, + ), + }; + __dtrace_probe_mcast__local__delivery( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + dev.port.name_cstr().as_ptr() as uintptr_t, + ); + guest_loopback(src_dev, dev, *el, my_pkt, postbox); + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_local().incr(1); + } + None => { + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_stale_local().incr(1); + } } } } + } - None => { - opte::engine::dbg!( - "underlay dest is same as src but the Port was not found \ - vni = {}, mac = {}", - vni.as_u32(), - ether_dst + // Next hop forwarding: send packets to configured next hops. + // + // At the leaf level, we process all next hops in the forwarding table. 
+ // Each next hop's `Replication` is a Tx-only instruction telling the switch + // which ports to replicate to: + // - External: ports set for external multicast traffic (egress to external networks) + // - Underlay: replicate to other sleds (using multicast outer dst) + // - Both: both external and underlay replication + // + // We already have the Arc from the per-CPU cache, no need to clone. + let underlay_key = MulticastUnderlay::new_unchecked(ctx.underlay_dst); + if cpu_mcast_fwd.get(&underlay_key).is_none() { + __dtrace_probe_mcast__no__fwd__entry( + &ctx.underlay_dst, + ctx.vni.as_u32() as uintptr_t, + ); + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_no_fwd_entry().incr(1); + } + + if let Some(next_hops) = cpu_mcast_fwd.get(&underlay_key) { + // We found forwarding entries, replicate to each next hop + for (next_hop, replication) in next_hops.iter() { + // Clone packet with headers using pullup + let Ok(mut fwd_pkt) = + ctx.out_pkt.pullup(NonZeroUsize::new(pullup_len)) + else { + opte::engine::dbg!( + "mcast Tx next hop pullup failed: requested {} bytes", + pullup_len + ); + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_pullup_fail().incr(1); + __dtrace_probe_mcast__tx__pullup__fail(pullup_len as uintptr_t); + continue; // Skip this destination on allocation failure + }; + + // Route to next hop unicast address to determine which underlay + // port/MAC to use. Packet destination is the multicast address with + // multicast MAC (RFC 2464). + // + // NextHopV6.addr = unicast switch address (for routing) + // Outer dst IP = ctx.underlay_dst (multicast address from M2P) + // Geneve Replication is a Tx-only instruction telling the switch + // which port groups to use. 
+ let routing_dst = next_hop.addr; + let actual_outer_dst = ctx.underlay_dst; + + // VNI is at offset 4 in Geneve header (3 bytes) + if let Some(vni_bytes) = + fwd_pkt.get_mut(geneve_offset + 4..geneve_offset + 7) + { + let vni_be = next_hop.vni.as_u32().to_be_bytes(); + vni_bytes.copy_from_slice(&vni_be[1..4]); // VNI is 24 bits + } + // Update Geneve multicast option with the Tx-only replication + // instruction for the switch. + update_mcast_replication(&mut fwd_pkt, geneve_offset, *replication); + + // Route to switch unicast address to determine which underlay + // port/MAC to use. Packet destination is multicast address with + // multicast MAC. + let route_key = + RouteKey { dst: routing_dst, l4_hash: Some(ctx.l4_hash) }; + let Route { src: mac_src, dst: _mac_dst, underlay_idx } = + src_dev.routes.next_hop(route_key, src_dev); + + // Derive destination MAC from IPv6 multicast address per RFC 2464: + // IPv6 multicast MAC = 33:33 + last 4 bytes of IPv6 address + let ipv6_bytes = actual_outer_dst.bytes(); + let dst_mac = EtherAddr::from([ + 0x33, + 0x33, + ipv6_bytes[12], + ipv6_bytes[13], + ipv6_bytes[14], + ipv6_bytes[15], + ]); + + // Fill in outer MAC addresses + let final_pkt = unsafe { + let mblk = fwd_pkt.unwrap_mblk().as_ptr(); + let rptr = (*mblk).b_rptr; + ptr::copy(dst_mac.as_ptr(), rptr, 6); + ptr::copy(mac_src.as_ptr(), rptr.add(6), 6); + + MsgBlk::wrap_mblk(mblk).unwrap() + }; + + // Replication is a Tx-only instruction telling the switch which + // port groups to replicate to. Local same-sled delivery always + // occurs regardless of this setting. + // + // Packet is sent once to the underlay. The switch reads the Geneve + // Replication field and performs the actual bifurcation. 
+ + // Prepare common data for DTrace probes + let outer_ip6 = + oxide_vpc::api::Ipv6Addr::from(actual_outer_dst.bytes()); + let (af, addr_ptr) = + (AF_INET6 as usize, &outer_ip6 as *const _ as uintptr_t); + + // Fire DTrace probes and increment stats based on replication mode + match replication { + oxide_vpc::api::Replication::Underlay => { + __dtrace_probe_mcast__underlay__fwd( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + &next_hop.addr, + ); + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_underlay().incr(1); + } + oxide_vpc::api::Replication::Both => { + __dtrace_probe_mcast__underlay__fwd( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + &next_hop.addr, + ); + __dtrace_probe_mcast__external__fwd( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + &next_hop.addr, + ); + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_underlay().incr(1); + xde.stats.vals.mcast_tx_external().incr(1); + } + oxide_vpc::api::Replication::External => { + __dtrace_probe_mcast__external__fwd( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + &next_hop.addr, + ); + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_external().incr(1); + } + oxide_vpc::api::Replication::Reserved => { + // Reserved: drop packet + continue; + } + } + + // Send to underlay (common for all valid replication modes) + postbox.post_underlay( + underlay_idx, + TxHint::from_crc32(ctx.l4_hash), + final_pkt, ); } } } +/// Handle multicast packet reception from the underlay. +/// +/// OPTE is always a leaf node in the multicast replication tree. +/// This function only delivers packets to local subscribers. +/// +/// The Replication type is Tx-only (instructions to the switch), so the +/// replication field is ignored on Rx. Local delivery is based purely on +/// subscriptions. 
+fn handle_mcast_rx( + ctx: MulticastRxContext, + stream: &DlsStream, + devs: &DevMap, + postbox: &mut Postbox, +) { + // DTrace probe: multicast Rx entry + let (af, addr_ptr) = match &ctx.inner_dst { + oxide_vpc::api::IpAddr::Ip4(v4) => { + (AF_INET as usize, v4 as *const _ as uintptr_t) + } + oxide_vpc::api::IpAddr::Ip6(v6) => { + (AF_INET6 as usize, v6 as *const _ as uintptr_t) + } + }; + __dtrace_probe_mcast__rx(af, addr_ptr, ctx.vni.as_u32() as uintptr_t); + + // Subscription is keyed by underlay (outer) IPv6 multicast address. + // This uniquely identifies the multicast group across the fleet. + let underlay_addr = + oxide_vpc::api::Ipv6Addr::from(ctx.underlay_dst.bytes()); + let group_key = MulticastUnderlay::new_unchecked(underlay_addr); + + // Deliver to all local subscribers. VNI validation and VPC isolation + // are handled by OPTE's inbound overlay layer. + if let Some(ports) = devs.mcast_listeners(&group_key) { + for el in ports { + let Ok(my_pkt) = ctx.pkt.pullup(NonZeroUsize::new(ctx.pullup_len)) + else { + opte::engine::dbg!( + "mcast Rx pullup failed: requested {} bytes", + ctx.pullup_len + ); + let xde = get_xde_state(); + xde.stats.vals.mcast_rx_pullup_fail().incr(1); + __dtrace_probe_mcast__rx__pullup__fail( + ctx.pullup_len as uintptr_t, + ); + continue; + }; + match devs.get_by_key(*el) { + Some(dev) => { + // DTrace probe: Rx local delivery + let (af, addr_ptr) = match &ctx.inner_dst { + oxide_vpc::api::IpAddr::Ip4(v4) => { + (AF_INET as usize, v4 as *const _ as uintptr_t) + } + oxide_vpc::api::IpAddr::Ip6(v6) => { + (AF_INET6 as usize, v6 as *const _ as uintptr_t) + } + }; + __dtrace_probe_mcast__local__delivery( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + dev.port.name_cstr().as_ptr() as uintptr_t, + ); + xde_rx_one_direct(stream, dev, *el, my_pkt, postbox); + let xde = get_xde_state(); + xde.stats.vals.mcast_rx_local().incr(1); + } + None => { + let xde = get_xde_state(); + xde.stats.vals.mcast_rx_stale_local().incr(1); + } + } + 
} + } else { + // No subscription entry found for this multicast group + let underlay_ip6 = + oxide_vpc::api::Ipv6Addr::from(ctx.underlay_dst.bytes()); + __dtrace_probe_mcast__no__fwd__entry( + &underlay_ip6, + ctx.vni.as_u32() as uintptr_t, + ); + let xde = get_xde_state(); + xde.stats.vals.mcast_rx_no_subscribers().incr(1); + } +} + #[unsafe(no_mangle)] unsafe extern "C" fn xde_mc_tx( arg: *mut c_void, @@ -1921,34 +2546,31 @@ unsafe extern "C" fn xde_mc_tx( let mut hairpin_chain = MsgBlkChain::empty(); let mut tx_postbox = TxPostbox::new(); - // We don't need to read-lock the port map unless we have local - // delivery to perform. - // - // TODO: really think this one through. This might expose us to the - // risk of double read-locking at the same time as the tokenlock - // wants to make some globally mutable operation happen. - // - // Maybe we should clone out the `DevMap` at this instant. - let mut entry_state = None; + // We don't need to read-lock port_map or mcast_fwd unless we actually need them. + // Locks are acquired lazily on first use and then held for the duration of + // packet processing. This prevents port removal from completing while any Tx + // handler holds references (management operations block on the write lock). + let mut port_map = None; + let mut mcast_fwd = None; while let Some(pkt) = chain.pop_front() { xde_mc_tx_one( src_dev, pkt, &mut tx_postbox, - &mut entry_state, + &mut port_map, + &mut mcast_fwd, &mut hairpin_chain, ); } let (local_pkts, [u1_pkts, u2_pkts]) = tx_postbox.deconstruct(); - if let Some(entry_state) = entry_state { - entry_state.deliver_all(local_pkts); + // Local same-sled delivery (via mac_rx to guest ports). + if let Some(port_map) = port_map { + port_map.deliver_all(local_pkts); } - // `entry_state` has been moved, making it safe to deliver hairpin - // packets (which may cause us to re-enter XDE in the same stack). // All deliver/tx calls will NO-OP if the sent chain is empty. 
src_dev.deliver(hairpin_chain); @@ -1972,7 +2594,8 @@ fn xde_mc_tx_one<'a>( src_dev: &'a XdeDev, mut pkt: MsgBlk, postbox: &mut TxPostbox, - entry_state: &mut Option>>, + port_map: &mut Option>>, + mcast_fwd: &mut Option>>, hairpin_chain: &mut MsgBlkChain, ) { let parser = src_dev.port.network().parser(); @@ -1999,6 +2622,18 @@ fn xde_mc_tx_one<'a>( let old_len = parsed_pkt.len(); let meta = parsed_pkt.meta(); + + // Extract inner destination IP for potential multicast processing + let inner_dst_ip = match &meta.inner_l3 { + Some(ValidL3::Ipv4(v4)) => { + Some(oxide_vpc::api::IpAddr::from(v4.destination())) + } + Some(ValidL3::Ipv6(v6)) => { + Some(oxide_vpc::api::IpAddr::from(v6.destination())) + } + None => None, + }; + let Ok(non_eth_payl_bytes) = u32::try_from((&meta.inner_l3, &meta.inner_ulp).packet_length()) else { @@ -2006,6 +2641,8 @@ fn xde_mc_tx_one<'a>( return; }; + let inner_eth_len = meta.inner_eth.packet_length(); + let ulp_meoi = match meta.ulp_meoi(old_len) { Ok(ulp_meoi) => ulp_meoi, Err(e) => { @@ -2030,8 +2667,7 @@ fn xde_mc_tx_one<'a>( let port = &src_dev.port; // The port processing code will fire a probe that describes what - // action was taken -- there should be no need to add probes or - // prints here. + // action was taken. let res = port.process(Direction::Out, parsed_pkt); match res { @@ -2039,24 +2675,34 @@ fn xde_mc_tx_one<'a>( // If the outer IPv6 destination is the same as the // source, then we need to loop the packet inbound to the // guest on this same host. 
- let (ip6_src, ip6_dst) = match emit_spec.outer_ip6_addrs() { - Some(v) => v, - None => { - // XXX add SDT probe - // XXX add stat - opte::engine::dbg!("no outer IPv6 header, dropping"); - return; - } + let Some((ip6_src, ip6_dst)) = emit_spec.outer_ip6_addrs() else { + // XXX add SDT probe + // XXX add stat + opte::engine::dbg!("no outer IPv6 header, dropping"); + return; }; - let vni = match emit_spec.outer_encap_vni() { - Some(vni) => vni, - None => { - // XXX add SDT probe - // XXX add stat - opte::engine::dbg!("no geneve header, dropping"); - return; - } + // EmitSpec applies pushes/pops, but modifications will have occurred + // by this point. Pull destination MAC to allow us to reuse code + // between unicast & multicast loopback. + // + // Ingot will have asserted that Ethernet came first, and that it was + // contiguous. + let Some(ether_dst) = pkt + .get(..size_of::()) + .map(|v| MacAddr::from_const(v.try_into().unwrap())) + else { + // XXX add SDT probe + // XXX add stat + opte::engine::dbg!("couldn't re-read inner MAC, dropping"); + return; + }; + + let Some(vni) = emit_spec.outer_encap_vni() else { + // XXX add SDT probe + // XXX add stat + opte::engine::dbg!("no geneve header, dropping"); + return; }; let Some(tun_meoi) = emit_spec.encap_meoi() else { @@ -2072,9 +2718,23 @@ fn xde_mc_tx_one<'a>( let new_len = out_pkt.byte_len(); if ip6_src == ip6_dst { - let entry_state = - entry_state.get_or_insert_with(|| src_dev.port_map.read()); - guest_loopback(src_dev, entry_state, out_pkt, vni, postbox); + // Hairpin loopback: same-host delivery + let key = VniMac::new(vni, ether_dst); + let devs = + port_map.get_or_insert_with(|| src_dev.port_map.read()); + if let Some(dst_dev) = devs.get_by_key(key) { + // We have found a matching Port on this host; "loop back" + // the packet into the inbound processing path of the + // destination Port. 
+ guest_loopback(src_dev, dst_dev, key, out_pkt, postbox); + } else { + opte::engine::dbg!( + "underlay dest is same as src but the Port was not found \ + vni = {}, mac = {}", + vni.as_u32(), + ether_dst + ); + } return; } @@ -2086,6 +2746,63 @@ fn xde_mc_tx_one<'a>( return; }; + // Multicast interception: All packets (unicast and multicast) go + // through normal `port.process()` which applies router/firewall + // rules and uses M2P for multicast encapsulation. Here, we + // intercept multicast packets for replication to multiple next hops + // and local delivery to subscribers. + // + // Check if this is a multicast packet by examining the outer IPv6 + // destination. For multicast, OPTE should have set it to an + // ff0x:: address (via M2P table). + if ip6_dst.is_multicast() { + // This is a multicast packet, so we determine the inner + // destination from the packet contents or use a fallback + let inner_dst = inner_dst_ip.unwrap_or_else(|| { + // Fallback: derive from outer IPv6 multicast address + // For IPv4 multicast mapped to IPv6, the last 4 bytes + // contain the IPv4 address + if ip6_dst.bytes()[0] == 0xff && ip6_dst.bytes()[1] == 0x04 + { + // Admin-scoped IPv6 multicast, likely mapped from IPv4 + let bytes = ip6_dst.bytes(); + oxide_vpc::api::IpAddr::Ip4( + oxide_vpc::api::Ipv4Addr::from([ + bytes[12], bytes[13], bytes[14], bytes[15], + ]), + ) + } else { + // Use the IPv6 multicast address directly + oxide_vpc::api::IpAddr::Ip6(ip6_dst) + } + }); + + // Acquire locks lazily on first multicast packet. + // Once acquired, locks are held for the duration of Tx processing. 
+ let devs = + port_map.get_or_insert_with(|| src_dev.port_map.read()); + let fwd_table = + mcast_fwd.get_or_insert_with(|| src_dev.mcast_fwd.read()); + handle_mcast_tx( + MulticastTxContext { + inner_dst, + underlay_dst: ip6_dst, + vni, + out_pkt: &out_pkt, + encap_len, + inner_eth_len, + non_eth_payl_bytes, + tun_meoi: &tun_meoi, + l4_hash, + }, + src_dev, + postbox, + Some(devs), + fwd_table, + ); + return; + } + // 'MSS boosting' is performed here -- we set a 9k (minus overheads) // MSS for compatible TCP traffic. This is a kind of 'pseudo-GRO', // sending larger frames internally rather than having the NIC/OS @@ -2150,9 +2867,9 @@ fn xde_mc_tx_one<'a>( // Currently the overlay layer leaves the outer frame // destination and source zero'd. Ask IRE for the route // associated with the underlay destination. Then ask NCE - // for the mac associated with the IRE nexthop to fill in + // for the mac associated with the IRE next hop to fill in // the outer frame of the packet. Also return the underlay - // device associated with the nexthop + // device associated with the next hop // // As route lookups are fairly expensive, we can cache their // results for a given dst + entropy. These have a fairly tight @@ -2185,10 +2902,10 @@ fn xde_mc_tx_one<'a>( Ok(ProcessResult::Drop { .. }) => {} Ok(ProcessResult::Hairpin(hpkt)) => { - // From the theory statement, if we have a packet chain - // from above which contains a mixture of hairpin and local - // deliveries (`guest_loopback`) we can only deliver hairpin - // packets once `entry_state` is explicitly dropped. + // Hairpin packets are queued for later delivery. If we have a + // packet chain containing both hairpin and local deliveries + // (via `guest_loopback`), we defer hairpin delivery until after + // local delivery completes to avoid potential re-entrancy issues. 
hairpin_chain.append(hpkt); } @@ -2333,6 +3050,7 @@ fn new_port( name: String, cfg: &VpcCfg, vpc_map: Arc, + m2p: Arc, v2p: Arc, v2b: Arc, ectx: Arc, @@ -2353,10 +3071,10 @@ fn new_port( // XXX some layers have no need for LFT, perhaps have two types // of Layer: one with, one without? - gateway::setup(&pb, &cfg, vpc_map, FT_LIMIT_ONE, dhcp_cfg)?; + gateway::setup(&pb, &cfg, vpc_map.clone(), FT_LIMIT_ONE, dhcp_cfg)?; router::setup(&pb, &cfg, FT_LIMIT_ONE)?; nat::setup(&mut pb, &cfg, nat_ft_limit)?; - overlay::setup(&pb, &cfg, v2p, v2b, FT_LIMIT_ONE)?; + overlay::setup(&pb, &cfg, v2p, m2p, v2b, FT_LIMIT_ONE)?; // Set the overall unified flow and TCP flow table limits based on the total // configuration above, by taking the maximum of size of the individual @@ -2368,7 +3086,8 @@ fn new_port( let limit = NonZeroU32::new(FW_FT_LIMIT.get().max(nat_ft_limit.get())).unwrap(); let net = VpcNetwork { cfg }; - Ok(Arc::new(pb.create(net, limit, limit)?)) + let port = Arc::new(pb.create(net, limit, limit)?); + Ok(port) } #[unsafe(no_mangle)] @@ -2408,17 +3127,17 @@ unsafe extern "C" fn xde_rx( let mut count = 0; let mut len = 0; - // Acquire our own dev map -- this gives us access to prebuilt postboxes - // for all active ports. We don't worry about this changing for rx -- caller - // threads here (interrupt contexts, poll threads, fanout, worker threads) - // are all bound to a given CPU each by MAC. + // Hold the read lock on the per-CPU DevMap for the duration of Rx processing. + // This prevents port removal from completing until no Rx handler holds references. + // Management operations will block briefly during lock hold, but the critical + // section is bounded to packet processing time (swap Arc during refresh). 
let cpu_index = current_cpu().seq_id; - let cpu_state = stream.ports_map[cpu_index].devs.lock(); + let devmap = stream.ports_map[cpu_index].devs.lock(); let mut postbox = Postbox::new(); while let Some(pkt) = chain.pop_front() { if let Some(pkt) = - xde_rx_one(&stream.stream, pkt, &cpu_state, &mut postbox) + xde_rx_one(&stream.stream, pkt, &devmap, &mut postbox) { count += 1; len += pkt.byte_len(); @@ -2426,7 +3145,7 @@ unsafe extern "C" fn xde_rx( } } - cpu_state.deliver_all(postbox); + devmap.deliver_all(postbox); let (head, tail) = out_chain .unwrap_head_and_tail() @@ -2454,10 +3173,17 @@ unsafe extern "C" fn xde_rx( head } -/// Processes an individual packet receiver on the underlay device `stream`. +/// Processes an individual packet received on the underlay device `stream`. /// /// This function returns any input `pkt` which is not of interest to XDE (e.g., /// the packet is not Geneve over v6, or no matching OPTE port could be found). +/// +/// `xde_rx_one_direct` largely replicates this function due to lifetime issues +/// around parsing, so changes here may need to be made there too. We could do this +/// with a single function using an `enum` control parameter (e.g., +/// `DoMcastCheck(&DevMap)`, `DeliverDirect(&XdeDev, VniMac)`) but we'd be +/// really reliant on rustc interpreting these as static choices and inlining +/// accordingly. 
#[inline] fn xde_rx_one( stream: &DlsStream, @@ -2490,6 +3216,55 @@ fn xde_rx_one( let meta = parsed_pkt.meta(); let old_len = parsed_pkt.len(); + let ip6_dst = meta.outer_v6.destination(); + if ip6_dst.is_multicast() { + // Fast path: if no multicast subscribers exist, drop immediately + if !devs.has_mcast_subscribers() { + return None; + } + + let pullup_len = ( + &meta.outer_eth, + &meta.outer_v6, + &meta.outer_udp, + &meta.outer_encap, + &meta.inner_eth, + &meta.inner_l3, + &meta.inner_ulp, + ) + .packet_length(); + debug_assert!( + pullup_len > 0, + "pullup_len should be non-zero for valid multicast packet" + ); + let vni = meta.outer_encap.vni(); + + // Extract inner destination IP for multicast processing + let inner_dst = match &meta.inner_l3 { + ValidL3::Ipv4(v4) => oxide_vpc::api::IpAddr::from(v4.destination()), + ValidL3::Ipv6(v6) => oxide_vpc::api::IpAddr::from(v6.destination()), + }; + + // Drop the parsed packet before calling handle_mcast_rx + drop(parsed_pkt); + + // Handle multicast packets, delivering to local subscribers only + // (leaf node) + handle_mcast_rx( + MulticastRxContext { + inner_dst, + underlay_dst: ip6_dst, + vni, + pkt: &pkt, + pullup_len, + }, + stream, + devs, + postbox, + ); + return None; + } + let ulp_meoi = match meta.ulp_meoi(old_len) { Ok(ulp_meoi) => ulp_meoi, Err(e) => { @@ -2595,6 +3370,117 @@ fn xde_rx_one( None } +/// Processes an individual packet after multicast replication has taken place. +/// This primarily duplicates `xde_rx_one`. +/// +/// Lifetimes (around Packet etc.) will make this difficult to simplify +/// the expression of both this and its original implementation. We could insert +/// the body using macros, but then we really lose a lot (line numbers on crash, +/// subpar rust-analyzer integration)... 
+#[inline] +fn xde_rx_one_direct( + stream: &DlsStream, + dev: &XdeDev, + port_key: VniMac, + mut pkt: MsgBlk, + postbox: &mut Postbox, +) { + // TODO: it would be great if we could tell Ingot 'here are all the + // layer lengths/types, please believe that they are correct'. And then + // to plumb that through `NetworkParser`. I can't say that I *like* + // doing this reparse here post-replication. + let parser = VpcParser {}; + let parsed_pkt = Packet::parse_inbound(pkt.iter_mut(), parser) + .expect("this is a reparse of a known-valid packet"); + + let meta = parsed_pkt.meta(); + let old_len = parsed_pkt.len(); + + let ulp_meoi = match meta.ulp_meoi(old_len) { + Ok(ulp_meoi) => ulp_meoi, + Err(e) => { + opte::engine::dbg!("{}", e); + return; + } + }; + + let non_payl_bytes = u32::from(ulp_meoi.meoi_l2hlen) + + u32::from(ulp_meoi.meoi_l3hlen) + + u32::from(ulp_meoi.meoi_l4hlen); + + // Large TCP frames include their MSS in-band, as recipients can require + // this to correctly process frames which have been split into + // larger chunks. + // + // This will be set to a nonzero value when TSO has been asked of the + // source packet. + let is_tcp = matches!(meta.inner_ulp, ValidUlp::Tcp(_)); + let recovered_mss = if is_tcp { + let mut out = None; + for opt in WalkOptions::from_raw(&meta.outer_encap) { + let Ok(opt) = opt else { break }; + if let Some(ValidOxideOption::Mss(el)) = opt.option.known() { + out = NonZeroU32::new(el.mss()); + break; + } + } + out + } else { + None + }; + + // We are in passthrough mode, skip OPTE processing. 
+ if dev.passthrough { + drop(parsed_pkt); + postbox.post(port_key, pkt); + return; + } + + let port = &dev.port; + + let res = port.process(Direction::In, parsed_pkt); + + match res { + Ok(ProcessResult::Modified(emit_spec)) => { + let mut npkt = emit_spec.apply(pkt); + let len = npkt.byte_len(); + let pay_len = len + - usize::try_from(non_payl_bytes) + .expect("usize > 32b on x86_64"); + + // Due to possible pseudo-GRO, we need to inform mac/viona on how + // it can split up this packet, if the guest cannot receive it + // (e.g., no GRO/large frame support). + // HW_LSO will cause viona to treat this packet as though it were + // a locally delivered segment making use of LSO. + if let Some(mss) = recovered_mss + // This packet could be the last segment of a split frame at + // which point it could be smaller than the original MSS. + // Don't re-tag the MSS if so, as guests may be confused and + // MAC emulation will reject the packet if the guest does not + // support GRO. + && pay_len > usize::try_from(mss.get()).expect("usize > 32b on x86_64") + { + npkt.request_offload(MblkOffloadFlags::HW_LSO, mss.get()); + } + + if let Err(e) = npkt.fill_parse_info(&ulp_meoi, None) { + opte::engine::err!("failed to set offload info: {}", e); + } + + postbox.post(port_key, npkt); + } + Ok(ProcessResult::Hairpin(hppkt)) => { + stream.tx_drop_on_no_desc( + hppkt, + TxHint::NoneOrMixed, + MacTxFlags::empty(), + ); + } + _ => {} + } +} + #[unsafe(no_mangle)] fn add_router_entry_hdlr(env: &mut IoctlEnvelope) -> Result { let req: AddRouterEntryReq = env.copy_in_req()?; @@ -2682,6 +3568,68 @@ fn dump_v2p_hdlr() -> Result { Ok(state.vpc_map.dump()) } +#[unsafe(no_mangle)] +fn set_m2p_hdlr(env: &mut IoctlEnvelope) -> Result { + let req: SetMcast2PhysReq = env.copy_in_req()?; + + // Validation of admin-local IPv6 (ff04::/16) happens at deserialization + let underlay = req.underlay; + + // All multicast uses fleet-wide DEFAULT_MULTICAST_VNI (77) + let vni = 
Vni::new(DEFAULT_MULTICAST_VNI).unwrap(); + let state = get_xde_state(); + state.m2p.set(req.group, underlay); + + // DTrace: multicast map set + let (af, group_ptr): (usize, uintptr_t) = match req.group { + oxide_vpc::api::IpAddr::Ip4(v4) => { + (AF_INET as usize, AsRef::<[u8]>::as_ref(&v4).as_ptr() as uintptr_t) + } + oxide_vpc::api::IpAddr::Ip6(v6) => ( + AF_INET6 as usize, + AsRef::<[u8]>::as_ref(&v6).as_ptr() as uintptr_t, + ), + }; + __dtrace_probe_mcast__map__set( + af as uintptr_t, + group_ptr, + &underlay.addr(), + vni.as_u32() as uintptr_t, + ); + Ok(NoResp::default()) +} + +#[unsafe(no_mangle)] +fn clear_m2p_hdlr(env: &mut IoctlEnvelope) -> Result { + let req: ClearMcast2PhysReq = env.copy_in_req()?; + + // Validation of admin-local IPv6 (ff04::/16) happens at deserialization + let underlay = req.underlay; + + // All multicast uses fleet-wide DEFAULT_MULTICAST_VNI (77) + let vni = Vni::new(DEFAULT_MULTICAST_VNI).unwrap(); + let state = get_xde_state(); + state.m2p.remove(&req.group); + + // DTrace: multicast map clear + let (af, group_ptr): (usize, uintptr_t) = match req.group { + oxide_vpc::api::IpAddr::Ip4(v4) => { + (AF_INET as usize, AsRef::<[u8]>::as_ref(&v4).as_ptr() as uintptr_t) + } + oxide_vpc::api::IpAddr::Ip6(v6) => ( + AF_INET6 as usize, + AsRef::<[u8]>::as_ref(&v6).as_ptr() as uintptr_t, + ), + }; + __dtrace_probe_mcast__map__clear( + af as uintptr_t, + group_ptr, + &underlay.addr(), + vni.as_u32() as uintptr_t, + ); + Ok(NoResp::default()) +} + #[unsafe(no_mangle)] fn set_v2b_hdlr(env: &mut IoctlEnvelope) -> Result { let req: SetVirt2BoundaryReq = env.copy_in_req()?; @@ -2704,6 +3652,358 @@ fn dump_v2b_hdlr() -> Result { Ok(state.v2b.dump()) } +#[unsafe(no_mangle)] +fn set_mcast_forwarding_hdlr( + env: &mut IoctlEnvelope, +) -> Result { + let req: SetMcastForwardingReq = env.copy_in_req()?; + let state = get_xde_state(); + + // Validation of admin-local IPv6 (ff04::/16) happens at deserialization + let underlay = req.underlay; + + // 
Fleet-level multicast: enforce DEFAULT_MULTICAST_VNI for all replication modes. + // NextHopV6.addr must be unicast (switch address for routing). + // The packet will be sent to the multicast address (req.underlay). + for (next_hop, _rep) in &req.next_hops { + if next_hop.vni.as_u32() != DEFAULT_MULTICAST_VNI { + return Err(OpteError::System { + errno: EINVAL, + msg: format!( + "multicast next hop VNI must be DEFAULT_MULTICAST_VNI ({DEFAULT_MULTICAST_VNI}), got: {}", + next_hop.vni.as_u32() + ), + }); + } + + // NextHopV6.addr must be unicast (the switch endpoint for routing). + // The actual packet destination is the multicast address (req.underlay). + if next_hop.addr.is_multicast() { + return Err(OpteError::System { + errno: EINVAL, + msg: format!( + "NextHopV6.addr must be unicast (switch address), got multicast: {}", + next_hop.addr + ), + }); + } + } + + // Record next hop count before consuming the vector + let next_hop_count = req.next_hops.len(); + + let token = state.management_lock.lock(); + { + let mut mcast_fwd = token.mcast_fwd.write(); + + // Get or create the next hop map for this underlay address + let next_hop_map = + mcast_fwd.entry(underlay).or_insert_with(BTreeMap::new); + + // Insert/update next hops: same next hop addr → replace replication mode, + // different next hop addr → add new entry (like `swadm route add`) + for (next_hop, rep) in req.next_hops { + next_hop_map.insert(next_hop, rep); + } + + drop(mcast_fwd); + } + + // Refresh cached copies in all ports and underlay devices + { + let devs = token.devs.write(); + if let Some(underlay) = token.underlay.as_ref() { + refresh_maps(devs, underlay, &token.mcast_fwd); + } + } + + // DTrace: forwarding set + __dtrace_probe_mcast__fwd__set( + &underlay.addr(), + next_hop_count as uintptr_t, + DEFAULT_MULTICAST_VNI as uintptr_t, + ); + + Ok(NoResp::default()) +} + +#[unsafe(no_mangle)] +fn clear_mcast_forwarding_hdlr( + env: &mut IoctlEnvelope, +) -> Result { + let req: ClearMcastForwardingReq 
= env.copy_in_req()?; + let state = get_xde_state(); + + // Validation of admin-local IPv6 (ff04::/16) happens at deserialization + let underlay = req.underlay; + + let token = state.management_lock.lock(); + { + let mut mcast_fwd = token.mcast_fwd.write(); + mcast_fwd.remove(&underlay); + drop(mcast_fwd); + } + + // Refresh cached copies in all ports and underlay devices + { + let devs = token.devs.write(); + if let Some(underlay) = token.underlay.as_ref() { + refresh_maps(devs, underlay, &token.mcast_fwd); + } + } + + // DTrace: forwarding clear + __dtrace_probe_mcast__fwd__clear( + &underlay.addr(), + DEFAULT_MULTICAST_VNI as uintptr_t, + ); + + Ok(NoResp::default()) +} + +#[unsafe(no_mangle)] +fn dump_mcast_forwarding_hdlr() -> Result { + let state = get_xde_state(); + + let token = state.management_lock.lock(); + let mcast_fwd = token.mcast_fwd.read(); + + let entries: Vec = mcast_fwd + .iter() + .map(|(underlay, next_hops)| McastForwardingEntry { + underlay: *underlay, + next_hops: next_hops + .iter() + .map(|(next_hop, rep)| (*next_hop, *rep)) + .collect(), + }) + .collect(); + + Ok(DumpMcastForwardingResp { entries }) +} + +fn dump_mcast_subscriptions_hdlr() +-> Result { + let state = get_xde_state(); + let token = state.management_lock.lock(); + let devs = token.devs.read(); + + let mut entries: alloc::vec::Vec = + alloc::vec::Vec::new(); + for (underlay, ports) in devs.dump_mcast_subscriptions().into_iter() { + entries.push(McastSubscriptionEntry { underlay, ports }); + } + + Ok(DumpMcastSubscriptionsResp { entries }) +} + +#[unsafe(no_mangle)] +fn mcast_subscribe_hdlr(env: &mut IoctlEnvelope) -> Result { + let req: McastSubscribeReq = env.copy_in_req()?; + let state = get_xde_state(); + + // Update under management lock so we can refresh DevMap views used by Tx/Rx + let token = state.management_lock.lock(); + { + let mut devs = token.devs.write(); + // Subscriptions are keyed on the underlay (outer) IPv6 multicast address. 
+ // If the caller supplied an overlay group, translate it via the M2P table. + // First, reject non-multicast inputs to preserve DevMap error semantics. + if !req.group.is_multicast() { + return Err(OpteError::BadState(format!( + "IP address {} is not a multicast address", + req.group + ))); + } + let group_key = match req.group { + oxide_vpc::api::IpAddr::Ip6(ip6) => { + // If an overlay->underlay mapping exists, use it; otherwise, if the + // provided address is already an admin-scoped multicast (ff04::/16), + // accept it as-is. Otherwise, reject. + if let Some(underlay_group) = + state.m2p.get(&oxide_vpc::api::IpAddr::Ip6(ip6)) + { + underlay_group + } else if let Ok(underlay_group) = MulticastUnderlay::new(ip6) { + underlay_group + } else { + return Err(OpteError::BadState( + "no underlay mapping for IPv6 multicast group".into(), + )); + } + } + oxide_vpc::api::IpAddr::Ip4(_v4) => { + // IPv4 overlay groups must have an M2P mapping; the subscription key + // is the underlay IPv6 multicast. Without a mapping, reject with + // a clear message (callers may rely on this distinction). 
+ if let Some(underlay_group) = state.m2p.get(&req.group) { + underlay_group + } else { + return Err(OpteError::BadState( + "no underlay mapping for IPv4 multicast group".into(), + )); + } + } + }; + + devs.mcast_subscribe(&req.port_name, group_key)?; + + // DTrace: subscribe + let (af, group_ptr): (usize, uintptr_t) = ( + AF_INET6 as usize, + AsRef::<[u8]>::as_ref(&group_key.addr()).as_ptr() as uintptr_t, + ); + if let Ok(port_cstr) = CString::new(req.port_name.clone()) { + __dtrace_probe_mcast__subscribe( + port_cstr.as_ptr() as uintptr_t, + af as uintptr_t, + group_ptr, + DEFAULT_MULTICAST_VNI as uintptr_t, + ); + } + refresh_maps( + devs, + token + .underlay + .as_ref() + .expect("underlay must exist while ports exist"), + &token.mcast_fwd, + ); + } + + Ok(NoResp::default()) +} + +#[unsafe(no_mangle)] +fn mcast_unsubscribe_hdlr( + env: &mut IoctlEnvelope, +) -> Result { + let req: McastUnsubscribeReq = env.copy_in_req()?; + let state = get_xde_state(); + + // Update under management lock so we can refresh DevMap views used by Tx/Rx + let token = state.management_lock.lock(); + { + let mut devs = token.devs.write(); + + // Verify the port exists, maintaining consistency with other operations + // and ensuring we're not silently accepting operations on non-existent + // ports. This check happens before M2P translation to provide clear + // error semantics. + if devs.get_by_name(&req.port_name).is_none() { + return Err(OpteError::PortNotFound(req.port_name.clone())); + } + + // Reject non-multicast input to preserve API use and match subscribe + // semantics. + if !req.group.is_multicast() { + return Err(OpteError::BadState(format!( + "IP address {} is not a multicast address", + req.group + ))); + } + + // Translate overlay group to underlay IPv6 if M2P mapping exists. + // For unsubscribe, if no M2P mapping exists, we return success (no-op). 
+ // This makes unsubscribe idempotent and handles cleanup race conditions + // where M2P mappings may be removed before unsubscribe is called. + let Some(group_key) = state.m2p.get(&req.group) else { + refresh_maps( + devs, + token + .underlay + .as_ref() + .expect("underlay must exist while ports exist"), + &token.mcast_fwd, + ); + return Ok(NoResp::default()); + }; + + devs.mcast_unsubscribe(&req.port_name, group_key)?; + // DTrace: unsubscribe + let (af, group_ptr): (usize, uintptr_t) = ( + AF_INET6 as usize, + AsRef::<[u8]>::as_ref(&group_key.addr()).as_ptr() as uintptr_t, + ); + if let Ok(port_cstr) = CString::new(req.port_name.clone()) { + __dtrace_probe_mcast__unsubscribe( + port_cstr.as_ptr() as uintptr_t, + af as uintptr_t, + group_ptr, + DEFAULT_MULTICAST_VNI as uintptr_t, + ); + } + refresh_maps( + devs, + token + .underlay + .as_ref() + .expect("underlay must exist while ports exist"), + &token.mcast_fwd, + ); + } + + Ok(NoResp::default()) +} + +#[unsafe(no_mangle)] +fn mcast_unsubscribe_all_hdlr( + env: &mut IoctlEnvelope, +) -> Result { + let req: McastUnsubscribeAllReq = env.copy_in_req()?; + let state = get_xde_state(); + + // Update under management lock so we can refresh DevMap views used by Tx/Rx + let token = state.management_lock.lock(); + { + let mut devs = token.devs.write(); + + // Reject non-multicast input + if !req.group.is_multicast() { + return Err(OpteError::BadState(format!( + "IP address {} is not a multicast address", + req.group + ))); + } + + // Translate overlay group to underlay IPv6 if M2P mapping exists. + // For unsubscribe-all, if no M2P mapping exists, we return success (no-op). 
+ let Some(group_key) = state.m2p.get(&req.group) else { + refresh_maps( + devs, + token + .underlay + .as_ref() + .expect("underlay must exist while ports exist"), + &token.mcast_fwd, + ); + return Ok(NoResp::default()); + }; + + devs.mcast_unsubscribe_all(group_key); + // DTrace: unsubscribe-all + let (af, group_ptr): (usize, uintptr_t) = ( + AF_INET6 as usize, + AsRef::<[u8]>::as_ref(&group_key.addr()).as_ptr() as uintptr_t, + ); + __dtrace_probe_mcast__unsubscribe__all( + af as uintptr_t, + group_ptr, + DEFAULT_MULTICAST_VNI as uintptr_t, + ); + refresh_maps( + devs, + token + .underlay + .as_ref() + .expect("underlay must exist while ports exist"), + &token.mcast_fwd, + ); + } + + Ok(NoResp::default()) +} + #[unsafe(no_mangle)] fn list_layers_hdlr( env: &mut IoctlEnvelope,