eqvinox · kaminek · Jun 6, 2017 · Feb 14, 2016 · Feb 15, 2016 · Feb 15, 2016
diff --git a/Documentation/networking/bridge-subport.txt b/Documentation/networking/bridge-subport.txt
@@ -0,0 +1,154 @@
+Bridge subport & superport support
+==================================
+
+
+Subports and superports are 2 related but distinct port functionalities of the
+Linux bridge code.  They both affect how packets are replicated on a bridge,
+implementing split-horizon functionality.
+
+
+Superports
+----------
+
+A superport is a number of bridge ports that share the same integer
+"superport" value different from zero.  The value is freely configurable by
+the user, and 0 disables the function (by comparing unequal to itself, thus
+meaning "no superport").
+
+When 2 or more bridge ports are configured with the same value, this prevents
+forwarding of packets that arrive on any of these ports out onto any other of
+these ports.  This essentially groups them into a common broadcast domain.  It
+has no effect on MAC learning, STP, or anything else.
+
+This functionality is mostly useful to build a mesh on top of tunnels, e.g.
+a triangle like this:
+
+   bridge0  tunl01 ----- tunl10 bridge1
+      tunl02                   tunl12
+           \\\               ///
+             tunl20     tunl21
+                  bridge2
+
+
+While this can be made to work with STP by blocking one of the tunnels, this
+is not desirable because that would make traffic take an extra hop.  By
+putting the ports in a superport group on each of the bridges, traffic always
+flows directly to the learned destination, without creating loops by being
+forwarded back onto the mesh.
+
+The superport code does this on top of distinct devices, which could even be
+of distinct types (physical ports, tap devices, GRETAP, VXLAN, etc.)
+
+
+Subports
+--------
+
+Subports provide very similar functionality, but are built into a particular
+network device driver (or tunnel implementation).  They are integer values
+again, but this time they provide more specific data for the driver when
+sending packets.
+
+The bridge layer makes no assumptions about the meaning of the values (other
+than removing duplicates).  They cannot be configured by the user, instead
+they are provided to the bridge layer on each received packet as appropriate.
+The bridge keeps this information alongside its MAC learning data and provides it
+back to the driver when sending/flooding packets.
+
+This is considerably more complicated than superports, but provides a crucial
+functionality that superports cannot:  the ability to control multicast
+transmission.
+
+The idea here is that the bridge's TX path for multicast packets can pass a
+list of subport identifiers down to the device, representing which ethernet
+stations are intended to receive the particular packet.  This is particularly
+useful in two scenarios:
+
+1.) 802.11 multicast optimization
+
+   When the 802.11 TX layer knows which stations a multicast packet is
+   actually intended for, it can do the following things:
+
+   - unicast it unconditionally if there is only a single receiver
+   - clone and unicast it, with enough information supplied to an algorithm
+     that can calculate whether it is advantageous to do so
+   - multicast and pick a higher TX rate depending on the information it has
+     on the intended receivers
+
+   All of these are available in "enterprise" 802.11 solutions, yet have
+   eluded Linux wifi APs for a while;  only recently has unicast-conversion
+   made it in (though in a much less sophisticated way).
+
+
+2.) Ethernet over Multicast-capable media tunneling
+
+   Any encapsulation of Ethernet with more than 2 endpoints that is running on
+   top of an underlay network that supports multicast may be able to benefit
+   from having the extra information.
+
+   Most prominently, the under-development IETF BIER approach (it's a shim
+   header with bits controlling replication;  packets are duplicated en route
+   so that each outgoing duplicate carries a non-overlapping subset of the
+   bits) is a direct fit for this -- the subport information can directly map
+   to bits in the BIER shim.
+
+   Other options include user-configured IP multicast group mappings and MPLS
+   multicast (which is not widely used, but exists nonetheless).
+
+
+The common factor between these scenarios is that it is the driver that
+controls replication of multicast packets.  This is impossible to do with
+superports, since that implies multiple distinct netdevices and the bridge
+layer replicating packets.  It's too late then, when 10 netdevices get 10
+packets to transmit.
+
+
+BRANCH IMPLEMENTATION STATUS (TO BE REMOVED BEFORE PUSHING OUT)
+---------------------------------------------------------------
+(including VPLS bits)
+
+
+So, I wrote this over Christmas at a hacker event where I had some time to
+play with a VPLS network ;).  Here's the status on things.
+
+General:
+
+- I've just rebased these commits;  there were a few small changes to the
+  network code in the meantime (I don't think anything broke)
+
+  I DID NOT TEST THE REBASED VERSION.  IT COMPILES BUT MAY SET YOUR COMPUTER
+  ON FIRE.
+
+MPLS layer:
+
+- the "MPT_HANDLER" thing is probably overkill, it likely makes more sense to
+  tie in the VPLS code more directly.
+
+VPLS:
+
+- I haven't implemented the control word.
+
+- I made a design mistake with the wire ID.  It's simply not needed.  A
+  pseudowire can be identified by its incoming label.  There is also some
+  really ugly code keeping an array of wires...
+
+- The genetlink interface is not needed either.  Really, the MPLS routing
+  table can carry all of the information.  You can create a "vpls0" device
+  without any pseudowires, and then add routes with the necessary info:
+
+  "ip -f mpls route add 100 vpls vpls0 as 200 via inet 1.2.3.4 dev eth0"
+
+  This would need the addition of a new "vpls-device" netlink attribute, and
+  would "split" the information in the route between RX and TX - the RX path
+  uses the incoming label (100) and vpls-device (vpls0) information, giving
+  the packet to the proper vpls device.  The TX path can then reuse all the
+  destination information on the route -- which would make things quite a bit
+  simpler because we can use existing handling.
+
+- I only hacked in IPv4 dst info, this would be fixed by the above.
+
+bridge:
+
+- the subport code is not finished, it doesn't actually do the multicast
+  pieces.  It should hold an array of subport indexes + an array of lifetimes
+  (subport IDs need to age out like ports), and the array can be passed down
+  on TX with RCU semantics.
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
@@ -732,6 +732,12 @@ struct sk_buff {
 	__u32		secmark;
 #endif
 
+	unsigned		subport_cnt;
+	union {
+		__u32		subport;
+		__u32		*subport_lst;
+	};
+
 	union {
 		__u32		mark;
 		__u32		reserved_tailroom;

diff --git a/include/net/mpls.h b/include/net/mpls.h
@@ -33,4 +33,15 @@ static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb)
 {
 	return (struct mpls_shim_hdr *)skb_network_header(skb);
 }
+
+struct mpls_shim_hdr;
+typedef int (*mpls_handler)(void *arg, struct sk_buff *skb,
+			    struct net_device *dev, struct packet_type *pt,
+			    struct mpls_shim_hdr *hdr,
+			    struct net_device *orig_dev);
+
+extern int mpls_handler_add(struct net *net, unsigned index,
+			    mpls_handler handler, void *handler_arg);
+extern int mpls_handler_del(struct net *net, unsigned index);
+
 #endif
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
@@ -325,6 +325,7 @@ enum {
 	IFLA_BRPORT_MCAST_TO_UCAST,
 	IFLA_BRPORT_VLAN_TUNNEL,
 	IFLA_BRPORT_BCAST_FLOOD,
+	IFLA_BRPORT_SUPERPORT,	/* superport (split-horizon) ID */
 	__IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)

diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h
@@ -27,6 +27,7 @@ enum {
 	NDA_MASTER,
 	NDA_LINK_NETNSID,
 	NDA_SRC_VNI,
+	NDA_SUBPORT,
 	__NDA_MAX
 };
 

diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
@@ -80,6 +80,8 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 		else
 			br_flood(br, skb, BR_PKT_MULTICAST, false, true);
 	} else if ((dst = br_fdb_find_rcu(br, dest, vid)) != NULL) {
+		skb->subport = dst->subport;
+		skb->subport_cnt = dst->subport ? 1 : 0;
 		br_forward(dst->dst, skb, false, true);
 	} else {
 		br_flood(br, skb, BR_PKT_UNICAST, false, true);

diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
@@ -29,7 +29,7 @@
 
 static struct kmem_cache *br_fdb_cache __read_mostly;
 static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
-		      const unsigned char *addr, u16 vid);
+		      __u32 subport, const unsigned char *addr, u16 vid);
 static void fdb_notify(struct net_bridge *br,
 		       const struct net_bridge_fdb_entry *, int);
 
@@ -278,7 +278,7 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
 
 insert:
 	/* insert new address,  may fail if invalid address or dup. */
-	fdb_insert(br, p, newaddr, 0);
+	fdb_insert(br, p, 0, newaddr, 0);
 
 	if (!vg || !vg->num_vlans)
 		goto done;
@@ -288,7 +288,7 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
 	 * from under us.
 	 */
 	list_for_each_entry(v, &vg->vlan_list, vlist)
-		fdb_insert(br, p, newaddr, v->vid);
+		fdb_insert(br, p, 0, newaddr, v->vid);
 
 done:
 	spin_unlock_bh(&br->hash_lock);
@@ -307,10 +307,11 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
 	if (f && f->is_local && !f->dst && !f->added_by_user)
 		fdb_delete_local(br, NULL, f);
 
-	fdb_insert(br, NULL, newaddr, 0);
+	fdb_insert(br, NULL, 0, newaddr, 0);
 	vg = br_vlan_group(br);
 	if (!vg || !vg->num_vlans)
 		goto out;
+
 	/* Now remove and add entries for every VLAN configured on the
 	 * bridge.  This function runs under RTNL so the bitmap will not
 	 * change from under us.
@@ -321,7 +322,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
 		f = br_fdb_find(br, br->dev->dev_addr, v->vid);
 		if (f && f->is_local && !f->dst && !f->added_by_user)
 			fdb_delete_local(br, NULL, f);
-		fdb_insert(br, NULL, newaddr, v->vid);
+		fdb_insert(br, NULL, 0, newaddr, v->vid);
 	}
 out:
 	spin_unlock_bh(&br->hash_lock);
@@ -479,6 +480,8 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf,
 			fe->port_no = f->dst->port_no;
 			fe->port_hi = f->dst->port_no >> 8;
 
+			fe->unused = f->subport;
+
 			fe->is_local = f->is_local;
 			if (!f->is_static)
 				fe->ageing_timer_value = jiffies_delta_to_clock_t(jiffies - f->updated);
@@ -495,6 +498,7 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf,
 
 static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
 					       struct net_bridge_port *source,
+					       __u32 subport,
 					       const unsigned char *addr,
 					       __u16 vid,
 					       unsigned char is_local,
@@ -506,6 +510,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
 	if (fdb) {
 		memcpy(fdb->addr.addr, addr, ETH_ALEN);
 		fdb->dst = source;
+		fdb->subport = subport;
 		fdb->vlan_id = vid;
 		fdb->is_local = is_local;
 		fdb->is_static = is_static;
@@ -518,7 +523,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
 }
 
 static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
-		  const unsigned char *addr, u16 vid)
+		      __u32 subport, const unsigned char *addr, u16 vid)
 {
 	struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
 	struct net_bridge_fdb_entry *fdb;
@@ -538,7 +543,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 		fdb_delete(br, fdb);
 	}
 
-	fdb = fdb_create(head, source, addr, vid, 1, 1);
+	fdb = fdb_create(head, source, subport, addr, vid, 1, 1);
 	if (!fdb)
 		return -ENOMEM;
 
@@ -554,13 +559,14 @@ int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 	int ret;
 
 	spin_lock_bh(&br->hash_lock);
-	ret = fdb_insert(br, source, addr, vid);
+	ret = fdb_insert(br, source, 0, addr, vid);
 	spin_unlock_bh(&br->hash_lock);
 	return ret;
 }
 
 void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
-		   const unsigned char *addr, u16 vid, bool added_by_user)
+		   __u32 subport, const unsigned char *addr, u16 vid,
+		   bool added_by_user)
 {
 	struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
 	struct net_bridge_fdb_entry *fdb;
@@ -586,8 +592,10 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 			unsigned long now = jiffies;
 
 			/* fastpath: update of existing entry */
-			if (unlikely(source != fdb->dst)) {
+			if (unlikely(source != fdb->dst ||
+			    subport != fdb->subport)) {
 				fdb->dst = source;
+				fdb->subport = subport;
 				fdb_modified = true;
 				/* Take over HW learned entry */
 				if (unlikely(fdb->added_by_external_learn))
@@ -603,7 +611,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 	} else {
 		spin_lock(&br->hash_lock);
 		if (likely(!fdb_find_rcu(head, addr, vid))) {
-			fdb = fdb_create(head, source, addr, vid, 0, 0);
+			fdb = fdb_create(head, source, subport, addr, vid, 0, 0);
 			if (fdb) {
 				if (unlikely(added_by_user))
 					fdb->added_by_user = 1;
@@ -665,6 +673,8 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br,
 
 	if (fdb->vlan_id && nla_put(skb, NDA_VLAN, sizeof(u16), &fdb->vlan_id))
 		goto nla_put_failure;
+	if (fdb->subport && nla_put(skb, NDA_SUBPORT, sizeof(u32), &fdb->subport))
+		goto nla_put_failure;
 
 	nlmsg_end(skb, nlh);
 	return 0;
@@ -791,7 +801,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
 		if (!(flags & NLM_F_CREATE))
 			return -ENOENT;
 
-		fdb = fdb_create(head, source, addr, vid, 0, 0);
+		fdb = fdb_create(head, source, 0, addr, vid, 0, 0);
 		if (!fdb)
 			return -ENOMEM;
 
@@ -854,7 +864,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br,
 		}
 		local_bh_disable();
 		rcu_read_lock();
-		br_fdb_update(br, p, addr, vid, true);
+		br_fdb_update(br, p, 0, addr, vid, true);
 		rcu_read_unlock();
 		local_bh_enable();
 	} else if (ndm->ndm_flags & NTF_EXT_LEARNED) {
@@ -1081,7 +1091,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
 	head = &br->hash[br_mac_hash(addr, vid)];
 	fdb = br_fdb_find(br, addr, vid);
 	if (!fdb) {
-		fdb = fdb_create(head, p, addr, vid, 0, 0);
+		fdb = fdb_create(head, p, 0, addr, vid, 0, 0);
 		if (!fdb) {
 			err = -ENOMEM;
 			goto err_unlock;

diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
@@ -25,12 +25,14 @@
 static inline int should_deliver(const struct net_bridge_port *p,
 				 const struct sk_buff *skb)
 {
+	struct net_bridge_port *from = br_port_get_rcu(skb->dev);
 	struct net_bridge_vlan_group *vg;
 
 	vg = nbp_vlan_group_rcu(p);
 	return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) &&
 		br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING &&
-		nbp_switchdev_allowed_egress(p, skb);
+		nbp_switchdev_allowed_egress(p, skb) &&
+		(!p->superport || !from || p->superport != from->superport);
 }
 
 int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)