From 5e285775f250ae5939f760bb38a2988e1678a709 Mon Sep 17 00:00:00 2001 From: David Lamparter Date: Tue, 6 Jun 2017 23:22:16 +0200 Subject: [PATCH 1/9] Documentation: bridge sub/superports Signed-off-by: David Lamparter --- Documentation/networking/bridge-subport.txt | 102 ++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 Documentation/networking/bridge-subport.txt diff --git a/Documentation/networking/bridge-subport.txt b/Documentation/networking/bridge-subport.txt new file mode 100644 index 0000000000000..5f2803f2cebf3 --- /dev/null +++ b/Documentation/networking/bridge-subport.txt @@ -0,0 +1,102 @@ +Bridge subport & superport support +================================== + + +Subports and superports are 2 related but distinct port functionalities of the +Linux bridge code. They both affect how packets are replicated on a bridge, +implementing split-horizon functionality. + + +Superports +---------- + +A superport is a number of bridge ports that share the same integer +"superport" value different from zero. The value is freely configurable by +the user, and 0 disables the function (by comparing unequal to itself, thus +meaning "no superport"). + +When 2 or more bridge ports are configured with the same value, this prevents +forwarding of packets that arrive on any of these ports out onto any other of +these ports. This essentially groups them into a common broadcast domain. It +has no effect on MAC learning, STP, or anything else. + +This functionality is mostly useful to build a mesh on top of tunnels, e.g. +a triangle like this: + + bridge0 tunl01 ----- tunl10 bridge1 + tunl02 tunl12 + \\\ /// + tunl20 tunl21 + bridge2 + + +While this can be made to work with STP by blocking one of the tunnels, this +is not desirable because that would make traffic take an extra hop. 
By +putting the ports in a superport group on each of the bridges, traffic always +flows directly to the learned destination, without creating loops by being +forwarded back onto the mesh. + +The superport code does this on top of distinct devices, which could even be +of distinct types (physical ports, tap devices, GRETAP, VXLAN, etc.) + + +Subports +-------- + +Subports provide very similar functionality, but are built into a particular +network device driver (or tunnel implementation). They are integer values +again, but this time they provide more specific data for the driver when +sending packets. + +The bridge layer makes no assumptions about the meaning of the values (other +than removing duplicates). They cannot be configured by the user; instead +they are provided to the bridge layer on each received packet as appropriate. +The bridge keeps this information along its MAC learning data and provides it +back to the driver when sending/flooding packets. + +This is considerably more complicated than superports, but provides a crucial +functionality that superports cannot: the ability to control multicast +transmission. + +The idea here is that the bridge's TX path for multicast packets can pass a +list of subport identifiers down to the device, representing which ethernet +stations are intended to receive the particular packet. This is particularly +useful in two scenarios: + +1.) 
802.11 multicast optimization + + When the 802.11 TX layer knows which stations a multicast packet is + actually intended for, it can do the following things: + + - unicast it unconditionally if there is only a single receiver + - clone and unicast it, with enough information supplied to an algorithm + that can calculate whether it is advantageous to do so + - multicast and pick a higher TX rate depending on the information it has + on the intended receivers + + All of these are available in "enterprise" 802.11 solutions, yet have + eluded Linux wifi APs for a while; only recently has unicast-conversion + made it in (though in a much less sophisticated way). + + +2.) Ethernet over Multicast-capable media tunneling + + Any encapsulation of Ethernet with more than 2 endpoints that is running on + top of an underlay network that supports multicast may be able to benefit + from having the extra information. + + Most prominently, the under-development IETF BIER approach (it's a shim + header with bits controlling replication; packets are duplicated en route + so that each outgoing duplicate carries a non-overlapping subset of the + bits) is a direct fit for this -- the subport information can directly map + to bits in the BIER shim. + + Other options include user-configured IP multicast group mappings and MPLS + multicast (which is not widely used, but well). + + +The common factor between these scenarios is that it is the driver that +controls replication of multicast packets. This is impossible to do with +superports, since that implies multiple distinct netdevices and the bridge +layer replicating packets. It's too late then, when 10 netdevices get 10 +packets to transmit. 
From a4c57d3dd8fb490cad1f6a9b057f95d87cde3ec4 Mon Sep 17 00:00:00 2001 From: David Lamparter Date: Mon, 15 Feb 2016 00:52:32 +0100 Subject: [PATCH 2/9] bridge: add "superport" split-horizon concept This adds device-agnostic support for split horizon bridging, which is a fancy name for saying "multiple member ports are treated as one for loop avoidance." Signed-off-by: David Lamparter --- include/uapi/linux/if_link.h | 1 + net/bridge/br_forward.c | 4 +++- net/bridge/br_netlink.c | 9 ++++++++- net/bridge/br_private.h | 2 ++ net/bridge/br_sysfs_if.c | 13 +++++++++++++ 5 files changed, 27 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 8ed679fe603fc..bf9b260b69ae9 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -325,6 +325,7 @@ enum { IFLA_BRPORT_MCAST_TO_UCAST, IFLA_BRPORT_VLAN_TUNNEL, IFLA_BRPORT_BCAST_FLOOD, + IFLA_BRPORT_SUPERPORT, /* superport (split-horizon) ID */ __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 48fb17417fac3..12620f5117fb2 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -25,12 +25,14 @@ static inline int should_deliver(const struct net_bridge_port *p, const struct sk_buff *skb) { + struct net_bridge_port *from = br_port_get_rcu(skb->dev); struct net_bridge_vlan_group *vg; vg = nbp_vlan_group_rcu(p); return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) && br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING && - nbp_switchdev_allowed_egress(p, skb); + nbp_switchdev_allowed_egress(p, skb) && + (!p->superport || !from || p->superport != from->superport); } int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 1e63ec466d7c7..13682726c947a 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -152,6 +152,7 @@ static inline 
size_t br_port_info_size(void) #ifdef CONFIG_BRIDGE_IGMP_SNOOPING + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */ #endif + + nla_total_size(4) /* IFLA_BRPORT_SUPERPORT */ + 0; } @@ -208,7 +209,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, p->topology_change_ack) || nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending) || nla_put_u8(skb, IFLA_BRPORT_VLAN_TUNNEL, !!(p->flags & - BR_VLAN_TUNNEL))) + BR_VLAN_TUNNEL)) || + nla_put_u32(skb, IFLA_BRPORT_SUPERPORT, p->superport)) return -EMSGSIZE; timerval = br_timer_value(&p->message_age_timer); @@ -637,6 +639,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_MCAST_TO_UCAST] = { .type = NLA_U8 }, [IFLA_BRPORT_MCAST_FLOOD] = { .type = NLA_U8 }, [IFLA_BRPORT_BCAST_FLOOD] = { .type = NLA_U8 }, + [IFLA_BRPORT_SUPERPORT] = { .type = NLA_U32 }, }; /* Change the state of the port and notify spanning tree */ @@ -728,6 +731,10 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) return err; } #endif + + if (tb[IFLA_BRPORT_SUPERPORT]) + p->superport = nla_get_u32(tb[IFLA_BRPORT_SUPERPORT]); + br_port_flags_change(p, old_flags ^ p->flags); return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 20626927f4336..43d968e1e565f 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -244,6 +244,8 @@ struct net_bridge_port { struct kobject kobj; struct rcu_head rcu; + u32 superport; + #ifdef CONFIG_BRIDGE_IGMP_SNOOPING struct bridge_mcast_own_query ip4_own_query; #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 5d5d413a6cf8a..293b95d167367 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -175,6 +175,18 @@ BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI); BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD); BRPORT_ATTR_FLAG(broadcast_flood, BR_BCAST_FLOOD); +static ssize_t show_superport(struct net_bridge_port *p, char *buf) +{ + return 
sprintf(buf, "%u\n", p->superport); +} +static int br_set_superport(struct net_bridge_port *p, unsigned long val) +{ + p->superport = val; + return 0; +} +static BRPORT_ATTR(superport, S_IRUGO | S_IWUSR, + show_superport, br_set_superport); + #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) { @@ -214,6 +226,7 @@ static const struct brport_attribute *brport_attrs[] = { &brport_attr_root_block, &brport_attr_learning, &brport_attr_unicast_flood, + &brport_attr_superport, #ifdef CONFIG_BRIDGE_IGMP_SNOOPING &brport_attr_multicast_router, &brport_attr_multicast_fast_leave, From 4e3d56478e104cd3acdf63143abac08ba28f7451 Mon Sep 17 00:00:00 2001 From: David Lamparter Date: Mon, 15 Feb 2016 16:22:58 +0100 Subject: [PATCH 3/9] bridge: add "subport" concept This implements holding subport information in the bridge layer, but only for unicast entries in the MAC table. Multicast is still left to implement. Signed-off-by: David Lamparter --- include/linux/skbuff.h | 6 ++++++ include/uapi/linux/neighbour.h | 1 + net/bridge/br_device.c | 2 ++ net/bridge/br_fdb.c | 38 +++++++++++++++++++++------------- net/bridge/br_input.c | 11 ++++++++-- net/bridge/br_private.h | 4 +++- 6 files changed, 45 insertions(+), 17 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d460a4cbda1c8..8ca275d860edf 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -732,6 +732,12 @@ struct sk_buff { __u32 secmark; #endif + unsigned subport_cnt; + union { + __u32 subport; + __u32 *subport_lst; + }; + union { __u32 mark; __u32 reserved_tailroom; diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index f3d16dbe09d64..8f174c6e0f2d0 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -27,6 +27,7 @@ enum { NDA_MASTER, NDA_LINK_NETNSID, NDA_SRC_VNI, + NDA_SUBPORT, __NDA_MAX }; diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 
430b53e7d941d..5fef3213972d7 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -80,6 +80,8 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) else br_flood(br, skb, BR_PKT_MULTICAST, false, true); } else if ((dst = br_fdb_find_rcu(br, dest, vid)) != NULL) { + skb->subport = dst->subport; + skb->subport_cnt = dst->subport ? 1 : 0; br_forward(dst->dst, skb, false, true); } else { br_flood(br, skb, BR_PKT_UNICAST, false, true); diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index ab0c7cc8448f4..034183875d5fd 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -29,7 +29,7 @@ static struct kmem_cache *br_fdb_cache __read_mostly; static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr, u16 vid); + __u32 subport, const unsigned char *addr, u16 vid); static void fdb_notify(struct net_bridge *br, const struct net_bridge_fdb_entry *, int); @@ -278,7 +278,7 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) insert: /* insert new address, may fail if invalid address or dup. */ - fdb_insert(br, p, newaddr, 0); + fdb_insert(br, p, 0, newaddr, 0); if (!vg || !vg->num_vlans) goto done; @@ -288,7 +288,7 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) * from under us. */ list_for_each_entry(v, &vg->vlan_list, vlist) - fdb_insert(br, p, newaddr, v->vid); + fdb_insert(br, p, 0, newaddr, v->vid); done: spin_unlock_bh(&br->hash_lock); @@ -307,10 +307,11 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) if (f && f->is_local && !f->dst && !f->added_by_user) fdb_delete_local(br, NULL, f); - fdb_insert(br, NULL, newaddr, 0); + fdb_insert(br, NULL, 0, newaddr, 0); vg = br_vlan_group(br); if (!vg || !vg->num_vlans) goto out; + /* Now remove and add entries for every VLAN configured on the * bridge. This function runs under RTNL so the bitmap will not * change from under us. 
@@ -321,7 +322,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) f = br_fdb_find(br, br->dev->dev_addr, v->vid); if (f && f->is_local && !f->dst && !f->added_by_user) fdb_delete_local(br, NULL, f); - fdb_insert(br, NULL, newaddr, v->vid); + fdb_insert(br, NULL, 0, newaddr, v->vid); } out: spin_unlock_bh(&br->hash_lock); @@ -479,6 +480,8 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, fe->port_no = f->dst->port_no; fe->port_hi = f->dst->port_no >> 8; + fe->unused = f->subport; + fe->is_local = f->is_local; if (!f->is_static) fe->ageing_timer_value = jiffies_delta_to_clock_t(jiffies - f->updated); @@ -495,6 +498,7 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, struct net_bridge_port *source, + __u32 subport, const unsigned char *addr, __u16 vid, unsigned char is_local, @@ -506,6 +510,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, if (fdb) { memcpy(fdb->addr.addr, addr, ETH_ALEN); fdb->dst = source; + fdb->subport = subport; fdb->vlan_id = vid; fdb->is_local = is_local; fdb->is_static = is_static; @@ -518,7 +523,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, } static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr, u16 vid) + __u32 subport, const unsigned char *addr, u16 vid) { struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; struct net_bridge_fdb_entry *fdb; @@ -538,7 +543,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, fdb_delete(br, fdb); } - fdb = fdb_create(head, source, addr, vid, 1, 1); + fdb = fdb_create(head, source, subport, addr, vid, 1, 1); if (!fdb) return -ENOMEM; @@ -554,13 +559,14 @@ int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, int ret; spin_lock_bh(&br->hash_lock); - ret = fdb_insert(br, source, addr, vid); + ret = fdb_insert(br, source, 0, addr, vid); 
spin_unlock_bh(&br->hash_lock); return ret; } void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr, u16 vid, bool added_by_user) + __u32 subport, const unsigned char *addr, u16 vid, + bool added_by_user) { struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; struct net_bridge_fdb_entry *fdb; @@ -586,8 +592,10 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, unsigned long now = jiffies; /* fastpath: update of existing entry */ - if (unlikely(source != fdb->dst)) { + if (unlikely(source != fdb->dst || + subport != fdb->subport)) { fdb->dst = source; + fdb->subport = subport; fdb_modified = true; /* Take over HW learned entry */ if (unlikely(fdb->added_by_external_learn)) @@ -603,7 +611,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, } else { spin_lock(&br->hash_lock); if (likely(!fdb_find_rcu(head, addr, vid))) { - fdb = fdb_create(head, source, addr, vid, 0, 0); + fdb = fdb_create(head, source, subport, addr, vid, 0, 0); if (fdb) { if (unlikely(added_by_user)) fdb->added_by_user = 1; @@ -665,6 +673,8 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, if (fdb->vlan_id && nla_put(skb, NDA_VLAN, sizeof(u16), &fdb->vlan_id)) goto nla_put_failure; + if (fdb->subport && nla_put(skb, NDA_SUBPORT, sizeof(u32), &fdb->subport)) + goto nla_put_failure; nlmsg_end(skb, nlh); return 0; @@ -791,7 +801,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, if (!(flags & NLM_F_CREATE)) return -ENOENT; - fdb = fdb_create(head, source, addr, vid, 0, 0); + fdb = fdb_create(head, source, 0, addr, vid, 0, 0); if (!fdb) return -ENOMEM; @@ -854,7 +864,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, } local_bh_disable(); rcu_read_lock(); - br_fdb_update(br, p, addr, vid, true); + br_fdb_update(br, p, 0, addr, vid, true); rcu_read_unlock(); local_bh_enable(); } else if (ndm->ndm_flags & 
NTF_EXT_LEARNED) { @@ -1081,7 +1091,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, head = &br->hash[br_mac_hash(addr, vid)]; fdb = br_fdb_find(br, addr, vid); if (!fdb) { - fdb = fdb_create(head, p, addr, vid, 0, 0); + fdb = fdb_create(head, p, 0, addr, vid, 0, 0); if (!fdb) { err = -ENOMEM; goto err_unlock; diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 013f2290bfa56..34efe2735592f 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -127,6 +127,11 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, } } +static __u32 skb_subport(struct sk_buff *skb) +{ + return skb->subport_cnt == 1 ? skb->subport : 0; +} + /* note: already called with rcu_read_lock */ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -150,7 +155,8 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb /* insert into forwarding database after filtering to avoid spoofing */ br = p->br; if (p->flags & BR_LEARNING) - br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false); + br_fdb_update(br, p, skb_subport(skb), eth_hdr(skb)->h_source, + vid, false); local_rcv = !!(br->dev->flags & IFF_PROMISC); if (is_multicast_ether_addr(dest)) { @@ -229,7 +235,8 @@ static void __br_handle_local_finish(struct sk_buff *skb) /* check if vlan is allowed, to avoid spoofing */ if (p->flags & BR_LEARNING && br_should_learn(p, skb, &vid)) - br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false); + br_fdb_update(p->br, p, skb_subport(skb), + eth_hdr(skb)->h_source, vid, false); } /* note: already called with rcu_read_lock */ diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 43d968e1e565f..987e60e24fb5c 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -163,6 +163,7 @@ struct net_bridge_vlan_group { struct net_bridge_fdb_entry { struct hlist_node hlist; struct net_bridge_port *dst; + __u32 subport; mac_addr addr; __u16 
vlan_id; @@ -518,7 +519,8 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long count, int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid); void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr, u16 vid, bool added_by_user); + __u32 subport, const unsigned char *addr, u16 vid, + bool added_by_user); int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid); From 36d4d7825aa3b5fffc0b41a5cb850504113488bd Mon Sep 17 00:00:00 2001 From: David Lamparter Date: Mon, 15 Feb 2016 16:24:07 +0100 Subject: [PATCH 4/9] mpls: add "handlers" [work-in-progress, likely needs changes] Signed-off-by: David Lamparter --- include/net/mpls.h | 11 ++++++ net/mpls/af_mpls.c | 84 +++++++++++++++++++++++++++++++++++++++++++++ net/mpls/internal.h | 3 ++ 3 files changed, 98 insertions(+) diff --git a/include/net/mpls.h b/include/net/mpls.h index 1dbc669b770e8..57f055629f121 100644 --- a/include/net/mpls.h +++ b/include/net/mpls.h @@ -33,4 +33,15 @@ static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb) { return (struct mpls_shim_hdr *)skb_network_header(skb); } + +struct mpls_shim_hdr; +typedef int (*mpls_handler)(void *arg, struct sk_buff *skb, + struct net_device *dev, struct packet_type *pt, + struct mpls_shim_hdr *hdr, + struct net_device *orig_dev); + +extern int mpls_handler_add(struct net *net, unsigned index, + mpls_handler handler, void *handler_arg); +extern int mpls_handler_del(struct net *net, unsigned index); + #endif diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 94b3317232a65..46ad81968b01a 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -299,6 +300,7 @@ static bool mpls_egress(struct net *net, struct mpls_route *rt, success = true; break; } + case MPT_HANDLER: case MPT_UNSPEC: 
/* Should have decided which protocol it is by now */ break; @@ -356,6 +358,10 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, goto drop; } + if (rt->rt_payload_type == MPT_HANDLER) + return rt->rt_handler(rt->rt_harg, skb, dev, pt, hdr, + orig_dev); + nh = mpls_select_multipath(rt, skb); if (!nh) goto err; @@ -947,6 +953,16 @@ static int mpls_route_add(struct mpls_route_config *cfg, if (!mpls_label_ok(net, index, extack)) goto errout; + switch (cfg->rc_payload_type) { + case MPT_UNSPEC: + case MPT_IPV4: + case MPT_IPV6: + break; + default: + err = -EINVAL; + goto errout; + } + /* Append makes no sense with mpls */ err = -EOPNOTSUPP; if (cfg->rc_nlflags & NLM_F_APPEND) { @@ -1271,6 +1287,74 @@ static int mpls_netconf_dump_devconf(struct sk_buff *skb, return skb->len; } +int mpls_handler_add(struct net *net, unsigned index, mpls_handler handler, + void *handler_arg) +{ + struct mpls_route __rcu **platform_label; + struct mpls_route *rt, *old; + int err = -EINVAL; + + /* Reserved labels may not be set */ + if (index < MPLS_LABEL_FIRST_UNRESERVED) + goto errout; + + /* The full 20 bit range may not be supported. 
*/ + if (index >= net->mpls.platform_labels) + goto errout; + + err = -EEXIST; + platform_label = rtnl_dereference(net->mpls.platform_label); + old = rtnl_dereference(platform_label[index]); + if (old) + goto errout; + + err = -ENOMEM; + rt = mpls_rt_alloc(0, 0, 0); + if (!rt) + goto errout; + + rt->rt_protocol = RTPROT_KERNEL; + rt->rt_payload_type = MPT_HANDLER; + rt->rt_handler = handler; + rt->rt_harg = handler_arg; + + mpls_route_update(net, index, rt, NULL); + return 0; + +errout: + return err; +} +EXPORT_SYMBOL(mpls_handler_add); + +int mpls_handler_del(struct net *net, unsigned index) +{ + struct mpls_route __rcu **platform_label; + struct mpls_route *old; + int err = -EINVAL; + + /* Reserved labels may not be removed */ + if (index < MPLS_LABEL_FIRST_UNRESERVED) + goto errout; + + /* The full 20 bit range may not be supported */ + if (index >= net->mpls.platform_labels) + goto errout; + + platform_label = rtnl_dereference(net->mpls.platform_label); + old = rtnl_dereference(platform_label[index]); + if (!old) + goto errout; + if (old->rt_payload_type != MPT_HANDLER) + goto errout; + + mpls_route_update(net, index, NULL, NULL); + + err = 0; +errout: + return err; +} +EXPORT_SYMBOL(mpls_handler_del); + #define MPLS_PERDEV_SYSCTL_OFFSET(field) \ (&((struct mpls_dev *)0)->field) diff --git a/net/mpls/internal.h b/net/mpls/internal.h index cf65aec2e551b..2cd73eb514637 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -78,6 +78,7 @@ enum mpls_payload_type { MPT_UNSPEC, /* IPv4 or IPv6 */ MPT_IPV4 = 4, MPT_IPV6 = 6, + MPT_HANDLER = 255, /* Other types not implemented: * - Pseudo-wire with or without control word (RFC4385) @@ -141,6 +142,8 @@ enum mpls_ttl_propagation { */ struct mpls_route { /* next hop label forwarding entry */ struct rcu_head rt_rcu; + mpls_handler rt_handler; + void *rt_harg; u8 rt_protocol; u8 rt_payload_type; u8 rt_max_alen; From fbdf498224cfd8888f39183812a6c41c71cc86b7 Mon Sep 17 00:00:00 2001 From: David Lamparter Date: Mon, 15 
Feb 2016 16:24:36 +0100 Subject: [PATCH 5/9] mpls: VPLS support [work-in-progress, works but needs changes] Signed-off-by: David Lamparter --- net/mpls/Kconfig | 5 + net/mpls/Makefile | 1 + net/mpls/vpls.c | 750 ++++++++++++++++++++++++++++++++++++++++++++++ net/mpls/vpls.h | 27 ++ 4 files changed, 783 insertions(+) create mode 100644 net/mpls/vpls.c create mode 100644 net/mpls/vpls.h diff --git a/net/mpls/Kconfig b/net/mpls/Kconfig index 5c467ef973114..42381a3acae81 100644 --- a/net/mpls/Kconfig +++ b/net/mpls/Kconfig @@ -33,4 +33,9 @@ config MPLS_IPTUNNEL ---help--- mpls ip tunnel support. +config MPLS_VPLS + tristate "VPLS support" + ---help--- + Add support for de-&encapsulating VPLS. + endif # MPLS diff --git a/net/mpls/Makefile b/net/mpls/Makefile index 9ca9236250165..d9e6326664171 100644 --- a/net/mpls/Makefile +++ b/net/mpls/Makefile @@ -4,5 +4,6 @@ obj-$(CONFIG_NET_MPLS_GSO) += mpls_gso.o obj-$(CONFIG_MPLS_ROUTING) += mpls_router.o obj-$(CONFIG_MPLS_IPTUNNEL) += mpls_iptunnel.o +obj-$(CONFIG_MPLS_VPLS) += vpls.o mpls_router-y := af_mpls.o diff --git a/net/mpls/vpls.c b/net/mpls/vpls.c new file mode 100644 index 0000000000000..8dcf7695d0198 --- /dev/null +++ b/net/mpls/vpls.c @@ -0,0 +1,750 @@ +/* + * net/mpls/vpls.c + * + * Copyright (C) 2016 David Lamparter + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include "internal.h" +#include "vpls.h" + +#define DRV_NAME "vpls" + +#define MIN_MTU 68 /* Min L3 MTU */ +#define MAX_MTU 65535 /* Max L3 MTU (arbitrary) */ + +#define MAXWIRES 256 + +struct vpls_dst { + struct net_device *dev; + unsigned label_in, label_out; + __be32 addr; +}; + +struct vpls_dst_list { + size_t count; + struct vpls_dst *items; +}; + +struct vpls_priv { + struct net *encap_net; + struct vpls_dst_list *dsts; +}; + +static int vpls_xmit_dst(struct sk_buff *skb, struct vpls_priv *vpls, + struct vpls_dst *dst) +{ + unsigned int hh_len; + unsigned int 
new_header_size; + struct mpls_shim_hdr *hdr; + struct net_device *out_dev = dst->dev; + int err; + + if (!mpls_output_possible(dst->dev) || skb_warn_if_lro(skb)) + return -1; /* skb NOT consumed; caller must free it */ + + new_header_size = 1 * sizeof(struct mpls_shim_hdr); + + hh_len = LL_RESERVED_SPACE(out_dev); + if (!out_dev->header_ops) + hh_len = 0; + + if (skb_cow(skb, hh_len + new_header_size)) + return -1; /* skb NOT consumed; caller must free it */ + + skb_push(skb, new_header_size); + skb_reset_network_header(skb); + + skb->dev = out_dev; + skb->protocol = htons(ETH_P_MPLS_UC); + + hdr = mpls_hdr(skb); + hdr[0] = mpls_entry_encode(dst->label_out, 255, 0, true); + + err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &dst->addr, skb); + if (err) + net_dbg_ratelimited("%s: packet transmission failed: %d\n", + __func__, err); + + return 0; /* neigh_xmit() consumes skb even on error; nonzero would make the caller free it again */ +} + +static netdev_tx_t vpls_xmit(struct sk_buff *skb, struct net_device *dev) +{ + int ret = -EINVAL; + struct vpls_priv *priv = netdev_priv(dev); + struct vpls_dst_list *dsts; + struct sk_buff *cloned; + size_t i; + + skb_orphan(skb); + skb_forward_csum(skb); + + rcu_read_lock(); + + dsts = rcu_dereference(priv->dsts); + if (!dsts->count) + goto drop; + + if (skb->subport_cnt == 1 && skb->subport < dsts->count + && dsts->items[skb->subport].dev) { + + i = skb->subport; + + cloned = skb_clone(skb, GFP_ATOMIC); /* xmit path under rcu_read_lock(): must not sleep */ + if (cloned && vpls_xmit_dst(cloned, priv, &dsts->items[i])) /* nonzero: clone not consumed */ + consume_skb(cloned); + + } else { + for (i = 0; i < dsts->count; i++) + if (dsts->items[i].dev) { + cloned = skb_clone(skb, GFP_ATOMIC); /* must not sleep here either */ + if (cloned && vpls_xmit_dst(cloned, priv, &dsts->items[i])) /* skip wire on clone failure */ + consume_skb(cloned); + } + } + + ret = 0; +drop: + rcu_read_unlock(); + consume_skb(skb); + return ret; +} + +static int vpls_rcv(void *arg, struct sk_buff *skb, struct net_device *in_dev, + struct packet_type *pt, struct mpls_shim_hdr *hdr, + struct net_device *orig_dev) +{ + struct net_device *dev = arg; + struct vpls_priv *priv = netdev_priv(dev); + struct mpls_entry_decoded dec; + struct vpls_dst_list *dsts; + size_t i; + + dec = mpls_entry_decode(hdr); + if 
(!dec.bos) { + pr_info("%s: incoming BoS mismatch\n", dev->name); + goto drop; + } + + rcu_read_lock(); + dsts = rcu_dereference(priv->dsts); + for (i = 0; i < dsts->count; i++) + if (dsts->items[i].dev && dec.label == dsts->items[i].label_in) + break; + + if (i == dsts->count) { + pr_info("%s: incoming label %u not found\n", dev->name, + dec.label); + rcu_read_unlock(); + goto drop; + } + rcu_read_unlock(); + + if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) + goto drop; + + skb->dev = dev; + + skb_reset_mac_header(skb); + skb->protocol = eth_type_trans(skb, dev); + skb->ip_summed = CHECKSUM_NONE; + skb->pkt_type = PACKET_HOST; + + skb_clear_hash(skb); + skb->vlan_tci = 0; + skb_set_queue_mapping(skb, 0); + skb_scrub_packet(skb, !net_eq(dev_net(in_dev), dev_net(dev))); + + skb_reset_network_header(skb); + skb_probe_transport_header(skb, 0); + + skb->subport_cnt = 1; + skb->subport = i; + + netif_rx(skb); + return 0; + +drop: + kfree_skb(skb); + return NET_RX_DROP; +} + +/* fake multicast ability */ +static void vpls_set_multicast_list(struct net_device *dev) +{ +} + +static int vpls_open(struct net_device *dev) +{ + struct vpls_priv *priv = netdev_priv(dev); + struct vpls_dst_list *dsts; + int rc; + size_t i; + + rcu_read_lock(); + dsts = rcu_dereference(priv->dsts); + for (i = 0; i < dsts->count; i++) + if (dsts->items[i].dev) { + struct vpls_dst *dst = &dsts->items[i]; + rc = mpls_handler_add(priv->encap_net, dst->label_in, + vpls_rcv, dev); + } + rcu_read_unlock(); + + netif_carrier_on(dev); + return 0; +} + +static int vpls_close(struct net_device *dev) +{ + struct vpls_priv *priv = netdev_priv(dev); + struct vpls_dst_list *dsts; + size_t i; + + netif_carrier_off(dev); + + rcu_read_lock(); + dsts = rcu_dereference(priv->dsts); + for (i = 0; i < dsts->count; i++) + if (dsts->items[i].dev) { + struct vpls_dst *dst = &dsts->items[i]; + mpls_handler_del(priv->encap_net, dst->label_in); + } + rcu_read_unlock(); + return 0; +} + +static int is_valid_vpls_mtu(int 
new_mtu) +{ + return new_mtu >= MIN_MTU && new_mtu <= MAX_MTU; +} + +static int vpls_change_mtu(struct net_device *dev, int new_mtu) +{ + if (!is_valid_vpls_mtu(new_mtu)) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} + +static int vpls_dev_init(struct net_device *dev) +{ + struct vpls_priv *priv = netdev_priv(dev); + priv->dsts = kzalloc(sizeof(struct vpls_dst_list), GFP_KERNEL); + + return priv->dsts ? 0 : -ENOMEM; /* open/close/xmit/rcv/free all dereference priv->dsts unconditionally */ +} + +static void vpls_dev_free(struct net_device *dev) +{ + struct vpls_priv *priv = netdev_priv(dev); + struct vpls_dst_list *dsts; + size_t i; + + dsts = priv->dsts; + for (i = 0; i < dsts->count; i++) + if (dsts->items[i].dev) + dev_put(dsts->items[i].dev); + if (priv->dsts->items) + kfree(priv->dsts->items); + kfree(priv->dsts); + + if (priv->encap_net) + put_net(priv->encap_net); + + free_netdev(dev); +} + +static const struct net_device_ops vpls_netdev_ops = { + .ndo_init = vpls_dev_init, + .ndo_open = vpls_open, + .ndo_stop = vpls_close, + .ndo_start_xmit = vpls_xmit, + .ndo_change_mtu = vpls_change_mtu, + .ndo_set_rx_mode = vpls_set_multicast_list, + .ndo_set_mac_address = eth_mac_addr, + .ndo_features_check = passthru_features_check, +}; + +#define VPLS_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | \ + NETIF_F_HW_CSUM | NETIF_F_RXCSUM | NETIF_F_HIGHDMA) + +static void vpls_setup(struct net_device *dev) +{ + ether_setup(dev); + + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + dev->priv_flags |= IFF_NO_QUEUE; + + dev->netdev_ops = &vpls_netdev_ops; + dev->features |= NETIF_F_LLTX; + dev->features |= VPLS_FEATURES; + dev->vlan_features = dev->features; + dev->destructor = vpls_dev_free; + + dev->hw_features = VPLS_FEATURES; + dev->hw_enc_features = VPLS_FEATURES; +} + +/* + * netlink interface + */ + +static int vpls_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + if (tb[IFLA_ADDRESS]) { + if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) + return -EINVAL; + if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) + return 
-EADDRNOTAVAIL; + } + if (tb[IFLA_MTU]) { + if (!is_valid_vpls_mtu(nla_get_u32(tb[IFLA_MTU]))) + return -EINVAL; + } + return 0; +} + +static struct rtnl_link_ops vpls_link_ops; + +static int vpls_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + int err; + struct vpls_priv *priv = netdev_priv(dev); + + if (tb[IFLA_ADDRESS] == NULL) + eth_hw_addr_random(dev); + + if (tb[IFLA_IFNAME]) + nla_strlcpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); + else + snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); + + err = register_netdevice(dev); + if (err < 0) + goto err; + priv->encap_net = get_net(src_net); + + netif_carrier_off(dev); + return 0; + +err: + return err; +} + +static void vpls_dellink(struct net_device *dev, struct list_head *head) +{ + unregister_netdevice_queue(dev, head); +} + + +static struct rtnl_link_ops vpls_link_ops = { + .kind = DRV_NAME, + .priv_size = sizeof(struct vpls_priv), + .setup = vpls_setup, + .validate = vpls_validate, + .newlink = vpls_newlink, + .dellink = vpls_dellink, +}; + +/* + * GENL wire-control interface + */ + +static struct nla_policy vpls_genl_policy[VPLS_ATTR_MAX + 1] = { + [VPLS_ATTR_IFINDEX] = { .type = NLA_U32 }, + [VPLS_ATTR_WIREID] = { .type = NLA_U32 }, + [VPLS_ATTR_LABEL_IN] = { .type = NLA_U32 }, + [VPLS_ATTR_LABEL_OUT] = { .type = NLA_U32 }, + [VPLS_ATTR_NH_DEV] = { .type = NLA_U32 }, + [VPLS_ATTR_NH_IP] = { .type = NLA_U32 }, +}; + +static int vpls_genl_newwire(struct sk_buff *skb, struct genl_info *info); +static int vpls_genl_getwire(struct sk_buff *skb, struct genl_info *info); +static int vpls_genl_dumpwire(struct sk_buff *skb, struct netlink_callback *cb); +static int vpls_genl_delwire(struct sk_buff *skb, struct genl_info *info); + +static struct genl_ops vpls_genl_ops[] = { + { + .cmd = VPLS_CMD_NEWWIRE, + .flags = GENL_ADMIN_PERM, + .policy = vpls_genl_policy, + .doit = vpls_genl_newwire, + .dumpit = NULL, + }, + { + .cmd = VPLS_CMD_GETWIRE, + .flags = 
GENL_ADMIN_PERM, + .policy = vpls_genl_policy, + .doit = vpls_genl_getwire, + .dumpit = vpls_genl_dumpwire, + }, + { + .cmd = VPLS_CMD_DELWIRE, + .flags = GENL_ADMIN_PERM, + .policy = vpls_genl_policy, + .doit = vpls_genl_delwire, + .dumpit = NULL, + }, +}; + +struct genl_multicast_group vpls_genl_groups[] = { + { + .name = "newwire", + }, +}; + +static struct genl_family vpls_genl_family = { + .hdrsize = 0, + .name = "vpls", + .version = 1, + .maxattr = VPLS_ATTR_MAX, + + .ops = vpls_genl_ops, + .n_ops = ARRAY_SIZE(vpls_genl_ops), + .mcgrps = vpls_genl_groups, + .n_mcgrps = ARRAY_SIZE(vpls_genl_groups), +}; + + +static int vpls_genl_newwire(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **data = info->attrs; + struct net *net = sock_net(skb->sk); + int ret = -EINVAL; + struct net_device *vplsdev, *outdev; + struct vpls_priv *priv; + struct vpls_dst_list *dsts, *newdsts; + u32 wireid; + size_t count; + unsigned remove_lbl = 0; + + if (!data[VPLS_ATTR_WIREID] || !data[VPLS_ATTR_IFINDEX]) + return -EINVAL; + if (!data[VPLS_ATTR_NH_DEV] || !data[VPLS_ATTR_NH_IP]) + return -EINVAL; + if (!data[VPLS_ATTR_LABEL_OUT] || !data[VPLS_ATTR_LABEL_IN]) + return -EINVAL; + + wireid = nla_get_u32(data[VPLS_ATTR_WIREID]); + if (wireid >= MAXWIRES) + return -EINVAL; + + rtnl_lock(); + + vplsdev = __dev_get_by_index(net, nla_get_u32(data[VPLS_ATTR_IFINDEX])); + if (!vplsdev || vplsdev->netdev_ops != &vpls_netdev_ops) + goto out_unlock; + + outdev = dev_get_by_index(net, nla_get_u32(data[VPLS_ATTR_NH_DEV])); + if (!outdev) + goto out_unlock; + + priv = netdev_priv(vplsdev); + dsts = priv->dsts; + count = dsts->count; + if (wireid < count && dsts->items[wireid].dev) { + if ((info->nlhdr->nlmsg_flags & (NLM_F_EXCL + | NLM_F_REPLACE)) != NLM_F_REPLACE) { + ret = -EEXIST; + goto out_drop_outdev; + } + remove_lbl = dsts->items[wireid].label_in; + } else { + if (!(info->nlhdr->nlmsg_flags & NLM_F_CREATE)) { + ret = -ENOENT; + goto out_drop_outdev; + } + if (wireid >= 
count) + count = wireid + 1; + } + newdsts = kzalloc(sizeof(struct vpls_dst_list), GFP_KERNEL); + if (!newdsts) { + ret = -ENOMEM; + goto out_drop_outdev; + } + newdsts->count = count; + newdsts->items = kzalloc(sizeof(newdsts->items[0]) * newdsts->count, GFP_KERNEL); + memcpy(newdsts->items, dsts->items, sizeof(dsts->items[0]) * dsts->count); + + if (newdsts->items[wireid].dev) + dev_put(newdsts->items[wireid].dev); + newdsts->items[wireid].label_in = nla_get_u32(data[VPLS_ATTR_LABEL_IN]); + newdsts->items[wireid].label_out = nla_get_u32(data[VPLS_ATTR_LABEL_OUT]); + newdsts->items[wireid].dev = outdev; + newdsts->items[wireid].addr = nla_get_u32(data[VPLS_ATTR_NH_IP]); + + if (remove_lbl && remove_lbl != newdsts->items[wireid].label_in) + mpls_handler_del(priv->encap_net, remove_lbl); + + if (remove_lbl != newdsts->items[wireid].label_in) + ret = mpls_handler_add(priv->encap_net, + newdsts->items[wireid].label_in, + vpls_rcv, vplsdev); + + rcu_assign_pointer(priv->dsts, newdsts); + rtnl_unlock(); + + synchronize_rcu(); + + kfree(dsts->items); + kfree(dsts); + + return 0; + +out_drop_outdev: + dev_put(outdev); +out_unlock: + rtnl_unlock(); + return ret; +} + +static int vpls_genl_delwire(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **data = info->attrs; + struct net *net = sock_net(skb->sk); + int ret = -EINVAL; + struct net_device *vplsdev; + struct vpls_priv *priv; + struct vpls_dst_list *dsts, *newdsts; + u32 wireid; + size_t count; + + if (!data[VPLS_ATTR_WIREID] || !data[VPLS_ATTR_IFINDEX]) + return -EINVAL; + + wireid = nla_get_u32(data[VPLS_ATTR_WIREID]); + if (wireid >= MAXWIRES) + return -EINVAL; + + rtnl_lock(); + + vplsdev = __dev_get_by_index(net, nla_get_u32(data[VPLS_ATTR_IFINDEX])); + if (!vplsdev || vplsdev->netdev_ops != &vpls_netdev_ops) + goto out_unlock; + priv = netdev_priv(vplsdev); + + dsts = priv->dsts; + count = dsts->count; + if (wireid >= count || !dsts->items[wireid].dev) { + ret = -ENOENT; + goto out_unlock; + } + + 
mpls_handler_del(priv->encap_net, dsts->items[wireid].label_in); + + if (wireid + 1 == count) + for (count--; count && !dsts->items[count - 1].dev; count--) + ; + + newdsts->count = count; + newdsts->items = kzalloc(sizeof(newdsts->items[0]) * count, GFP_KERNEL); + memcpy(newdsts->items, dsts->items, sizeof(dsts->items[0]) * count); + if (wireid < count) + memset(&newdsts->items[wireid], 0, sizeof(newdsts->items[0])); + + rcu_assign_pointer(priv->dsts, newdsts); + rtnl_unlock(); + + synchronize_rcu(); + + kfree(dsts->items); + kfree(dsts); + + return 0; + +out_unlock: + rtnl_unlock(); + return ret; +} + +static int vpls_nl_wire_msg(struct sk_buff *msg, struct net_device *dev, + int cmd, unsigned wireid, struct vpls_dst *dst, + u32 portid, u32 seq, int flags) +{ + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &vpls_genl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u32(msg, VPLS_ATTR_IFINDEX, dev->ifindex)) + goto nla_put_failure; + if (nla_put_u32(msg, VPLS_ATTR_WIREID, wireid)) + goto nla_put_failure; + if (nla_put_u32(msg, VPLS_ATTR_NH_DEV, dst->dev->ifindex)) + goto nla_put_failure; + if (nla_put_u32(msg, VPLS_ATTR_NH_IP, dst->addr)) + goto nla_put_failure; + if (nla_put_u32(msg, VPLS_ATTR_LABEL_IN, dst->label_in)) + goto nla_put_failure; + if (nla_put_u32(msg, VPLS_ATTR_LABEL_OUT, dst->label_out)) + goto nla_put_failure; + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int vpls_genl_getwire(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **data = info->attrs; + struct net *net = sock_net(skb->sk); + int ret = -EINVAL; + struct net_device *vplsdev; + struct vpls_priv *priv; + struct vpls_dst_list *dsts; + u32 wireid; + struct sk_buff *msg; + + if (!data[VPLS_ATTR_WIREID] || !data[VPLS_ATTR_IFINDEX]) + return -EINVAL; + + wireid = nla_get_u32(data[VPLS_ATTR_WIREID]); + if (wireid >= MAXWIRES) + return -EINVAL; + + rtnl_lock(); + + vplsdev = 
__dev_get_by_index(net, nla_get_u32(data[VPLS_ATTR_IFINDEX])); + if (!vplsdev || vplsdev->netdev_ops != &vpls_netdev_ops) + goto out_unlock; + + priv = netdev_priv(vplsdev); + dsts = priv->dsts; + + if (wireid >= dsts->count || !dsts->items[wireid].dev) { + ret = -ENOENT; + goto out_unlock; + } else { + ret = -ENOMEM; + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + goto out_unlock; + ret = vpls_nl_wire_msg(msg, vplsdev, VPLS_CMD_NEWWIRE, wireid, + &dsts->items[wireid], info->snd_portid, info->snd_seq, + 0); + if (ret) + goto out_unlock; + + ret = genlmsg_reply(msg, info); + } + + rtnl_unlock(); + return 0; + +out_unlock: + rtnl_unlock(); + return ret; +} + +static int vpls_genl_dumpwire(struct sk_buff *skb, struct netlink_callback *cb) +{ + int ret; + struct nlattr *attrs[VPLS_ATTR_MAX+1]; + unsigned ifindex; + struct net *net = sock_net(skb->sk); + struct net_device *vplsdev; + struct vpls_priv *priv; + struct vpls_dst_list *dsts; + u32 wireid; + + if (!cb->args[0]) { + ret = nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs, + ARRAY_SIZE(attrs), vpls_genl_policy, NULL); + if (ret) + return ret; + if (!attrs[VPLS_ATTR_IFINDEX]) + return -EINVAL; + ifindex = cb->args[0] = nla_get_u32(attrs[VPLS_ATTR_IFINDEX]); + } else { + ifindex = cb->args[0]; + } + + rtnl_lock(); + + ret = -ENODEV; + vplsdev = __dev_get_by_index(net, ifindex); + if (!vplsdev || vplsdev->netdev_ops != &vpls_netdev_ops) + goto out_unlock; + + priv = netdev_priv(vplsdev); + dsts = priv->dsts; + + wireid = cb->args[1]; + for (wireid = cb->args[1]; wireid < dsts->count; wireid++) + if (dsts->items[wireid].dev) + break; + ret = 0; + if (wireid == dsts->count) + goto out_unlock; + cb->args[1] = wireid + 1; + + ret = vpls_nl_wire_msg(skb, vplsdev, VPLS_CMD_NEWWIRE, wireid, + &dsts->items[wireid], + NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (ret == 0) + ret = skb->len; + +out_unlock: + rtnl_unlock(); + return ret; +} + +/* + * init/fini + */ + +static __init int 
vpls_init(void) +{ + int ret; + + ret = genl_register_family(&vpls_genl_family); + if (ret) + return ret; + + ret = rtnl_link_register(&vpls_link_ops); + if (ret) + goto out_unreg_family; + + return 0; + +out_unreg_family: + genl_unregister_family(&vpls_genl_family); + return ret; +} + +static __exit void vpls_exit(void) +{ + genl_unregister_family(&vpls_genl_family); + rtnl_link_unregister(&vpls_link_ops); +} + +module_init(vpls_init); +module_exit(vpls_exit); + +MODULE_DESCRIPTION("Virtual Private LAN Service"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_RTNL_LINK(DRV_NAME); diff --git a/net/mpls/vpls.h b/net/mpls/vpls.h new file mode 100644 index 0000000000000..72211e1ba7cf2 --- /dev/null +++ b/net/mpls/vpls.h @@ -0,0 +1,27 @@ +#ifndef _VPLS_H +#define _VPLS_H + +enum { + VPLS_ATTR_UNSPEC = 0, + VPLS_ATTR_IFINDEX, + VPLS_ATTR_WIREID, + VPLS_ATTR_LABEL_IN, + VPLS_ATTR_LABEL_OUT, + VPLS_ATTR_NH_DEV, + VPLS_ATTR_NH_IP, + __VPLS_ATTR_MAX, +}; +#define VPLS_ATTR_MAX (__VPLS_ATTR_MAX - 1) + +enum { + VPLS_CMD_UNSPEC = 0, + + VPLS_CMD_NEWWIRE = 4, + VPLS_CMD_DELWIRE, + VPLS_CMD_GETWIRE, + VPLS_CMD_SETWIRE, + __VPLS_CMD_MAX, +}; +#define VPLS_CMD_MAX (__VPLS_CMD_MAX - 1) + +#endif /* _VPLS_H */ From 81c809d6f9c40c0332098e13fcad65144aa51795 Mon Sep 17 00:00:00 2001 From: David Lamparter Date: Tue, 6 Jun 2017 23:36:39 +0200 Subject: [PATCH 6/9] work in progress state --- Documentation/networking/bridge-subport.txt | 52 +++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/Documentation/networking/bridge-subport.txt b/Documentation/networking/bridge-subport.txt index 5f2803f2cebf3..02b0c0deeced9 100644 --- a/Documentation/networking/bridge-subport.txt +++ b/Documentation/networking/bridge-subport.txt @@ -100,3 +100,55 @@ controls replication of multicast packets. This is impossible to do with superports, since that implies multiple distinct netdevices and the bridge layer replicating packets. It's too late then, when 10 netdevices get 10 packets to transmit. 
+ + +BRANCH IMPLEMENTATION STATUS (TO BE REMOVED BEFORE PUSHING OUT) +--------------------------------------------------------------- +(including VPLS bits) + + +So, I wrote this over Christmas at a hacker event where I had some time to +play with a VPLS network ;). Here's the status on things. + +General: + +- I've just rebased these commits; there were a few small changes to the + network code in the meantime (I don't think anything broke) + + I DID NOT TEST THE REBASED VERSION. IT COMPILES BUT MAY SET YOUR COMPUTER + ON FIRE. + +MPLS layer: + +- the "MPT_HANDLER" thing is probably overkill, it likely makes more sense to + tie in the VPLS code more directly. + +VPLS: + +- I haven't implemented the control word. + +- I made a design mistake with the wire ID. It's simply not needed. A + pseudowire can be identified by its incoming label. There is also some + really ugly code keeping an array of wires... + +- The genetlink interface is not needed either. Really, the MPLS routing + table can carry all of the information. You can create a "vpls0" device + without any pseudowires, and then add routes with the necessary info: + + "ip -f mpls route add 100 vpls vpls0 as 200 via inet 1.2.3.4 dev eth0" + + This would need the addition of a new "vpls-device" netlink attribute, and + would "split" the information in the route between RX and TX - the RX path + uses the incoming label (100) and vpls-device (vpls0) information, giving + the packet to the proper vpls device. The TX path can then reuse all the + destination information on the route -- which would make things quite a bit + simpler because we can use existing handling. + +- I only hacked in IPv4 dst info, this would be fixed by the above. + +bridge: + +- the subport code is not finished, it doesn't actually do the multicast + pieces. It should hold an array of subport indexes + an array of lifetimes + (subport IDs need to age out like ports), and the array can be passed down + on TX with RCU semantics.
From 18a1436c30b293bf867bd455bfee0cf6f13a56ba Mon Sep 17 00:00:00 2001 From: Amine Kherbouche Date: Thu, 8 Jun 2017 12:36:59 +0200 Subject: [PATCH 7/9] vpls: add ttl configuration May be used for Generalized TTL Security Mechanism (GTSM) for the Label Distribution Protocol (LDP), RFC 6720. Signed-off-by: Amine Kherbouche --- net/mpls/vpls.c | 7 ++++++- net/mpls/vpls.h | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/net/mpls/vpls.c b/net/mpls/vpls.c index 8dcf7695d0198..296e8b2590dc3 100644 --- a/net/mpls/vpls.c +++ b/net/mpls/vpls.c @@ -34,6 +34,7 @@ struct vpls_dst { struct net_device *dev; unsigned label_in, label_out; __be32 addr; + u8 ttl; }; struct vpls_dst_list { @@ -74,7 +75,7 @@ static int vpls_xmit_dst(struct sk_buff *skb, struct vpls_priv *vpls, skb->protocol = htons(ETH_P_MPLS_UC); hdr = mpls_hdr(skb); - hdr[0] = mpls_entry_encode(dst->label_out, 255, 0, true); + hdr[0] = mpls_entry_encode(dst->label_out, dst->ttl, 0, true); err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &dst->addr, skb); if (err) @@ -376,6 +377,7 @@ static struct nla_policy vpls_genl_policy[VPLS_ATTR_MAX + 1] = { [VPLS_ATTR_LABEL_OUT] = { .type = NLA_U32 }, [VPLS_ATTR_NH_DEV] = { .type = NLA_U32 }, [VPLS_ATTR_NH_IP] = { .type = NLA_U32 }, + [VPLS_ATTR_TTL] = { .type = NLA_U8 }, }; static int vpls_genl_newwire(struct sk_buff *skb, struct genl_info *info); @@ -492,6 +494,7 @@ static int vpls_genl_newwire(struct sk_buff *skb, struct genl_info *info) newdsts->items[wireid].label_out = nla_get_u32(data[VPLS_ATTR_LABEL_OUT]); newdsts->items[wireid].dev = outdev; newdsts->items[wireid].addr = nla_get_u32(data[VPLS_ATTR_NH_IP]); + newdsts->items[wireid].ttl = nla_get_u8(data[VPLS_ATTR_TTL]); if (remove_lbl && remove_lbl != newdsts->items[wireid].label_in) mpls_handler_del(priv->encap_net, remove_lbl); @@ -599,6 +602,8 @@ static int vpls_nl_wire_msg(struct sk_buff *msg, struct net_device *dev, goto nla_put_failure; if (nla_put_u32(msg, VPLS_ATTR_LABEL_OUT, dst->label_out)) goto 
nla_put_failure; + if (nla_put_u8(msg, VPLS_ATTR_TTL, dst->ttl)) + goto nla_put_failure; genlmsg_end(msg, hdr); return 0; diff --git a/net/mpls/vpls.h b/net/mpls/vpls.h index 72211e1ba7cf2..c5a1e70d7b436 100644 --- a/net/mpls/vpls.h +++ b/net/mpls/vpls.h @@ -9,6 +9,7 @@ enum { VPLS_ATTR_LABEL_OUT, VPLS_ATTR_NH_DEV, VPLS_ATTR_NH_IP, + VPLS_ATTR_TTL, __VPLS_ATTR_MAX, }; #define VPLS_ATTR_MAX (__VPLS_ATTR_MAX - 1) From 272f39a42030a7931846a7d9e6880b52bc3ca02b Mon Sep 17 00:00:00 2001 From: Amine Kherbouche Date: Thu, 8 Jun 2017 16:09:56 +0200 Subject: [PATCH 8/9] vpls: add ipv6 nh support Signed-off-by: Amine Kherbouche --- net/mpls/vpls.c | 38 ++++++++++++++++++++++++++++++++------ net/mpls/vpls.h | 4 ++++ 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/net/mpls/vpls.c b/net/mpls/vpls.c index 296e8b2590dc3..2135cb1103a7c 100644 --- a/net/mpls/vpls.c +++ b/net/mpls/vpls.c @@ -30,10 +30,17 @@ #define MAXWIRES 256 +union vpls_nh { + struct sockaddr_in sin; + struct sockaddr_in6 sin6; +}; + struct vpls_dst { struct net_device *dev; unsigned label_in, label_out; - __be32 addr; + union vpls_nh addr; + u8 via_table; + u8 flags; u8 ttl; }; @@ -77,7 +84,7 @@ static int vpls_xmit_dst(struct sk_buff *skb, struct vpls_priv *vpls, hdr = mpls_hdr(skb); hdr[0] = mpls_entry_encode(dst->label_out, dst->ttl, 0, true); - err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &dst->addr, skb); + err = neigh_xmit(dst->via_table, out_dev, &dst->addr, skb); if (err) net_dbg_ratelimited("%s: packet transmission failed: %d\n", __func__, err); @@ -377,6 +384,7 @@ static struct nla_policy vpls_genl_policy[VPLS_ATTR_MAX + 1] = { [VPLS_ATTR_LABEL_OUT] = { .type = NLA_U32 }, [VPLS_ATTR_NH_DEV] = { .type = NLA_U32 }, [VPLS_ATTR_NH_IP] = { .type = NLA_U32 }, + [VPLS_ATTR_NH_IPV6] = { .len = sizeof(struct in6_addr) }, [VPLS_ATTR_TTL] = { .type = NLA_U8 }, }; @@ -442,7 +450,8 @@ static int vpls_genl_newwire(struct sk_buff *skb, struct genl_info *info) if (!data[VPLS_ATTR_WIREID] || 
!data[VPLS_ATTR_IFINDEX]) return -EINVAL; - if (!data[VPLS_ATTR_NH_DEV] || !data[VPLS_ATTR_NH_IP]) + if (!data[VPLS_ATTR_NH_DEV] || !data[VPLS_ATTR_NH_IP] || + !data[VPLS_ATTR_NH_IP]) return -EINVAL; if (!data[VPLS_ATTR_LABEL_OUT] || !data[VPLS_ATTR_LABEL_IN]) return -EINVAL; @@ -493,8 +502,18 @@ static int vpls_genl_newwire(struct sk_buff *skb, struct genl_info *info) newdsts->items[wireid].label_in = nla_get_u32(data[VPLS_ATTR_LABEL_IN]); newdsts->items[wireid].label_out = nla_get_u32(data[VPLS_ATTR_LABEL_OUT]); newdsts->items[wireid].dev = outdev; - newdsts->items[wireid].addr = nla_get_u32(data[VPLS_ATTR_NH_IP]); newdsts->items[wireid].ttl = nla_get_u8(data[VPLS_ATTR_TTL]); + if (data[VPLS_ATTR_NH_IP]) { + newdsts->items[wireid].addr.sin.sin_addr.s_addr = nla_get_in_addr(data[VPLS_ATTR_NH_IP]); + newdsts->items[wireid].flags |= VPLS_F_INET; + newdsts->items[wireid].via_table = NEIGH_ARP_TABLE; + } else if (data[VPLS_ATTR_NH_IPV6]) { + if (!IS_ENABLED(CONFIG_IPV6)) + return -EPFNOSUPPORT; + newdsts->items[wireid].addr.sin6.sin6_addr = nla_get_in6_addr(data[VPLS_ATTR_NH_IPV6]); + newdsts->items[wireid].flags |= VPLS_F_INET6; + newdsts->items[wireid].via_table = NEIGH_ND_TABLE; + } if (remove_lbl && remove_lbl != newdsts->items[wireid].label_in) mpls_handler_del(priv->encap_net, remove_lbl); @@ -596,8 +615,15 @@ static int vpls_nl_wire_msg(struct sk_buff *msg, struct net_device *dev, goto nla_put_failure; if (nla_put_u32(msg, VPLS_ATTR_NH_DEV, dst->dev->ifindex)) goto nla_put_failure; - if (nla_put_u32(msg, VPLS_ATTR_NH_IP, dst->addr)) - goto nla_put_failure; + if (dst->flags & VPLS_F_INET) { + if (nla_put_in_addr(msg, VPLS_ATTR_NH_IP, + dst->addr.sin.sin_addr.s_addr)) + goto nla_put_failure; + } else if (dst->flags & VPLS_F_INET6) { + if (nla_put_in6_addr(msg, VPLS_ATTR_NH_IPV6, + &dst->addr.sin6.sin6_addr)) + goto nla_put_failure; + } if (nla_put_u32(msg, VPLS_ATTR_LABEL_IN, dst->label_in)) goto nla_put_failure; if (nla_put_u32(msg, VPLS_ATTR_LABEL_OUT, 
dst->label_out)) diff --git a/net/mpls/vpls.h b/net/mpls/vpls.h index c5a1e70d7b436..1f75bc9896702 100644 --- a/net/mpls/vpls.h +++ b/net/mpls/vpls.h @@ -1,6 +1,9 @@ #ifndef _VPLS_H #define _VPLS_H +#define VPLS_F_INET 0x01 +#define VPLS_F_INET6 0x02 + enum { VPLS_ATTR_UNSPEC = 0, VPLS_ATTR_IFINDEX, @@ -9,6 +12,7 @@ enum { VPLS_ATTR_LABEL_OUT, VPLS_ATTR_NH_DEV, VPLS_ATTR_NH_IP, + VPLS_ATTR_NH_IPV6, VPLS_ATTR_TTL, __VPLS_ATTR_MAX, }; From 804a736a3c8d6bb323e58b042200184f26906efb Mon Sep 17 00:00:00 2001 From: Amine Kherbouche Date: Fri, 9 Jun 2017 10:21:53 +0200 Subject: [PATCH 9/9] vpls: add vlan support Signed-off-by: Amine Kherbouche --- net/mpls/vpls.c | 12 ++++++++++++ net/mpls/vpls.h | 2 ++ 2 files changed, 14 insertions(+) diff --git a/net/mpls/vpls.c b/net/mpls/vpls.c index 2135cb1103a7c..bf9d4bc687a80 100644 --- a/net/mpls/vpls.c +++ b/net/mpls/vpls.c @@ -39,6 +39,7 @@ struct vpls_dst { struct net_device *dev; unsigned label_in, label_out; union vpls_nh addr; + u16 vlan_id; u8 via_table; u8 flags; u8 ttl; @@ -84,6 +85,9 @@ static int vpls_xmit_dst(struct sk_buff *skb, struct vpls_priv *vpls, hdr = mpls_hdr(skb); hdr[0] = mpls_entry_encode(dst->label_out, dst->ttl, 0, true); + if (dst->flags & VPLS_F_VLAN) + skb_vlan_push(skb, htons(ETH_P_8021Q), dst->vlan_id); + err = neigh_xmit(dst->via_table, out_dev, &dst->addr, skb); if (err) net_dbg_ratelimited("%s: packet transmission failed: %d\n", @@ -386,6 +390,7 @@ static struct nla_policy vpls_genl_policy[VPLS_ATTR_MAX + 1] = { [VPLS_ATTR_NH_IP] = { .type = NLA_U32 }, [VPLS_ATTR_NH_IPV6] = { .len = sizeof(struct in6_addr) }, [VPLS_ATTR_TTL] = { .type = NLA_U8 }, + [VPLS_ATTR_VLANID] = { .type = NLA_U16 }, }; static int vpls_genl_newwire(struct sk_buff *skb, struct genl_info *info); @@ -514,6 +519,10 @@ static int vpls_genl_newwire(struct sk_buff *skb, struct genl_info *info) newdsts->items[wireid].flags |= VPLS_F_INET6; newdsts->items[wireid].via_table = NEIGH_ND_TABLE; } + if (data[VPLS_ATTR_VLANID]) { + 
newdsts->items[wireid].vlan_id = nla_get_u16(data[VPLS_ATTR_VLANID]); + newdsts->items[wireid].flags |= VPLS_F_VLAN; + } if (remove_lbl && remove_lbl != newdsts->items[wireid].label_in) mpls_handler_del(priv->encap_net, remove_lbl); @@ -630,6 +639,9 @@ static int vpls_nl_wire_msg(struct sk_buff *msg, struct net_device *dev, goto nla_put_failure; if (nla_put_u8(msg, VPLS_ATTR_TTL, dst->ttl)) goto nla_put_failure; + if (dst->flags & VPLS_F_VLAN) + if (nla_put_u16(msg, VPLS_ATTR_VLANID, dst->vlan_id)) + goto nla_put_failure; genlmsg_end(msg, hdr); return 0; diff --git a/net/mpls/vpls.h b/net/mpls/vpls.h index 1f75bc9896702..bb21e49778a7d 100644 --- a/net/mpls/vpls.h +++ b/net/mpls/vpls.h @@ -3,6 +3,7 @@ #define VPLS_F_INET 0x01 #define VPLS_F_INET6 0x02 +#define VPLS_F_VLAN 0x04 enum { VPLS_ATTR_UNSPEC = 0, @@ -14,6 +15,7 @@ enum { VPLS_ATTR_NH_IP, VPLS_ATTR_NH_IPV6, VPLS_ATTR_TTL, + VPLS_ATTR_VLANID, __VPLS_ATTR_MAX, }; #define VPLS_ATTR_MAX (__VPLS_ATTR_MAX - 1)