diff options
author | David S. Miller <davem@davemloft.net> | 2017-04-18 15:36:58 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2017-04-25 13:33:49 -0400 |
commit | b5cdae3291f7be7a34e75affe4c0ec1f7f328b64 (patch) | |
tree | ca1a8fc4ef95aa3e6e66353791dcb74cad8bc0c2 | |
parent | 2f7878c06e2d227aa5c405ddde356403b83e3509 (diff) |
net: Generic XDP
This provides a generic SKB based non-optimized XDP path which is used
if either the driver lacks a specific XDP implementation, or the user
requests it via a new IFLA_XDP_FLAGS value named XDP_FLAGS_SKB_MODE.
It is arguable that perhaps I should have required something like
this as part of the initial XDP feature merge.
I believe this is critical for two reasons:
1) Accessibility. More people can play with XDP with less
dependencies. Yes I know we have XDP support in virtio_net, but
that just creates another dependency for learning how to use this
facility.
I wrote this to make life easier for the XDP newbies.
2) As a model for what the expected semantics are. If there is a pure
generic core implementation, it serves as a semantic example for
driver folks adding XDP support.
One thing I have not tried to address here is the issue of
XDP_PACKET_HEADROOM, thanks to Daniel for spotting that. It seems
incredibly expensive to do a skb_cow(skb, XDP_PACKET_HEADROOM) or
whatever even if the XDP program doesn't try to push headers at all.
I think we really need the verifier to somehow propagate whether
certain XDP helpers are used or not.
v5:
- Handle both negative and positive offset after running prog
- Fix mac length in XDP_TX case (Alexei)
- Use rcu_dereference_protected() in free_netdev (kbuild test robot)
v4:
- Fix MAC header adjustment before calling prog (David Ahern)
- Disable LRO when generic XDP is installed (Michael Chan)
- Bypass qdisc et al. on XDP_TX and record the event (Alexei)
- Do not perform generic XDP on reinjected packets (DaveM)
v3:
- Make sure XDP program sees packet at MAC header, push back MAC
header if we do XDP_TX. (Alexei)
- Elide GRO when generic XDP is in use. (Alexei)
- Add XDP_FLAGS_SKB_MODE flag which the user can use to request generic
XDP even if the driver has an XDP implementation. (Alexei)
- Report whether SKB mode is in use in rtnl_xdp_fill() via XDP_FLAGS
attribute. (Daniel)
v2:
- Add some "fall through" comments in switch statements based
upon feedback from Andrew Lunn
- Use RCU for generic xdp_prog, thanks to Johannes Berg.
Tested-by: Andy Gospodarek <andy@greyhouse.net>
Tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/linux/netdevice.h | 8 | ||||
-rw-r--r-- | include/uapi/linux/if_link.h | 4 | ||||
-rw-r--r-- | net/core/dev.c | 155 | ||||
-rw-r--r-- | net/core/gro_cells.c | 2 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 40 |
5 files changed, 187 insertions, 22 deletions
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5d5267febd56..46d220c2bf92 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h | |||
@@ -1905,9 +1905,17 @@ struct net_device { | |||
1905 | struct lock_class_key *qdisc_tx_busylock; | 1905 | struct lock_class_key *qdisc_tx_busylock; |
1906 | struct lock_class_key *qdisc_running_key; | 1906 | struct lock_class_key *qdisc_running_key; |
1907 | bool proto_down; | 1907 | bool proto_down; |
1908 | struct bpf_prog __rcu *xdp_prog; | ||
1908 | }; | 1909 | }; |
1909 | #define to_net_dev(d) container_of(d, struct net_device, dev) | 1910 | #define to_net_dev(d) container_of(d, struct net_device, dev) |
1910 | 1911 | ||
1912 | static inline bool netif_elide_gro(const struct net_device *dev) | ||
1913 | { | ||
1914 | if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog) | ||
1915 | return true; | ||
1916 | return false; | ||
1917 | } | ||
1918 | |||
1911 | #define NETDEV_ALIGN 32 | 1919 | #define NETDEV_ALIGN 32 |
1912 | 1920 | ||
1913 | static inline | 1921 | static inline |
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 8b405afb2376..633aa0276d32 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h | |||
@@ -887,7 +887,9 @@ enum { | |||
887 | /* XDP section */ | 887 | /* XDP section */ |
888 | 888 | ||
889 | #define XDP_FLAGS_UPDATE_IF_NOEXIST (1U << 0) | 889 | #define XDP_FLAGS_UPDATE_IF_NOEXIST (1U << 0) |
890 | #define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST) | 890 | #define XDP_FLAGS_SKB_MODE (2U << 0) |
891 | #define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST | \ | ||
892 | XDP_FLAGS_SKB_MODE) | ||
891 | 893 | ||
892 | enum { | 894 | enum { |
893 | IFLA_XDP_UNSPEC, | 895 | IFLA_XDP_UNSPEC, |
diff --git a/net/core/dev.c b/net/core/dev.c index db6e31564d06..1b3317c026c6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c | |||
@@ -95,6 +95,7 @@ | |||
95 | #include <linux/notifier.h> | 95 | #include <linux/notifier.h> |
96 | #include <linux/skbuff.h> | 96 | #include <linux/skbuff.h> |
97 | #include <linux/bpf.h> | 97 | #include <linux/bpf.h> |
98 | #include <linux/bpf_trace.h> | ||
98 | #include <net/net_namespace.h> | 99 | #include <net/net_namespace.h> |
99 | #include <net/sock.h> | 100 | #include <net/sock.h> |
100 | #include <net/busy_poll.h> | 101 | #include <net/busy_poll.h> |
@@ -4251,6 +4252,125 @@ static int __netif_receive_skb(struct sk_buff *skb) | |||
4251 | return ret; | 4252 | return ret; |
4252 | } | 4253 | } |
4253 | 4254 | ||
4255 | static struct static_key generic_xdp_needed __read_mostly; | ||
4256 | |||
4257 | static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp) | ||
4258 | { | ||
4259 | struct bpf_prog *new = xdp->prog; | ||
4260 | int ret = 0; | ||
4261 | |||
4262 | switch (xdp->command) { | ||
4263 | case XDP_SETUP_PROG: { | ||
4264 | struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); | ||
4265 | |||
4266 | rcu_assign_pointer(dev->xdp_prog, new); | ||
4267 | if (old) | ||
4268 | bpf_prog_put(old); | ||
4269 | |||
4270 | if (old && !new) { | ||
4271 | static_key_slow_dec(&generic_xdp_needed); | ||
4272 | } else if (new && !old) { | ||
4273 | static_key_slow_inc(&generic_xdp_needed); | ||
4274 | dev_disable_lro(dev); | ||
4275 | } | ||
4276 | break; | ||
4277 | } | ||
4278 | |||
4279 | case XDP_QUERY_PROG: | ||
4280 | xdp->prog_attached = !!rcu_access_pointer(dev->xdp_prog); | ||
4281 | break; | ||
4282 | |||
4283 | default: | ||
4284 | ret = -EINVAL; | ||
4285 | break; | ||
4286 | } | ||
4287 | |||
4288 | return ret; | ||
4289 | } | ||
4290 | |||
4291 | static u32 netif_receive_generic_xdp(struct sk_buff *skb, | ||
4292 | struct bpf_prog *xdp_prog) | ||
4293 | { | ||
4294 | struct xdp_buff xdp; | ||
4295 | u32 act = XDP_DROP; | ||
4296 | void *orig_data; | ||
4297 | int hlen, off; | ||
4298 | u32 mac_len; | ||
4299 | |||
4300 | /* Reinjected packets coming from act_mirred or similar should | ||
4301 | * not get XDP generic processing. | ||
4302 | */ | ||
4303 | if (skb_cloned(skb)) | ||
4304 | return XDP_PASS; | ||
4305 | |||
4306 | if (skb_linearize(skb)) | ||
4307 | goto do_drop; | ||
4308 | |||
4309 | /* The XDP program wants to see the packet starting at the MAC | ||
4310 | * header. | ||
4311 | */ | ||
4312 | mac_len = skb->data - skb_mac_header(skb); | ||
4313 | hlen = skb_headlen(skb) + mac_len; | ||
4314 | xdp.data = skb->data - mac_len; | ||
4315 | xdp.data_end = xdp.data + hlen; | ||
4316 | xdp.data_hard_start = skb->data - skb_headroom(skb); | ||
4317 | orig_data = xdp.data; | ||
4318 | |||
4319 | act = bpf_prog_run_xdp(xdp_prog, &xdp); | ||
4320 | |||
4321 | off = xdp.data - orig_data; | ||
4322 | if (off > 0) | ||
4323 | __skb_pull(skb, off); | ||
4324 | else if (off < 0) | ||
4325 | __skb_push(skb, -off); | ||
4326 | |||
4327 | switch (act) { | ||
4328 | case XDP_TX: | ||
4329 | __skb_push(skb, mac_len); | ||
4330 | /* fall through */ | ||
4331 | case XDP_PASS: | ||
4332 | break; | ||
4333 | |||
4334 | default: | ||
4335 | bpf_warn_invalid_xdp_action(act); | ||
4336 | /* fall through */ | ||
4337 | case XDP_ABORTED: | ||
4338 | trace_xdp_exception(skb->dev, xdp_prog, act); | ||
4339 | /* fall through */ | ||
4340 | case XDP_DROP: | ||
4341 | do_drop: | ||
4342 | kfree_skb(skb); | ||
4343 | break; | ||
4344 | } | ||
4345 | |||
4346 | return act; | ||
4347 | } | ||
4348 | |||
4349 | /* When doing generic XDP we have to bypass the qdisc layer and the | ||
4350 | * network taps in order to match in-driver-XDP behavior. | ||
4351 | */ | ||
4352 | static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) | ||
4353 | { | ||
4354 | struct net_device *dev = skb->dev; | ||
4355 | struct netdev_queue *txq; | ||
4356 | bool free_skb = true; | ||
4357 | int cpu, rc; | ||
4358 | |||
4359 | txq = netdev_pick_tx(dev, skb, NULL); | ||
4360 | cpu = smp_processor_id(); | ||
4361 | HARD_TX_LOCK(dev, txq, cpu); | ||
4362 | if (!netif_xmit_stopped(txq)) { | ||
4363 | rc = netdev_start_xmit(skb, dev, txq, 0); | ||
4364 | if (dev_xmit_complete(rc)) | ||
4365 | free_skb = false; | ||
4366 | } | ||
4367 | HARD_TX_UNLOCK(dev, txq); | ||
4368 | if (free_skb) { | ||
4369 | trace_xdp_exception(dev, xdp_prog, XDP_TX); | ||
4370 | kfree_skb(skb); | ||
4371 | } | ||
4372 | } | ||
4373 | |||
4254 | static int netif_receive_skb_internal(struct sk_buff *skb) | 4374 | static int netif_receive_skb_internal(struct sk_buff *skb) |
4255 | { | 4375 | { |
4256 | int ret; | 4376 | int ret; |
@@ -4262,6 +4382,21 @@ static int netif_receive_skb_internal(struct sk_buff *skb) | |||
4262 | 4382 | ||
4263 | rcu_read_lock(); | 4383 | rcu_read_lock(); |
4264 | 4384 | ||
4385 | if (static_key_false(&generic_xdp_needed)) { | ||
4386 | struct bpf_prog *xdp_prog = rcu_dereference(skb->dev->xdp_prog); | ||
4387 | |||
4388 | if (xdp_prog) { | ||
4389 | u32 act = netif_receive_generic_xdp(skb, xdp_prog); | ||
4390 | |||
4391 | if (act != XDP_PASS) { | ||
4392 | rcu_read_unlock(); | ||
4393 | if (act == XDP_TX) | ||
4394 | generic_xdp_tx(skb, xdp_prog); | ||
4395 | return NET_RX_DROP; | ||
4396 | } | ||
4397 | } | ||
4398 | } | ||
4399 | |||
4265 | #ifdef CONFIG_RPS | 4400 | #ifdef CONFIG_RPS |
4266 | if (static_key_false(&rps_needed)) { | 4401 | if (static_key_false(&rps_needed)) { |
4267 | struct rps_dev_flow voidflow, *rflow = &voidflow; | 4402 | struct rps_dev_flow voidflow, *rflow = &voidflow; |
@@ -4494,7 +4629,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff | |||
4494 | enum gro_result ret; | 4629 | enum gro_result ret; |
4495 | int grow; | 4630 | int grow; |
4496 | 4631 | ||
4497 | if (!(skb->dev->features & NETIF_F_GRO)) | 4632 | if (netif_elide_gro(skb->dev)) |
4498 | goto normal; | 4633 | goto normal; |
4499 | 4634 | ||
4500 | if (skb->csum_bad) | 4635 | if (skb->csum_bad) |
@@ -6723,6 +6858,7 @@ EXPORT_SYMBOL(dev_change_proto_down); | |||
6723 | */ | 6858 | */ |
6724 | int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags) | 6859 | int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags) |
6725 | { | 6860 | { |
6861 | int (*xdp_op)(struct net_device *dev, struct netdev_xdp *xdp); | ||
6726 | const struct net_device_ops *ops = dev->netdev_ops; | 6862 | const struct net_device_ops *ops = dev->netdev_ops; |
6727 | struct bpf_prog *prog = NULL; | 6863 | struct bpf_prog *prog = NULL; |
6728 | struct netdev_xdp xdp; | 6864 | struct netdev_xdp xdp; |
@@ -6730,14 +6866,16 @@ int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags) | |||
6730 | 6866 | ||
6731 | ASSERT_RTNL(); | 6867 | ASSERT_RTNL(); |
6732 | 6868 | ||
6733 | if (!ops->ndo_xdp) | 6869 | xdp_op = ops->ndo_xdp; |
6734 | return -EOPNOTSUPP; | 6870 | if (!xdp_op || (flags & XDP_FLAGS_SKB_MODE)) |
6871 | xdp_op = generic_xdp_install; | ||
6872 | |||
6735 | if (fd >= 0) { | 6873 | if (fd >= 0) { |
6736 | if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) { | 6874 | if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) { |
6737 | memset(&xdp, 0, sizeof(xdp)); | 6875 | memset(&xdp, 0, sizeof(xdp)); |
6738 | xdp.command = XDP_QUERY_PROG; | 6876 | xdp.command = XDP_QUERY_PROG; |
6739 | 6877 | ||
6740 | err = ops->ndo_xdp(dev, &xdp); | 6878 | err = xdp_op(dev, &xdp); |
6741 | if (err < 0) | 6879 | if (err < 0) |
6742 | return err; | 6880 | return err; |
6743 | if (xdp.prog_attached) | 6881 | if (xdp.prog_attached) |
@@ -6753,7 +6891,7 @@ int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags) | |||
6753 | xdp.command = XDP_SETUP_PROG; | 6891 | xdp.command = XDP_SETUP_PROG; |
6754 | xdp.prog = prog; | 6892 | xdp.prog = prog; |
6755 | 6893 | ||
6756 | err = ops->ndo_xdp(dev, &xdp); | 6894 | err = xdp_op(dev, &xdp); |
6757 | if (err < 0 && prog) | 6895 | if (err < 0 && prog) |
6758 | bpf_prog_put(prog); | 6896 | bpf_prog_put(prog); |
6759 | 6897 | ||
@@ -7793,6 +7931,7 @@ EXPORT_SYMBOL(alloc_netdev_mqs); | |||
7793 | void free_netdev(struct net_device *dev) | 7931 | void free_netdev(struct net_device *dev) |
7794 | { | 7932 | { |
7795 | struct napi_struct *p, *n; | 7933 | struct napi_struct *p, *n; |
7934 | struct bpf_prog *prog; | ||
7796 | 7935 | ||
7797 | might_sleep(); | 7936 | might_sleep(); |
7798 | netif_free_tx_queues(dev); | 7937 | netif_free_tx_queues(dev); |
@@ -7811,6 +7950,12 @@ void free_netdev(struct net_device *dev) | |||
7811 | free_percpu(dev->pcpu_refcnt); | 7950 | free_percpu(dev->pcpu_refcnt); |
7812 | dev->pcpu_refcnt = NULL; | 7951 | dev->pcpu_refcnt = NULL; |
7813 | 7952 | ||
7953 | prog = rcu_dereference_protected(dev->xdp_prog, 1); | ||
7954 | if (prog) { | ||
7955 | bpf_prog_put(prog); | ||
7956 | static_key_slow_dec(&generic_xdp_needed); | ||
7957 | } | ||
7958 | |||
7814 | /* Compatibility with error handling in drivers */ | 7959 | /* Compatibility with error handling in drivers */ |
7815 | if (dev->reg_state == NETREG_UNINITIALIZED) { | 7960 | if (dev->reg_state == NETREG_UNINITIALIZED) { |
7816 | netdev_freemem(dev); | 7961 | netdev_freemem(dev); |
diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index c98bbfbd26b8..814e58a3ce8b 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c | |||
@@ -13,7 +13,7 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) | |||
13 | struct net_device *dev = skb->dev; | 13 | struct net_device *dev = skb->dev; |
14 | struct gro_cell *cell; | 14 | struct gro_cell *cell; |
15 | 15 | ||
16 | if (!gcells->cells || skb_cloned(skb) || !(dev->features & NETIF_F_GRO)) | 16 | if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(dev)) |
17 | return netif_rx(skb); | 17 | return netif_rx(skb); |
18 | 18 | ||
19 | cell = this_cpu_ptr(gcells->cells); | 19 | cell = this_cpu_ptr(gcells->cells); |
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 088f9c8b4196..9031a6c8bfa7 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c | |||
@@ -896,15 +896,13 @@ static size_t rtnl_port_size(const struct net_device *dev, | |||
896 | return port_self_size; | 896 | return port_self_size; |
897 | } | 897 | } |
898 | 898 | ||
899 | static size_t rtnl_xdp_size(const struct net_device *dev) | 899 | static size_t rtnl_xdp_size(void) |
900 | { | 900 | { |
901 | size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ | 901 | size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ |
902 | nla_total_size(1); /* XDP_ATTACHED */ | 902 | nla_total_size(1) + /* XDP_ATTACHED */ |
903 | nla_total_size(4); /* XDP_FLAGS */ | ||
903 | 904 | ||
904 | if (!dev->netdev_ops->ndo_xdp) | 905 | return xdp_size; |
905 | return 0; | ||
906 | else | ||
907 | return xdp_size; | ||
908 | } | 906 | } |
909 | 907 | ||
910 | static noinline size_t if_nlmsg_size(const struct net_device *dev, | 908 | static noinline size_t if_nlmsg_size(const struct net_device *dev, |
@@ -943,7 +941,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, | |||
943 | + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ | 941 | + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ |
944 | + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ | 942 | + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ |
945 | + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ | 943 | + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ |
946 | + rtnl_xdp_size(dev) /* IFLA_XDP */ | 944 | + rtnl_xdp_size() /* IFLA_XDP */ |
947 | + nla_total_size(1); /* IFLA_PROTO_DOWN */ | 945 | + nla_total_size(1); /* IFLA_PROTO_DOWN */ |
948 | 946 | ||
949 | } | 947 | } |
@@ -1251,23 +1249,35 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) | |||
1251 | 1249 | ||
1252 | static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) | 1250 | static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) |
1253 | { | 1251 | { |
1254 | struct netdev_xdp xdp_op = {}; | ||
1255 | struct nlattr *xdp; | 1252 | struct nlattr *xdp; |
1253 | u32 xdp_flags = 0; | ||
1254 | u8 val = 0; | ||
1256 | int err; | 1255 | int err; |
1257 | 1256 | ||
1258 | if (!dev->netdev_ops->ndo_xdp) | ||
1259 | return 0; | ||
1260 | xdp = nla_nest_start(skb, IFLA_XDP); | 1257 | xdp = nla_nest_start(skb, IFLA_XDP); |
1261 | if (!xdp) | 1258 | if (!xdp) |
1262 | return -EMSGSIZE; | 1259 | return -EMSGSIZE; |
1263 | xdp_op.command = XDP_QUERY_PROG; | 1260 | if (rcu_access_pointer(dev->xdp_prog)) { |
1264 | err = dev->netdev_ops->ndo_xdp(dev, &xdp_op); | 1261 | xdp_flags = XDP_FLAGS_SKB_MODE; |
1265 | if (err) | 1262 | val = 1; |
1266 | goto err_cancel; | 1263 | } else if (dev->netdev_ops->ndo_xdp) { |
1267 | err = nla_put_u8(skb, IFLA_XDP_ATTACHED, xdp_op.prog_attached); | 1264 | struct netdev_xdp xdp_op = {}; |
1265 | |||
1266 | xdp_op.command = XDP_QUERY_PROG; | ||
1267 | err = dev->netdev_ops->ndo_xdp(dev, &xdp_op); | ||
1268 | if (err) | ||
1269 | goto err_cancel; | ||
1270 | val = xdp_op.prog_attached; | ||
1271 | } | ||
1272 | err = nla_put_u8(skb, IFLA_XDP_ATTACHED, val); | ||
1268 | if (err) | 1273 | if (err) |
1269 | goto err_cancel; | 1274 | goto err_cancel; |
1270 | 1275 | ||
1276 | if (xdp_flags) { | ||
1277 | err = nla_put_u32(skb, IFLA_XDP_FLAGS, xdp_flags); | ||
1278 | if (err) | ||
1279 | goto err_cancel; | ||
1280 | } | ||
1271 | nla_nest_end(skb, xdp); | 1281 | nla_nest_end(skb, xdp); |
1272 | return 0; | 1282 | return 0; |
1273 | 1283 | ||