author		Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>	2018-08-03 03:58:10 -0400
committer	Daniel Borkmann <daniel@iogearbox.net>	2018-08-10 10:12:20 -0400
commit		948d4f214fde43743c57aae0c708bff44f6345f2 (patch)
tree		e92df717175539790941d1b74d81cddb7c25df0a
parent		b0768a86585d4d951a30ff565f19598dbbd67897 (diff)
veth: Add driver XDP
This is the basic implementation of veth driver XDP.

Incoming packets are sent from the peer veth device in the form of skb,
so this is generally doing the same thing as generic XDP.

This itself is not so useful, but it is a starting point to implement
other useful veth XDP features like TX and REDIRECT.

This introduces NAPI when XDP is enabled, because XDP now relies heavily
on NAPI context. Use ptr_ring to emulate the NIC ring. The Tx function
enqueues packets to the ring and the peer NAPI handler drains the ring.

Currently only one ring is allocated for each veth device, so it does
not scale in a multiqueue environment. This can be resolved by
allocating rings on a per-queue basis later.

Note that NAPI is not used when XDP is not loaded; netif_rx is used
instead, so this does not change the default behaviour.

v6:
- Check skb->len only when allocation is needed.
- Add __GFP_NOWARN to alloc_page() as it can be triggered by external
  events.

v3:
- Fix race on closing the device.
- Add extack messages in ndo_bpf.

v2:
- Squashed with the patch adding NAPI.
- Implement adjust_tail.
- Don't acquire consumer lock because it is guarded by NAPI.
- Make poll_controller noop since it is unnecessary.
- Register rxq_info on enabling XDP rather than on opening the device.

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
-rw-r--r--	drivers/net/veth.c	374
1 file changed, 367 insertions(+), 7 deletions(-)
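To exercise the path this patch adds, one can attach a trivial XDP program to a veth device. Below is a minimal, hypothetical test program; it is not part of the patch, and the build/attach commands assume a clang BPF toolchain and iproute2:

/* xdp_pass.c: accept every packet. Hypothetical test program.
 *
 * Build and attach (illustrative commands):
 *   clang -O2 -target bpf -c xdp_pass.c -o xdp_pass.o
 *   ip link set dev veth0 xdp obj xdp_pass.o sec xdp
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp")
int xdp_pass(struct xdp_md *ctx)
{
	return XDP_PASS;	/* hand the frame to the stack (napi_gro_receive) */
}

char _license[] SEC("license") = "GPL";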
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index a69ad39ee57e..d3b9f10bea24 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -19,10 +19,18 @@
 #include <net/xfrm.h>
 #include <linux/veth.h>
 #include <linux/module.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/ptr_ring.h>
+#include <linux/skb_array.h>
+#include <linux/bpf_trace.h>
 
 #define DRV_NAME	"veth"
 #define DRV_VERSION	"1.0"
 
+#define VETH_RING_SIZE		256
+#define VETH_XDP_HEADROOM	(XDP_PACKET_HEADROOM + NET_IP_ALIGN)
+
 struct pcpu_vstats {
 	u64	packets;
 	u64	bytes;
@@ -30,9 +38,16 @@ struct pcpu_vstats {
 };
 
 struct veth_priv {
+	struct napi_struct	xdp_napi;
+	struct net_device	*dev;
+	struct bpf_prog __rcu	*xdp_prog;
+	struct bpf_prog		*_xdp_prog;
 	struct net_device __rcu	*peer;
 	atomic64_t		dropped;
 	unsigned		requested_headroom;
+	bool			rx_notify_masked;
+	struct ptr_ring		xdp_ring;
+	struct xdp_rxq_info	xdp_rxq;
 };
 
 /*
@@ -98,11 +113,43 @@ static const struct ethtool_ops veth_ethtool_ops = {
 	.get_link_ksettings	= veth_get_link_ksettings,
 };
 
-static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
+/* general routines */
+
+static void __veth_xdp_flush(struct veth_priv *priv)
+{
+	/* Write ptr_ring before reading rx_notify_masked */
+	smp_mb();
+	if (!priv->rx_notify_masked) {
+		priv->rx_notify_masked = true;
+		napi_schedule(&priv->xdp_napi);
+	}
+}
+
+static int veth_xdp_rx(struct veth_priv *priv, struct sk_buff *skb)
+{
+	if (unlikely(ptr_ring_produce(&priv->xdp_ring, skb))) {
+		dev_kfree_skb_any(skb);
+		return NET_RX_DROP;
+	}
+
+	return NET_RX_SUCCESS;
+}
+
+static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, bool xdp)
 {
 	struct veth_priv *priv = netdev_priv(dev);
+
+	return __dev_forward_skb(dev, skb) ?: xdp ?
+		veth_xdp_rx(priv, skb) :
+		netif_rx(skb);
+}
+
+static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
 	struct net_device *rcv;
 	int length = skb->len;
+	bool rcv_xdp = false;
 
 	rcu_read_lock();
 	rcv = rcu_dereference(priv->peer);
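The smp_mb() in __veth_xdp_flush() pairs with the smp_store_mb() in veth_poll() further down: the producer writes the ring and then reads rx_notify_masked, while the consumer clears rx_notify_masked and then re-reads the ring, so at least one side always notices a concurrently queued packet and no wakeup is lost. A userspace sketch of the same protocol in C11 atomics follows; ring_produce(), ring_empty() and schedule_napi() are hypothetical stand-ins, not kernel APIs:

/* Sketch of the veth XDP wakeup protocol using C11 atomics. */
#include <stdatomic.h>
#include <stdbool.h>

extern bool ring_produce(void *pkt);	/* stand-in for ptr_ring_produce() */
extern bool ring_empty(void);		/* stand-in for __ptr_ring_empty() */
extern void schedule_napi(void);	/* stand-in for napi_schedule() */

static atomic_bool rx_notify_masked;

/* producer side: veth_xmit() -> __veth_xdp_flush() */
void xmit_side(void *pkt)
{
	ring_produce(pkt);
	/* full barrier: write the ring before reading rx_notify_masked */
	atomic_thread_fence(memory_order_seq_cst);
	if (!atomic_load(&rx_notify_masked)) {
		atomic_store(&rx_notify_masked, true);
		schedule_napi();	/* consumer was idle: wake it */
	}
}

/* consumer side: tail of veth_poll() after napi_complete_done() */
void poll_side(void)
{
	/* write rx_notify_masked before re-reading the ring (smp_store_mb) */
	atomic_store(&rx_notify_masked, false);
	atomic_thread_fence(memory_order_seq_cst);
	if (!ring_empty()) {		/* producer raced with us: rearm */
		atomic_store(&rx_notify_masked, true);
		schedule_napi();
	}
}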
@@ -111,7 +158,10 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 		goto drop;
 	}
 
-	if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) {
+	rcv_priv = netdev_priv(rcv);
+	rcv_xdp = rcu_access_pointer(rcv_priv->xdp_prog);
+
+	if (likely(veth_forward_skb(rcv, skb, rcv_xdp) == NET_RX_SUCCESS)) {
 		struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
 
 		u64_stats_update_begin(&stats->syncp);
@@ -122,14 +172,15 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 drop:
 		atomic64_inc(&priv->dropped);
 	}
+
+	if (rcv_xdp)
+		__veth_xdp_flush(rcv_priv);
+
 	rcu_read_unlock();
+
 	return NETDEV_TX_OK;
 }
 
-/*
- * general routines
- */
-
 static u64 veth_stats_one(struct pcpu_vstats *result, struct net_device *dev)
 {
 	struct veth_priv *priv = netdev_priv(dev);
@@ -179,18 +230,254 @@ static void veth_set_multicast_list(struct net_device *dev)
 {
 }
 
+static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
+				      int buflen)
+{
+	struct sk_buff *skb;
+
+	if (!buflen) {
+		buflen = SKB_DATA_ALIGN(headroom + len) +
+			 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	}
+	skb = build_skb(head, buflen);
+	if (!skb)
+		return NULL;
+
+	skb_reserve(skb, headroom);
+	skb_put(skb, len);
+
+	return skb;
+}
+
+static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv,
+					struct sk_buff *skb)
+{
+	u32 pktlen, headroom, act, metalen;
+	void *orig_data, *orig_data_end;
+	struct bpf_prog *xdp_prog;
+	int mac_len, delta, off;
+	struct xdp_buff xdp;
+
+	rcu_read_lock();
+	xdp_prog = rcu_dereference(priv->xdp_prog);
+	if (unlikely(!xdp_prog)) {
+		rcu_read_unlock();
+		goto out;
+	}
+
+	mac_len = skb->data - skb_mac_header(skb);
+	pktlen = skb->len + mac_len;
+	headroom = skb_headroom(skb) - mac_len;
+
+	if (skb_shared(skb) || skb_head_is_locked(skb) ||
+	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
+		struct sk_buff *nskb;
+		int size, head_off;
+		void *head, *start;
+		struct page *page;
+
+		size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) +
+		       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+		if (size > PAGE_SIZE)
+			goto drop;
+
+		page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
+		if (!page)
+			goto drop;
+
+		head = page_address(page);
+		start = head + VETH_XDP_HEADROOM;
+		if (skb_copy_bits(skb, -mac_len, start, pktlen)) {
+			page_frag_free(head);
+			goto drop;
+		}
+
+		nskb = veth_build_skb(head,
+				      VETH_XDP_HEADROOM + mac_len, skb->len,
+				      PAGE_SIZE);
+		if (!nskb) {
+			page_frag_free(head);
+			goto drop;
+		}
+
+		skb_copy_header(nskb, skb);
+		head_off = skb_headroom(nskb) - skb_headroom(skb);
+		skb_headers_offset_update(nskb, head_off);
+		if (skb->sk)
+			skb_set_owner_w(nskb, skb->sk);
+		consume_skb(skb);
+		skb = nskb;
+	}
+
+	xdp.data_hard_start = skb->head;
+	xdp.data = skb_mac_header(skb);
+	xdp.data_end = xdp.data + pktlen;
+	xdp.data_meta = xdp.data;
+	xdp.rxq = &priv->xdp_rxq;
+	orig_data = xdp.data;
+	orig_data_end = xdp.data_end;
+
+	act = bpf_prog_run_xdp(xdp_prog, &xdp);
+
+	switch (act) {
+	case XDP_PASS:
+		break;
+	default:
+		bpf_warn_invalid_xdp_action(act);
+	case XDP_ABORTED:
+		trace_xdp_exception(priv->dev, xdp_prog, act);
+	case XDP_DROP:
+		goto drop;
+	}
+	rcu_read_unlock();
+
+	delta = orig_data - xdp.data;
+	off = mac_len + delta;
+	if (off > 0)
+		__skb_push(skb, off);
+	else if (off < 0)
+		__skb_pull(skb, -off);
+	skb->mac_header -= delta;
+	off = xdp.data_end - orig_data_end;
+	if (off != 0)
+		__skb_put(skb, off);
+	skb->protocol = eth_type_trans(skb, priv->dev);
+
+	metalen = xdp.data - xdp.data_meta;
+	if (metalen)
+		skb_metadata_set(skb, metalen);
+out:
+	return skb;
+drop:
+	rcu_read_unlock();
+	kfree_skb(skb);
+	return NULL;
+}
+
+static int veth_xdp_rcv(struct veth_priv *priv, int budget)
+{
+	int i, done = 0;
+
+	for (i = 0; i < budget; i++) {
+		struct sk_buff *skb = __ptr_ring_consume(&priv->xdp_ring);
+
+		if (!skb)
+			break;
+
+		skb = veth_xdp_rcv_skb(priv, skb);
+
+		if (skb)
+			napi_gro_receive(&priv->xdp_napi, skb);
+
+		done++;
+	}
+
+	return done;
+}
+
+static int veth_poll(struct napi_struct *napi, int budget)
+{
+	struct veth_priv *priv =
+		container_of(napi, struct veth_priv, xdp_napi);
+	int done;
+
+	done = veth_xdp_rcv(priv, budget);
+
+	if (done < budget && napi_complete_done(napi, done)) {
+		/* Write rx_notify_masked before reading ptr_ring */
+		smp_store_mb(priv->rx_notify_masked, false);
+		if (unlikely(!__ptr_ring_empty(&priv->xdp_ring))) {
+			priv->rx_notify_masked = true;
+			napi_schedule(&priv->xdp_napi);
+		}
+	}
+
+	return done;
+}
+
+static int veth_napi_add(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	int err;
+
+	err = ptr_ring_init(&priv->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
+	if (err)
+		return err;
+
+	netif_napi_add(dev, &priv->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
+	napi_enable(&priv->xdp_napi);
+
+	return 0;
+}
+
+static void veth_napi_del(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+
+	napi_disable(&priv->xdp_napi);
+	netif_napi_del(&priv->xdp_napi);
+	priv->rx_notify_masked = false;
+	ptr_ring_cleanup(&priv->xdp_ring, __skb_array_destroy_skb);
+}
+
+static int veth_enable_xdp(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	int err;
+
+	if (!xdp_rxq_info_is_reg(&priv->xdp_rxq)) {
+		err = xdp_rxq_info_reg(&priv->xdp_rxq, dev, 0);
+		if (err < 0)
+			return err;
+
+		err = xdp_rxq_info_reg_mem_model(&priv->xdp_rxq,
+						 MEM_TYPE_PAGE_SHARED, NULL);
+		if (err < 0)
+			goto err;
+
+		err = veth_napi_add(dev);
+		if (err)
+			goto err;
+	}
+
+	rcu_assign_pointer(priv->xdp_prog, priv->_xdp_prog);
+
+	return 0;
+err:
+	xdp_rxq_info_unreg(&priv->xdp_rxq);
+
+	return err;
+}
+
+static void veth_disable_xdp(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+
+	rcu_assign_pointer(priv->xdp_prog, NULL);
+	veth_napi_del(dev);
+	xdp_rxq_info_unreg(&priv->xdp_rxq);
+}
+
 static int veth_open(struct net_device *dev)
 {
 	struct veth_priv *priv = netdev_priv(dev);
 	struct net_device *peer = rtnl_dereference(priv->peer);
+	int err;
 
 	if (!peer)
 		return -ENOTCONN;
 
+	if (priv->_xdp_prog) {
+		err = veth_enable_xdp(dev);
+		if (err)
+			return err;
+	}
+
 	if (peer->flags & IFF_UP) {
 		netif_carrier_on(dev);
 		netif_carrier_on(peer);
 	}
+
 	return 0;
 }
 
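The delta and off arithmetic after bpf_prog_run_xdp() above is needed because an XDP_PASS program may have moved xdp.data or xdp.data_end (head or tail adjustments), so the skb's headroom and length must be resynchronized before eth_type_trans(). A hypothetical program that produces a nonzero delta (illustrative only; the stack will then see 8 extra bytes in front of the Ethernet header):

/* Grow the packet front by 8 bytes; on veth this exercises the
 * __skb_push() fix-up in veth_xdp_rcv_skb(). Hypothetical example. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp")
int xdp_grow_head(struct xdp_md *ctx)
{
	/* a negative offset moves xdp.data backwards into the headroom */
	if (bpf_xdp_adjust_head(ctx, -8))
		return XDP_DROP;	/* not enough headroom */
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";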
@@ -203,6 +490,9 @@ static int veth_close(struct net_device *dev)
 	if (peer)
 		netif_carrier_off(peer);
 
+	if (priv->_xdp_prog)
+		veth_disable_xdp(dev);
+
 	return 0;
 }
 
@@ -228,7 +518,7 @@ static void veth_dev_free(struct net_device *dev)
 static void veth_poll_controller(struct net_device *dev)
 {
 	/* veth only receives frames when its peer sends one
-	 * Since it's a synchronous operation, we are guaranteed
+	 * Since it has nothing to do with disabling irqs, we are guaranteed
 	 * never to have pending data when we poll for it so
 	 * there is nothing to do here.
 	 *
@@ -276,6 +566,72 @@ out:
 	rcu_read_unlock();
 }
 
+static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
+			struct netlink_ext_ack *extack)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	struct bpf_prog *old_prog;
+	struct net_device *peer;
+	int err;
+
+	old_prog = priv->_xdp_prog;
+	priv->_xdp_prog = prog;
+	peer = rtnl_dereference(priv->peer);
+
+	if (prog) {
+		if (!peer) {
+			NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached");
+			err = -ENOTCONN;
+			goto err;
+		}
+
+		if (dev->flags & IFF_UP) {
+			err = veth_enable_xdp(dev);
+			if (err) {
+				NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed");
+				goto err;
+			}
+		}
+	}
+
+	if (old_prog) {
+		if (!prog && dev->flags & IFF_UP)
+			veth_disable_xdp(dev);
+		bpf_prog_put(old_prog);
+	}
+
+	return 0;
+err:
+	priv->_xdp_prog = old_prog;
+
+	return err;
+}
+
+static u32 veth_xdp_query(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	const struct bpf_prog *xdp_prog;
+
+	xdp_prog = priv->_xdp_prog;
+	if (xdp_prog)
+		return xdp_prog->aux->id;
+
+	return 0;
+}
+
+static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
+{
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return veth_xdp_set(dev, xdp->prog, xdp->extack);
+	case XDP_QUERY_PROG:
+		xdp->prog_id = veth_xdp_query(dev);
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
 static const struct net_device_ops veth_netdev_ops = {
 	.ndo_init		= veth_dev_init,
 	.ndo_open		= veth_open,
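veth_xdp_set() and veth_xdp_query() are reached from userspace through the regular netlink XDP path. A hypothetical loader using the libbpf API of this kernel era (bpf_prog_load() and bpf_set_link_xdp_fd(); names, headers and error handling are illustrative assumptions, not part of the patch):

/* xdp_attach.c: load an XDP object file and attach it to an interface. */
#include <stdio.h>
#include <net/if.h>
#include <linux/if_link.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

int main(int argc, char **argv)
{
	struct bpf_object *obj;
	int ifindex, prog_fd;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <ifname> <bpf-object>\n", argv[0]);
		return 1;
	}

	ifindex = if_nametoindex(argv[1]);
	if (!ifindex) {
		perror("if_nametoindex");
		return 1;
	}

	if (bpf_prog_load(argv[2], BPF_PROG_TYPE_XDP, &obj, &prog_fd)) {
		fprintf(stderr, "failed to load %s\n", argv[2]);
		return 1;
	}

	/* XDP_FLAGS_DRV_MODE requests the native driver path that this
	 * patch implements for veth (rather than generic/skb mode). */
	if (bpf_set_link_xdp_fd(ifindex, prog_fd, XDP_FLAGS_DRV_MODE) < 0) {
		fprintf(stderr, "failed to attach program\n");
		return 1;
	}

	return 0;
}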
@@ -290,6 +646,7 @@ static const struct net_device_ops veth_netdev_ops = {
 	.ndo_get_iflink		= veth_get_iflink,
 	.ndo_features_check	= passthru_features_check,
 	.ndo_set_rx_headroom	= veth_set_rx_headroom,
+	.ndo_bpf		= veth_xdp,
 };
 
 #define VETH_FEATURES	(NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
@@ -451,10 +808,13 @@ static int veth_newlink(struct net *src_net, struct net_device *dev,
 	 */
 
 	priv = netdev_priv(dev);
+	priv->dev = dev;
 	rcu_assign_pointer(priv->peer, peer);
 
 	priv = netdev_priv(peer);
+	priv->dev = peer;
 	rcu_assign_pointer(priv->peer, dev);
+
 	return 0;
 
 err_register_dev: