 drivers/net/veth.c | 374 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 367 insertions(+), 7 deletions(-)
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index a69ad39ee57e..d3b9f10bea24 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -19,10 +19,18 @@
 #include <net/xfrm.h>
 #include <linux/veth.h>
 #include <linux/module.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/ptr_ring.h>
+#include <linux/skb_array.h>
+#include <linux/bpf_trace.h>
 
 #define DRV_NAME	"veth"
 #define DRV_VERSION	"1.0"
 
+#define VETH_RING_SIZE		256
+#define VETH_XDP_HEADROOM	(XDP_PACKET_HEADROOM + NET_IP_ALIGN)
+
 struct pcpu_vstats {
 	u64	packets;
 	u64	bytes;
@@ -30,9 +38,16 @@ struct pcpu_vstats {
 };
 
 struct veth_priv {
+	struct napi_struct	xdp_napi;
+	struct net_device	*dev;
+	struct bpf_prog __rcu	*xdp_prog;
+	struct bpf_prog		*_xdp_prog;
 	struct net_device __rcu	*peer;
 	atomic64_t		dropped;
 	unsigned		requested_headroom;
+	bool			rx_notify_masked;
+	struct ptr_ring		xdp_ring;
+	struct xdp_rxq_info	xdp_rxq;
 };
 
 /*
@@ -98,11 +113,43 @@ static const struct ethtool_ops veth_ethtool_ops = {
 	.get_link_ksettings	= veth_get_link_ksettings,
 };
 
-static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
+/* general routines */
+
+static void __veth_xdp_flush(struct veth_priv *priv)
+{
+	/* Write ptr_ring before reading rx_notify_masked */
+	smp_mb();
+	if (!priv->rx_notify_masked) {
+		priv->rx_notify_masked = true;
+		napi_schedule(&priv->xdp_napi);
+	}
+}
+
+static int veth_xdp_rx(struct veth_priv *priv, struct sk_buff *skb)
+{
+	if (unlikely(ptr_ring_produce(&priv->xdp_ring, skb))) {
+		dev_kfree_skb_any(skb);
+		return NET_RX_DROP;
+	}
+
+	return NET_RX_SUCCESS;
+}
+
+static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, bool xdp)
 {
 	struct veth_priv *priv = netdev_priv(dev);
+
+	return __dev_forward_skb(dev, skb) ?: xdp ?
+		veth_xdp_rx(priv, skb) :
+		netif_rx(skb);
+}
+
+static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
 	struct net_device *rcv;
 	int length = skb->len;
+	bool rcv_xdp = false;
 
 	rcu_read_lock();
 	rcv = rcu_dereference(priv->peer);
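
The return statement in veth_forward_skb() above chains GCC's binary "?:" (elvis) extension with a plain ternary: __dev_forward_skb() returns nonzero (NET_RX_DROP) on failure and that value is returned as-is; only on success (0) does the xdp ? veth_xdp_rx() : netif_rx() choice run. A minimal userspace sketch of the same expression, with hypothetical stand-in functions in place of the kernel helpers:

/* elvis.c - standalone illustration of the "a ?: b" GNU extension used in
 * veth_forward_skb(); check(), xdp_path() and stack_path() are hypothetical
 * stand-ins for __dev_forward_skb(), veth_xdp_rx() and netif_rx().
 * Build: gcc -std=gnu11 elvis.c
 */
#include <stdio.h>

static int check(int fail)  { return fail ? 1 : 0; }	/* 0 means NET_RX_SUCCESS */
static int xdp_path(void)   { return 10; }
static int stack_path(void) { return 20; }

static int forward(int fail, int xdp)
{
	/* parses as: check(fail) ? check(fail) : (xdp ? xdp_path() : stack_path()),
	 * except that check() is evaluated only once */
	return check(fail) ?: xdp ? xdp_path() : stack_path();
}

int main(void)
{
	printf("%d %d %d\n", forward(1, 1), forward(0, 1), forward(0, 0));
	/* prints "1 10 20": a failed check short-circuits both receive paths */
	return 0;
}
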
@@ -111,7 +158,10 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 		goto drop;
 	}
 
-	if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) {
+	rcv_priv = netdev_priv(rcv);
+	rcv_xdp = rcu_access_pointer(rcv_priv->xdp_prog);
+
+	if (likely(veth_forward_skb(rcv, skb, rcv_xdp) == NET_RX_SUCCESS)) {
 		struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
 
 		u64_stats_update_begin(&stats->syncp);
@@ -122,14 +172,15 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 drop:
 		atomic64_inc(&priv->dropped);
 	}
+
+	if (rcv_xdp)
+		__veth_xdp_flush(rcv_priv);
+
 	rcu_read_unlock();
+
 	return NETDEV_TX_OK;
 }
 
-/*
- * general routines
- */
-
 static u64 veth_stats_one(struct pcpu_vstats *result, struct net_device *dev)
 {
 	struct veth_priv *priv = netdev_priv(dev);
@@ -179,18 +230,254 @@ static void veth_set_multicast_list(struct net_device *dev)
 {
 }
 
+static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
+				      int buflen)
+{
+	struct sk_buff *skb;
+
+	if (!buflen) {
+		buflen = SKB_DATA_ALIGN(headroom + len) +
+			 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	}
+	skb = build_skb(head, buflen);
+	if (!skb)
+		return NULL;
+
+	skb_reserve(skb, headroom);
+	skb_put(skb, len);
+
+	return skb;
+}
+
+static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv,
+					struct sk_buff *skb)
+{
+	u32 pktlen, headroom, act, metalen;
+	void *orig_data, *orig_data_end;
+	struct bpf_prog *xdp_prog;
+	int mac_len, delta, off;
+	struct xdp_buff xdp;
+
+	rcu_read_lock();
+	xdp_prog = rcu_dereference(priv->xdp_prog);
+	if (unlikely(!xdp_prog)) {
+		rcu_read_unlock();
+		goto out;
+	}
+
+	mac_len = skb->data - skb_mac_header(skb);
+	pktlen = skb->len + mac_len;
+	headroom = skb_headroom(skb) - mac_len;
+
+	if (skb_shared(skb) || skb_head_is_locked(skb) ||
+	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
+		struct sk_buff *nskb;
+		int size, head_off;
+		void *head, *start;
+		struct page *page;
+
+		size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) +
+		       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+		if (size > PAGE_SIZE)
+			goto drop;
+
+		page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
+		if (!page)
+			goto drop;
+
+		head = page_address(page);
+		start = head + VETH_XDP_HEADROOM;
+		if (skb_copy_bits(skb, -mac_len, start, pktlen)) {
+			page_frag_free(head);
+			goto drop;
+		}
+
+		nskb = veth_build_skb(head,
+				      VETH_XDP_HEADROOM + mac_len, skb->len,
+				      PAGE_SIZE);
+		if (!nskb) {
+			page_frag_free(head);
+			goto drop;
+		}
+
+		skb_copy_header(nskb, skb);
+		head_off = skb_headroom(nskb) - skb_headroom(skb);
+		skb_headers_offset_update(nskb, head_off);
+		if (skb->sk)
+			skb_set_owner_w(nskb, skb->sk);
+		consume_skb(skb);
+		skb = nskb;
+	}
+
+	xdp.data_hard_start = skb->head;
+	xdp.data = skb_mac_header(skb);
+	xdp.data_end = xdp.data + pktlen;
+	xdp.data_meta = xdp.data;
+	xdp.rxq = &priv->xdp_rxq;
+	orig_data = xdp.data;
+	orig_data_end = xdp.data_end;
+
+	act = bpf_prog_run_xdp(xdp_prog, &xdp);
+
+	switch (act) {
+	case XDP_PASS:
+		break;
+	default:
+		bpf_warn_invalid_xdp_action(act);
+	case XDP_ABORTED:
+		trace_xdp_exception(priv->dev, xdp_prog, act);
+	case XDP_DROP:
+		goto drop;
+	}
+	rcu_read_unlock();
+
+	delta = orig_data - xdp.data;
+	off = mac_len + delta;
+	if (off > 0)
+		__skb_push(skb, off);
+	else if (off < 0)
+		__skb_pull(skb, -off);
+	skb->mac_header -= delta;
+	off = xdp.data_end - orig_data_end;
+	if (off != 0)
+		__skb_put(skb, off);
+	skb->protocol = eth_type_trans(skb, priv->dev);
+
+	metalen = xdp.data - xdp.data_meta;
+	if (metalen)
+		skb_metadata_set(skb, metalen);
+out:
+	return skb;
+drop:
+	rcu_read_unlock();
+	kfree_skb(skb);
+	return NULL;
+}
+
+static int veth_xdp_rcv(struct veth_priv *priv, int budget)
+{
+	int i, done = 0;
+
+	for (i = 0; i < budget; i++) {
+		struct sk_buff *skb = __ptr_ring_consume(&priv->xdp_ring);
+
+		if (!skb)
+			break;
+
+		skb = veth_xdp_rcv_skb(priv, skb);
+
+		if (skb)
+			napi_gro_receive(&priv->xdp_napi, skb);
+
+		done++;
+	}
+
+	return done;
+}
+
+static int veth_poll(struct napi_struct *napi, int budget)
+{
+	struct veth_priv *priv =
+		container_of(napi, struct veth_priv, xdp_napi);
+	int done;
+
+	done = veth_xdp_rcv(priv, budget);
+
+	if (done < budget && napi_complete_done(napi, done)) {
+		/* Write rx_notify_masked before reading ptr_ring */
+		smp_store_mb(priv->rx_notify_masked, false);
+		if (unlikely(!__ptr_ring_empty(&priv->xdp_ring))) {
+			priv->rx_notify_masked = true;
+			napi_schedule(&priv->xdp_napi);
+		}
+	}
+
+	return done;
+}
+
+static int veth_napi_add(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	int err;
+
+	err = ptr_ring_init(&priv->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
+	if (err)
+		return err;
+
+	netif_napi_add(dev, &priv->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
+	napi_enable(&priv->xdp_napi);
+
+	return 0;
+}
+
+static void veth_napi_del(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+
+	napi_disable(&priv->xdp_napi);
+	netif_napi_del(&priv->xdp_napi);
+	priv->rx_notify_masked = false;
+	ptr_ring_cleanup(&priv->xdp_ring, __skb_array_destroy_skb);
+}
+
+static int veth_enable_xdp(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	int err;
+
+	if (!xdp_rxq_info_is_reg(&priv->xdp_rxq)) {
+		err = xdp_rxq_info_reg(&priv->xdp_rxq, dev, 0);
+		if (err < 0)
+			return err;
+
+		err = xdp_rxq_info_reg_mem_model(&priv->xdp_rxq,
+						 MEM_TYPE_PAGE_SHARED, NULL);
+		if (err < 0)
+			goto err;
+
+		err = veth_napi_add(dev);
+		if (err)
+			goto err;
+	}
+
+	rcu_assign_pointer(priv->xdp_prog, priv->_xdp_prog);
+
+	return 0;
+err:
+	xdp_rxq_info_unreg(&priv->xdp_rxq);
+
+	return err;
+}
+
+static void veth_disable_xdp(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+
+	rcu_assign_pointer(priv->xdp_prog, NULL);
+	veth_napi_del(dev);
+	xdp_rxq_info_unreg(&priv->xdp_rxq);
+}
+
 static int veth_open(struct net_device *dev)
 {
 	struct veth_priv *priv = netdev_priv(dev);
 	struct net_device *peer = rtnl_dereference(priv->peer);
+	int err;
 
 	if (!peer)
 		return -ENOTCONN;
 
+	if (priv->_xdp_prog) {
+		err = veth_enable_xdp(dev);
+		if (err)
+			return err;
+	}
+
 	if (peer->flags & IFF_UP) {
 		netif_carrier_on(dev);
 		netif_carrier_on(peer);
 	}
+
 	return 0;
 }
 
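
The two barrier comments above ("Write ptr_ring before reading rx_notify_masked" in __veth_xdp_flush() and "Write rx_notify_masked before reading ptr_ring" in veth_poll()) pair up to prevent a lost wakeup: each side publishes its own write before checking the other side's state, so either the producer sees the cleared mask and schedules NAPI, or the consumer's re-check sees the queued skb. A userspace C11-atomics sketch of the same handshake; the ring is reduced to a counter, all names are illustrative rather than kernel API, and the explicit fences mirror the smp_mb()/smp_store_mb() placement even though seq_cst atomics already imply that ordering:

/* notify.c - sketch of the veth_xmit()/veth_poll() wakeup suppression.
 * ring_items stands in for the ptr_ring; wakeups counts napi_schedule() calls.
 * Build: gcc -std=c11 notify.c
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int  ring_items;
static atomic_bool rx_notify_masked;
static atomic_int  wakeups;

static void producer_send(void)			/* veth_xmit() + __veth_xdp_flush() */
{
	atomic_fetch_add(&ring_items, 1);	/* ptr_ring_produce() */
	/* write ptr_ring before reading rx_notify_masked */
	atomic_thread_fence(memory_order_seq_cst);
	if (!atomic_load(&rx_notify_masked)) {
		atomic_store(&rx_notify_masked, true);
		atomic_fetch_add(&wakeups, 1);	/* napi_schedule() */
	}
}

static int consumer_poll(void)			/* tail of veth_poll() */
{
	int done = atomic_exchange(&ring_items, 0);	/* drain the ring */

	/* write rx_notify_masked before reading ptr_ring (smp_store_mb()) */
	atomic_store(&rx_notify_masked, false);
	atomic_thread_fence(memory_order_seq_cst);
	if (atomic_load(&ring_items) != 0) {	/* a producer raced the unmask */
		atomic_store(&rx_notify_masked, true);
		atomic_fetch_add(&wakeups, 1);	/* napi_schedule() again */
	}
	return done;
}

int main(void)
{
	producer_send();	/* mask clear: exactly one wakeup */
	producer_send();	/* mask set: wakeup suppressed */
	consumer_poll();	/* drains the ring, unmasks */
	return atomic_load(&wakeups) == 1 ? 0 : 1;
}
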
@@ -203,6 +490,9 @@ static int veth_close(struct net_device *dev)
 	if (peer)
 		netif_carrier_off(peer);
 
+	if (priv->_xdp_prog)
+		veth_disable_xdp(dev);
+
 	return 0;
 }
 
@@ -228,7 +518,7 @@ static void veth_dev_free(struct net_device *dev)
 static void veth_poll_controller(struct net_device *dev)
 {
 	/* veth only receives frames when its peer sends one
-	 * Since it's a synchronous operation, we are guaranteed
+	 * Since it has nothing to do with disabling irqs, we are guaranteed
 	 * never to have pending data when we poll for it so
 	 * there is nothing to do here.
 	 *
@@ -276,6 +566,72 @@ out:
 	rcu_read_unlock();
 }
 
+static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
+			struct netlink_ext_ack *extack)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	struct bpf_prog *old_prog;
+	struct net_device *peer;
+	int err;
+
+	old_prog = priv->_xdp_prog;
+	priv->_xdp_prog = prog;
+	peer = rtnl_dereference(priv->peer);
+
+	if (prog) {
+		if (!peer) {
+			NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached");
+			err = -ENOTCONN;
+			goto err;
+		}
+
+		if (dev->flags & IFF_UP) {
+			err = veth_enable_xdp(dev);
+			if (err) {
+				NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed");
+				goto err;
+			}
+		}
+	}
+
+	if (old_prog) {
+		if (!prog && dev->flags & IFF_UP)
+			veth_disable_xdp(dev);
+		bpf_prog_put(old_prog);
+	}
+
+	return 0;
+err:
+	priv->_xdp_prog = old_prog;
+
+	return err;
+}
+
+static u32 veth_xdp_query(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	const struct bpf_prog *xdp_prog;
+
+	xdp_prog = priv->_xdp_prog;
+	if (xdp_prog)
+		return xdp_prog->aux->id;
+
+	return 0;
+}
+
+static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
+{
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return veth_xdp_set(dev, xdp->prog, xdp->extack);
+	case XDP_QUERY_PROG:
+		xdp->prog_id = veth_xdp_query(dev);
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
 static const struct net_device_ops veth_netdev_ops = {
 	.ndo_init		= veth_dev_init,
 	.ndo_open		= veth_open,
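
veth_xdp_set() above is reached through the new .ndo_bpf hook, and the datapath in this patch acts on XDP_PASS, XDP_DROP and XDP_ABORTED only; programs returning XDP_TX or XDP_REDIRECT fall into the bpf_warn_invalid_xdp_action()/drop path in veth_xdp_rcv_skb(). A minimal program of the kind this hook accepts, keeping only IPv4 frames; the headers and the "xdp" section name follow libbpf conventions rather than anything this patch mandates:

/* xdp_ipv4.c - minimal XDP program exercising the verdicts this patch handles.
 * Build: clang -O2 -g -target bpf -c xdp_ipv4.c -o xdp_ipv4.o
 */
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("xdp")
int xdp_ipv4_only(struct xdp_md *ctx)
{
	void *data     = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;
	struct ethhdr *eth = data;

	/* the verifier requires a bounds check before the header is read */
	if ((void *)(eth + 1) > data_end)
		return XDP_ABORTED;

	/* keep IPv4, drop everything else */
	if (eth->h_proto != bpf_htons(ETH_P_IP))
		return XDP_DROP;

	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";
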
@@ -290,6 +646,7 @@ static const struct net_device_ops veth_netdev_ops = {
 	.ndo_get_iflink		= veth_get_iflink,
 	.ndo_features_check	= passthru_features_check,
 	.ndo_set_rx_headroom	= veth_set_rx_headroom,
+	.ndo_bpf		= veth_xdp,
 };
 
 #define VETH_FEATURES	(NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
@@ -451,10 +808,13 @@ static int veth_newlink(struct net *src_net, struct net_device *dev,
 	 */
 
 	priv = netdev_priv(dev);
+	priv->dev = dev;
 	rcu_assign_pointer(priv->peer, peer);
 
 	priv = netdev_priv(peer);
+	priv->dev = peer;
 	rcu_assign_pointer(priv->peer, dev);
+
 	return 0;
 
 err_register_dev:
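
For completeness, one way to exercise the new hook from userspace: a hedged libbpf sketch. bpf_set_link_xdp_fd() and bpf_object__find_program_by_title() are the attach/lookup entry points of roughly this libbpf era (newer libbpf renames them bpf_xdp_attach() and bpf_object__find_program_by_name()), and the object-file and interface names are examples. The equivalent iproute2 command is "ip link set dev veth0 xdp obj xdp_ipv4.o sec xdp".

/* xdp_attach.c - sketch of attaching xdp_ipv4.o to a veth peer.
 * Link with -lbpf; error handling is trimmed to the essentials.
 */
#include <stdio.h>
#include <net/if.h>
#include <linux/if_link.h>	/* XDP_FLAGS_DRV_MODE */
#include <bpf/libbpf.h>

int main(void)
{
	struct bpf_object *obj;
	struct bpf_program *prog;
	int ifindex, prog_fd;

	ifindex = if_nametoindex("veth0");	/* example interface name */
	if (!ifindex)
		return 1;

	obj = bpf_object__open("xdp_ipv4.o");
	if (libbpf_get_error(obj) || bpf_object__load(obj))
		return 1;

	prog = bpf_object__find_program_by_title(obj, "xdp");
	if (!prog)
		return 1;
	prog_fd = bpf_program__fd(prog);

	/* veth has no offload engine; this patch adds the driver (native)
	 * mode path, which XDP_FLAGS_DRV_MODE requests explicitly. */
	if (bpf_set_link_xdp_fd(ifindex, prog_fd, XDP_FLAGS_DRV_MODE) < 0) {
		perror("bpf_set_link_xdp_fd");
		return 1;
	}

	return 0;
}
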