author     David S. Miller <davem@davemloft.net>  2014-01-17 02:46:17 -0500
committer  David S. Miller <davem@davemloft.net>  2014-01-17 02:46:17 -0500
commit     cf84eb0b09c0f09b4c70a648b9dfeec78be61f07 (patch)
tree       1d77e0acd1ff34398fba2fa211fe965dde712ba9
parent     722e47d7929b40f58c2ad609429c7293e41ca5a8 (diff)
parent     fbf28d78f54016faa7f0b68cf632ac739f2204f7 (diff)
Merge branch 'virtio_rx_merging'
Michael Dalton says:

====================
virtio-net: mergeable rx buffer size auto-tuning

The virtio-net device currently uses aligned MTU-sized mergeable receive
packet buffers. Network throughput for workloads with large average packet
size can be improved by posting larger receive packet buffers. However, due
to SKB truesize effects, posting large (e.g, PAGE_SIZE) buffers reduces the
throughput of workloads that do not benefit from GRO and have no large
inbound packets.

This patchset introduces virtio-net mergeable buffer size auto-tuning, with
buffer sizes ranging from aligned MTU-size to PAGE_SIZE. Packet buffer size
is chosen based on a per-receive queue EWMA of incoming packet size. To
unify mergeable receive buffer memory allocation and improve SKB frag
coalescing, all mergeable buffer memory allocation is migrated to
per-receive queue page frag allocators.

The per-receive queue mergeable packet buffer size is exported via sysfs,
and the network device sysfs layer has been extended to add support for
device-specific per-receive queue sysfs attribute groups.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
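For readers skimming the patch below, the auto-tuning amounts to: keep a per-receive-queue EWMA of incoming packet lengths, and size the next posted mergeable buffer to that average, clamped between the MTU-sized minimum and PAGE_SIZE. The following is a minimal, self-contained userspace C sketch of that sizing logic only; the constants and the simplified integer EWMA are illustrative approximations of GOOD_PACKET_LEN, MERGEABLE_BUFFER_ALIGN and RECEIVE_AVG_WEIGHT, not the in-kernel code (see get_mergeable_buf_len() and ewma_add() in the diff).

/* Sketch of EWMA-driven mergeable buffer sizing (approximated constants). */
#include <stdio.h>

#define GOOD_PACKET_LEN 1514u   /* ~ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN */
#define PAGE_SIZE_APPROX 4096u
#define HDR_LEN 12u             /* ~sizeof(struct virtio_net_hdr_mrg_rxbuf) */
#define BUF_ALIGN 256u          /* ~MERGEABLE_BUFFER_ALIGN */
#define EWMA_WEIGHT 64u         /* ~RECEIVE_AVG_WEIGHT */

static unsigned long ewma_avg;  /* per-receive-queue average packet length */

static void ewma_add_pkt(unsigned long len)
{
	/* avg += (len - avg) / weight, in integer arithmetic */
	ewma_avg = ewma_avg ? ewma_avg - ewma_avg / EWMA_WEIGHT + len / EWMA_WEIGHT
			    : len;
}

static unsigned int mergeable_buf_len(void)
{
	unsigned long len = ewma_avg;

	/* clamp the average between the MTU-sized minimum and PAGE_SIZE - hdr */
	if (len < GOOD_PACKET_LEN)
		len = GOOD_PACKET_LEN;
	if (len > PAGE_SIZE_APPROX - HDR_LEN)
		len = PAGE_SIZE_APPROX - HDR_LEN;
	/* round up to the buffer alignment, like ALIGN(len, MERGEABLE_BUFFER_ALIGN) */
	return (unsigned int)((HDR_LEN + len + BUF_ALIGN - 1) & ~(unsigned long)(BUF_ALIGN - 1));
}

int main(void)
{
	unsigned long sizes[] = { 100, 150, 9000, 9000, 9000 };
	for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		ewma_add_pkt(sizes[i]);
		printf("pkt %lu -> next refill buffer %u bytes\n",
		       sizes[i], mergeable_buf_len());
	}
	return 0;
}

Running the sketch shows the refill size staying at the ~1.5 KB minimum for small-packet traffic and ramping toward PAGE_SIZE only as large packets come to dominate the average, which is the truesize trade-off the cover letter describes.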
-rw-r--r--  drivers/net/virtio_net.c    197
-rw-r--r--  include/linux/netdevice.h    35
-rw-r--r--  lib/average.c                 6
-rw-r--r--  net/core/dev.c               12
-rw-r--r--  net/core/net-sysfs.c         50
-rw-r--r--  net/core/sock.c               4
6 files changed, 214 insertions(+), 90 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 9bd70aa87bf7..d75f8edf4fb3 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -26,6 +26,7 @@
 #include <linux/if_vlan.h>
 #include <linux/slab.h>
 #include <linux/cpu.h>
+#include <linux/average.h>
 
 static int napi_weight = NAPI_POLL_WEIGHT;
 module_param(napi_weight, int, 0444);
@@ -36,11 +37,18 @@ module_param(gso, bool, 0444);
 
 /* FIXME: MTU in config. */
 #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
-#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \
-                                sizeof(struct virtio_net_hdr_mrg_rxbuf), \
-                                L1_CACHE_BYTES))
 #define GOOD_COPY_LEN	128
 
+/* Weight used for the RX packet size EWMA. The average packet size is used to
+ * determine the packet buffer size when refilling RX rings. As the entire RX
+ * ring may be refilled at once, the weight is chosen so that the EWMA will be
+ * insensitive to short-term, transient changes in packet size.
+ */
+#define RECEIVE_AVG_WEIGHT 64
+
+/* Minimum alignment for mergeable packet buffers. */
+#define MERGEABLE_BUFFER_ALIGN max(L1_CACHE_BYTES, 256)
+
 #define VIRTNET_DRIVER_VERSION "1.0.0"
 
 struct virtnet_stats {
@@ -75,6 +83,12 @@ struct receive_queue {
 	/* Chain pages by the private ptr. */
 	struct page *pages;
 
+	/* Average packet length for mergeable receive buffers. */
+	struct ewma mrg_avg_pkt_len;
+
+	/* Page frag for packet buffer allocation. */
+	struct page_frag alloc_frag;
+
 	/* RX: fragments + linear part + virtio header */
 	struct scatterlist sg[MAX_SKB_FRAGS + 2];
 
@@ -123,11 +137,6 @@ struct virtnet_info {
 	/* Lock for config space updates */
 	struct mutex config_lock;
 
-	/* Page_frag for GFP_KERNEL packet buffer allocation when we run
-	 * low on memory.
-	 */
-	struct page_frag alloc_frag;
-
 	/* Does the affinity hint is set for virtqueues? */
 	bool affinity_hint_set;
 
@@ -218,6 +227,24 @@ static void skb_xmit_done(struct virtqueue *vq)
 	netif_wake_subqueue(vi->dev, vq2txq(vq));
 }
 
+static unsigned int mergeable_ctx_to_buf_truesize(unsigned long mrg_ctx)
+{
+	unsigned int truesize = mrg_ctx & (MERGEABLE_BUFFER_ALIGN - 1);
+	return (truesize + 1) * MERGEABLE_BUFFER_ALIGN;
+}
+
+static void *mergeable_ctx_to_buf_address(unsigned long mrg_ctx)
+{
+	return (void *)(mrg_ctx & -MERGEABLE_BUFFER_ALIGN);
+
+}
+
+static unsigned long mergeable_buf_to_ctx(void *buf, unsigned int truesize)
+{
+	unsigned int size = truesize / MERGEABLE_BUFFER_ALIGN;
+	return (unsigned long)buf | (size - 1);
+}
+
 /* Called from bottom half context */
 static struct sk_buff *page_to_skb(struct receive_queue *rq,
 				   struct page *page, unsigned int offset,
@@ -326,36 +353,33 @@ err:
 
 static struct sk_buff *receive_mergeable(struct net_device *dev,
 					 struct receive_queue *rq,
-					 void *buf,
+					 unsigned long ctx,
 					 unsigned int len)
 {
+	void *buf = mergeable_ctx_to_buf_address(ctx);
 	struct skb_vnet_hdr *hdr = buf;
 	int num_buf = hdr->mhdr.num_buffers;
 	struct page *page = virt_to_head_page(buf);
 	int offset = buf - page_address(page);
-	struct sk_buff *head_skb = page_to_skb(rq, page, offset, len,
-					       MERGE_BUFFER_LEN);
+	unsigned int truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
+
+	struct sk_buff *head_skb = page_to_skb(rq, page, offset, len, truesize);
 	struct sk_buff *curr_skb = head_skb;
 
 	if (unlikely(!curr_skb))
 		goto err_skb;
-
 	while (--num_buf) {
 		int num_skb_frags;
 
-		buf = virtqueue_get_buf(rq->vq, &len);
-		if (unlikely(!buf)) {
+		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &len);
+		if (unlikely(!ctx)) {
			pr_debug("%s: rx error: %d buffers out of %d missing\n",
				 dev->name, num_buf, hdr->mhdr.num_buffers);
			dev->stats.rx_length_errors++;
			goto err_buf;
		}
-		if (unlikely(len > MERGE_BUFFER_LEN)) {
-			pr_debug("%s: rx error: merge buffer too long\n",
-				 dev->name);
-			len = MERGE_BUFFER_LEN;
-		}
 
+		buf = mergeable_ctx_to_buf_address(ctx);
 		page = virt_to_head_page(buf);
 
 		num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
@@ -372,35 +396,37 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			head_skb->truesize += nskb->truesize;
 			num_skb_frags = 0;
 		}
+		truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
 		if (curr_skb != head_skb) {
 			head_skb->data_len += len;
 			head_skb->len += len;
-			head_skb->truesize += MERGE_BUFFER_LEN;
+			head_skb->truesize += truesize;
 		}
 		offset = buf - page_address(page);
 		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
 			put_page(page);
 			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
-					     len, MERGE_BUFFER_LEN);
+					     len, truesize);
 		} else {
 			skb_add_rx_frag(curr_skb, num_skb_frags, page,
-					offset, len, MERGE_BUFFER_LEN);
+					offset, len, truesize);
 		}
 	}
 
+	ewma_add(&rq->mrg_avg_pkt_len, head_skb->len);
 	return head_skb;
 
 err_skb:
 	put_page(page);
 	while (--num_buf) {
-		buf = virtqueue_get_buf(rq->vq, &len);
-		if (unlikely(!buf)) {
+		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &len);
+		if (unlikely(!ctx)) {
			pr_debug("%s: rx error: %d buffers missing\n",
				 dev->name, num_buf);
			dev->stats.rx_length_errors++;
			break;
		}
-		page = virt_to_head_page(buf);
+		page = virt_to_head_page(mergeable_ctx_to_buf_address(ctx));
 		put_page(page);
 	}
 err_buf:
@@ -420,17 +446,20 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
 	if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {
 		pr_debug("%s: short packet %i\n", dev->name, len);
 		dev->stats.rx_length_errors++;
-		if (vi->mergeable_rx_bufs)
-			put_page(virt_to_head_page(buf));
-		else if (vi->big_packets)
+		if (vi->mergeable_rx_bufs) {
+			unsigned long ctx = (unsigned long)buf;
+			void *base = mergeable_ctx_to_buf_address(ctx);
+			put_page(virt_to_head_page(base));
+		} else if (vi->big_packets) {
 			give_pages(rq, buf);
-		else
+		} else {
 			dev_kfree_skb(buf);
+		}
 		return;
 	}
 
 	if (vi->mergeable_rx_bufs)
-		skb = receive_mergeable(dev, rq, buf, len);
+		skb = receive_mergeable(dev, rq, (unsigned long)buf, len);
 	else if (vi->big_packets)
 		skb = receive_big(dev, rq, buf, len);
 	else
@@ -571,28 +600,45 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
 	return err;
 }
 
+static unsigned int get_mergeable_buf_len(struct ewma *avg_pkt_len)
+{
+	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	unsigned int len;
+
+	len = hdr_len + clamp_t(unsigned int, ewma_read(avg_pkt_len),
+				GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
+	return ALIGN(len, MERGEABLE_BUFFER_ALIGN);
+}
+
 static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
 {
-	struct virtnet_info *vi = rq->vq->vdev->priv;
-	char *buf = NULL;
+	struct page_frag *alloc_frag = &rq->alloc_frag;
+	char *buf;
+	unsigned long ctx;
 	int err;
+	unsigned int len, hole;
 
-	if (gfp & __GFP_WAIT) {
-		if (skb_page_frag_refill(MERGE_BUFFER_LEN, &vi->alloc_frag,
-					 gfp)) {
-			buf = (char *)page_address(vi->alloc_frag.page) +
-			      vi->alloc_frag.offset;
-			get_page(vi->alloc_frag.page);
-			vi->alloc_frag.offset += MERGE_BUFFER_LEN;
-		}
-	} else {
-		buf = netdev_alloc_frag(MERGE_BUFFER_LEN);
-	}
-	if (!buf)
+	len = get_mergeable_buf_len(&rq->mrg_avg_pkt_len);
+	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
 		return -ENOMEM;
 
-	sg_init_one(rq->sg, buf, MERGE_BUFFER_LEN);
-	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
+	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+	ctx = mergeable_buf_to_ctx(buf, len);
+	get_page(alloc_frag->page);
+	alloc_frag->offset += len;
+	hole = alloc_frag->size - alloc_frag->offset;
+	if (hole < len) {
+		/* To avoid internal fragmentation, if there is very likely not
+		 * enough space for another buffer, add the remaining space to
+		 * the current buffer. This extra space is not included in
+		 * the truesize stored in ctx.
+		 */
+		len += hole;
+		alloc_frag->offset += hole;
+	}
+
+	sg_init_one(rq->sg, buf, len);
+	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, (void *)ctx, gfp);
 	if (err < 0)
 		put_page(virt_to_head_page(buf));
 
@@ -612,6 +658,7 @@ static bool try_fill_recv(struct receive_queue *rq, gfp_t gfp)
 	int err;
 	bool oom;
 
+	gfp |= __GFP_COLD;
 	do {
 		if (vi->mergeable_rx_bufs)
 			err = add_recvbuf_mergeable(rq, gfp);
@@ -1368,6 +1415,14 @@ static void free_receive_bufs(struct virtnet_info *vi)
 	}
 }
 
+static void free_receive_page_frags(struct virtnet_info *vi)
+{
+	int i;
+	for (i = 0; i < vi->max_queue_pairs; i++)
+		if (vi->rq[i].alloc_frag.page)
+			put_page(vi->rq[i].alloc_frag.page);
+}
+
 static void free_unused_bufs(struct virtnet_info *vi)
 {
 	void *buf;
@@ -1383,12 +1438,15 @@ static void free_unused_bufs(struct virtnet_info *vi)
 		struct virtqueue *vq = vi->rq[i].vq;
 
 		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
-			if (vi->mergeable_rx_bufs)
-				put_page(virt_to_head_page(buf));
-			else if (vi->big_packets)
+			if (vi->mergeable_rx_bufs) {
+				unsigned long ctx = (unsigned long)buf;
+				void *base = mergeable_ctx_to_buf_address(ctx);
+				put_page(virt_to_head_page(base));
+			} else if (vi->big_packets) {
 				give_pages(&vi->rq[i], buf);
-			else
+			} else {
 				dev_kfree_skb(buf);
+			}
 		}
 	}
 }
@@ -1496,6 +1554,7 @@ static int virtnet_alloc_queues(struct virtnet_info *vi)
 			       napi_weight);
 
 		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
+		ewma_init(&vi->rq[i].mrg_avg_pkt_len, 1, RECEIVE_AVG_WEIGHT);
 		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
 	}
 
@@ -1532,6 +1591,33 @@ err:
 	return ret;
 }
 
+#ifdef CONFIG_SYSFS
+static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
+		struct rx_queue_attribute *attribute, char *buf)
+{
+	struct virtnet_info *vi = netdev_priv(queue->dev);
+	unsigned int queue_index = get_netdev_rx_queue_index(queue);
+	struct ewma *avg;
+
+	BUG_ON(queue_index >= vi->max_queue_pairs);
+	avg = &vi->rq[queue_index].mrg_avg_pkt_len;
+	return sprintf(buf, "%u\n", get_mergeable_buf_len(avg));
+}
+
+static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
+	__ATTR_RO(mergeable_rx_buffer_size);
+
+static struct attribute *virtio_net_mrg_rx_attrs[] = {
+	&mergeable_rx_buffer_size_attribute.attr,
+	NULL
+};
+
+static const struct attribute_group virtio_net_mrg_rx_group = {
+	.name = "virtio_net",
+	.attrs = virtio_net_mrg_rx_attrs
+};
+#endif
+
 static int virtnet_probe(struct virtio_device *vdev)
 {
 	int i, err;
@@ -1646,6 +1732,10 @@ static int virtnet_probe(struct virtio_device *vdev)
 	if (err)
 		goto free_stats;
 
+#ifdef CONFIG_SYSFS
+	if (vi->mergeable_rx_bufs)
+		dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
+#endif
 	netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
 	netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);
 
@@ -1695,9 +1785,8 @@ free_recv_bufs:
 	unregister_netdev(dev);
 free_vqs:
 	cancel_delayed_work_sync(&vi->refill);
+	free_receive_page_frags(vi);
 	virtnet_del_vqs(vi);
-	if (vi->alloc_frag.page)
-		put_page(vi->alloc_frag.page);
 free_stats:
 	free_percpu(vi->stats);
 free:
@@ -1714,6 +1803,8 @@ static void remove_vq_common(struct virtnet_info *vi)
 
 	free_receive_bufs(vi);
 
+	free_receive_page_frags(vi);
+
 	virtnet_del_vqs(vi);
 }
 
@@ -1731,8 +1822,6 @@ static void virtnet_remove(struct virtio_device *vdev)
 	unregister_netdev(vi->dev);
 
 	remove_vq_common(vi);
-	if (vi->alloc_frag.page)
-		put_page(vi->alloc_frag.page);
 
 	flush_work(&vi->config_work);
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d7668b881d08..e985231fe04b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -668,15 +668,28 @@ extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id,
			 u16 filter_id);
 #endif
+#endif /* CONFIG_RPS */
 
 /* This structure contains an instance of an RX queue. */
 struct netdev_rx_queue {
+#ifdef CONFIG_RPS
 	struct rps_map __rcu *rps_map;
 	struct rps_dev_flow_table __rcu *rps_flow_table;
+#endif
 	struct kobject kobj;
 	struct net_device *dev;
 } ____cacheline_aligned_in_smp;
-#endif /* CONFIG_RPS */
+
+/*
+ * RX queue sysfs structures and functions.
+ */
+struct rx_queue_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct netdev_rx_queue *queue,
+	    struct rx_queue_attribute *attr, char *buf);
+	ssize_t (*store)(struct netdev_rx_queue *queue,
+	    struct rx_queue_attribute *attr, const char *buf, size_t len);
+};
 
 #ifdef CONFIG_XPS
 /*
@@ -1313,7 +1326,7 @@ struct net_device {
						    unicast) */
 
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 	struct netdev_rx_queue	*_rx;
 
 	/* Number of RX queues allocated at register_netdev() time */
@@ -1424,6 +1437,8 @@ struct net_device {
 	struct device dev;
 	/* space for optional device, statistics, and wireless sysfs groups */
 	const struct attribute_group *sysfs_groups[4];
+	/* space for optional per-rx queue attributes */
+	const struct attribute_group *sysfs_rx_queue_group;
 
 	/* rtnetlink link ops */
 	const struct rtnl_link_ops *rtnl_link_ops;
@@ -2375,7 +2390,7 @@ static inline bool netif_is_multiqueue(const struct net_device *dev)
 
 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq);
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq);
 #else
 static inline int netif_set_real_num_rx_queues(struct net_device *dev,
@@ -2394,7 +2409,7 @@ static inline int netif_copy_real_num_queues(struct net_device *to_dev,
					      from_dev->real_num_tx_queues);
 	if (err)
 		return err;
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 	return netif_set_real_num_rx_queues(to_dev,
					    from_dev->real_num_rx_queues);
 #else
@@ -2402,6 +2417,18 @@ static inline int netif_copy_real_num_queues(struct net_device *to_dev,
 #endif
 }
 
+#ifdef CONFIG_SYSFS
+static inline unsigned int get_netdev_rx_queue_index(
+		struct netdev_rx_queue *queue)
+{
+	struct net_device *dev = queue->dev;
+	int index = queue - dev->_rx;
+
+	BUG_ON(index >= dev->num_rx_queues);
+	return index;
+}
+#endif
+
 #define DEFAULT_MAX_NUM_RSS_QUEUES	(8)
 int netif_get_num_default_rss_queues(void);
 
diff --git a/lib/average.c b/lib/average.c
index 99a67e662b3c..114d1beae0c7 100644
--- a/lib/average.c
+++ b/lib/average.c
@@ -53,8 +53,10 @@ EXPORT_SYMBOL(ewma_init);
  */
 struct ewma *ewma_add(struct ewma *avg, unsigned long val)
 {
-	avg->internal = avg->internal ?
-		(((avg->internal << avg->weight) - avg->internal) +
+	unsigned long internal = ACCESS_ONCE(avg->internal);
+
+	ACCESS_ONCE(avg->internal) = internal ?
+		(((internal << avg->weight) - internal) +
			(val << avg->factor)) >> avg->weight :
		(val << avg->factor);
 	return avg;
diff --git a/net/core/dev.c b/net/core/dev.c
index f87bedd51eed..288df6232006 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2083,7 +2083,7 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 }
 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 /**
  * netif_set_real_num_rx_queues - set actual number of RX queues used
  * @dev: Network device
@@ -5764,7 +5764,7 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev,
 }
 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 static int netif_alloc_rx_queues(struct net_device *dev)
 {
	unsigned int i, count = dev->num_rx_queues;
@@ -6309,7 +6309,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
		return NULL;
	}
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
	if (rxqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
		return NULL;
@@ -6365,7 +6365,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
	if (netif_alloc_netdev_queues(dev))
		goto free_all;
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
	dev->num_rx_queues = rxqs;
	dev->real_num_rx_queues = rxqs;
	if (netif_alloc_rx_queues(dev))
@@ -6385,7 +6385,7 @@ free_all:
 free_pcpu:
	free_percpu(dev->pcpu_refcnt);
	netif_free_tx_queues(dev);
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
	kfree(dev->_rx);
 #endif
 
@@ -6410,7 +6410,7 @@ void free_netdev(struct net_device *dev)
	release_net(dev_net(dev));
 
	netif_free_tx_queues(dev);
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
	kfree(dev->_rx);
 #endif
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 49843bf7e43e..7eeadeecc5a2 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -498,17 +498,7 @@ static struct attribute_group wireless_group = {
 #define net_class_groups	NULL
 #endif /* CONFIG_SYSFS */
 
-#ifdef CONFIG_RPS
-/*
- * RX queue sysfs structures and functions.
- */
-struct rx_queue_attribute {
-	struct attribute attr;
-	ssize_t (*show)(struct netdev_rx_queue *queue,
-	    struct rx_queue_attribute *attr, char *buf);
-	ssize_t (*store)(struct netdev_rx_queue *queue,
-	    struct rx_queue_attribute *attr, const char *buf, size_t len);
-};
+#ifdef CONFIG_SYSFS
 #define to_rx_queue_attr(_attr) container_of(_attr,		\
     struct rx_queue_attribute, attr)
 
@@ -543,6 +533,7 @@ static const struct sysfs_ops rx_queue_sysfs_ops = {
	.store = rx_queue_attr_store,
 };
 
+#ifdef CONFIG_RPS
 static ssize_t show_rps_map(struct netdev_rx_queue *queue,
			    struct rx_queue_attribute *attribute, char *buf)
 {
@@ -718,16 +709,20 @@
 static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute =
	__ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR,
	       show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
+#endif /* CONFIG_RPS */
 
 static struct attribute *rx_queue_default_attrs[] = {
+#ifdef CONFIG_RPS
	&rps_cpus_attribute.attr,
	&rps_dev_flow_table_cnt_attribute.attr,
+#endif
	NULL
 };
 
 static void rx_queue_release(struct kobject *kobj)
 {
	struct netdev_rx_queue *queue = to_rx_queue(kobj);
+#ifdef CONFIG_RPS
	struct rps_map *map;
	struct rps_dev_flow_table *flow_table;
 
@@ -743,6 +738,7 @@ static void rx_queue_release(struct kobject *kobj)
		RCU_INIT_POINTER(queue->rps_flow_table, NULL);
		call_rcu(&flow_table->rcu, rps_dev_flow_table_release);
	}
+#endif
 
	memset(kobj, 0, sizeof(*kobj));
	dev_put(queue->dev);
@@ -763,25 +759,36 @@ static int rx_queue_add_kobject(struct net_device *net, int index)
	kobj->kset = net->queues_kset;
	error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
	    "rx-%u", index);
-	if (error) {
-		kobject_put(kobj);
-		return error;
+	if (error)
+		goto exit;
+
+	if (net->sysfs_rx_queue_group) {
+		error = sysfs_create_group(kobj, net->sysfs_rx_queue_group);
+		if (error)
+			goto exit;
	}
 
	kobject_uevent(kobj, KOBJ_ADD);
	dev_hold(queue->dev);
 
	return error;
+exit:
+	kobject_put(kobj);
+	return error;
 }
-#endif /* CONFIG_RPS */
+#endif /* CONFIG_SYFS */
 
 int
 net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
 {
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
	int i;
	int error = 0;
 
+#ifndef CONFIG_RPS
+	if (!net->sysfs_rx_queue_group)
+		return 0;
+#endif
	for (i = old_num; i < new_num; i++) {
		error = rx_queue_add_kobject(net, i);
		if (error) {
@@ -790,8 +797,12 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
		}
	}
 
-	while (--i >= new_num)
+	while (--i >= new_num) {
+		if (net->sysfs_rx_queue_group)
+			sysfs_remove_group(&net->_rx[i].kobj,
+					   net->sysfs_rx_queue_group);
		kobject_put(&net->_rx[i].kobj);
+	}
 
	return error;
 #else
@@ -1155,9 +1166,6 @@ static int register_queue_kobjects(struct net_device *net)
					   NULL, &net->dev.kobj);
	if (!net->queues_kset)
		return -ENOMEM;
-#endif
-
-#ifdef CONFIG_RPS
	real_rx = net->real_num_rx_queues;
 #endif
	real_tx = net->real_num_tx_queues;
@@ -1184,7 +1192,7 @@ static void remove_queue_kobjects(struct net_device *net)
 {
	int real_rx = 0, real_tx = 0;
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
	real_rx = net->real_num_rx_queues;
 #endif
	real_tx = net->real_num_tx_queues;
diff --git a/net/core/sock.c b/net/core/sock.c
index 85ad6f0d3898..b3f7ee3008a0 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1836,9 +1836,7 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)
			put_page(pfrag->page);
	}
 
-	/* We restrict high order allocations to users that can afford to wait */
-	order = (prio & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
-
+	order = SKB_FRAG_PAGE_ORDER;
	do {
		gfp_t gfp = prio;
 