author	Eric Dumazet <edumazet@google.com>	2016-11-15 13:15:11 -0500
committer	David S. Miller <davem@davemloft.net>	2016-11-16 13:40:57 -0500
commit	217f6974368188fd8bd7804bf5a036aa5762c5e4 (patch)
tree	68a2c36862d18f378342a0e1678040b02495f92f /net/core/dev.c
parent	2874aa2e467dbc0b4f7cb0ee5dc872e98e000a47 (diff)
net: busy-poll: allow preemption in sk_busy_loop()
After commit 4cd13c21b207 ("softirq: Let ksoftirqd do its job"),
sk_busy_loop() needs a bit of care: softirqs might be delayed since
we do not allow preemption yet.

This patch adds preemption points in sk_busy_loop(), and makes sure
no unnecessary cache line dirtying or atomic operations are done
while looping.

A new flag is added into napi->state: NAPI_STATE_IN_BUSY_POLL

This prevents napi_complete_done() from clearing NAPIF_STATE_SCHED,
so that sk_busy_loop() does not have to grab it again.

Similarly, netpoll_poll_lock() is done one time.

This gives about 10 to 20 % improvement in various busy polling tests,
especially when many threads are busy polling in configurations with
a large number of NIC queues.

This should allow experimenting with bigger delays without hurting
overall latencies.

Tested: on a 40Gb mlx4 NIC, 32 RX/TX queues.

echo 70 >/proc/sys/net/core/busy_read
for i in `seq 1 40`; do echo -n $i: ; ./super_netperf $i -H lpaa24 -t UDP_RR -- -N -n; done

      Before     After
 1:    90072     92819
 2:   157289    184007
 3:   235772    213504
 4:   344074    357513
 5:   394755    458267
 6:   461151    487819
 7:   549116    625963
 8:   544423    716219
 9:   720460    738446
10:   794686    837612
11:   915998    923960
12:   937507    925107
13:  1019677    971506
14:  1046831   1113650
15:  1114154   1148902
16:  1105221   1179263
17:  1266552   1299585
18:  1258454   1383817
19:  1341453   1312194
20:  1363557   1488487
21:  1387979   1501004
22:  1417552   1601683
23:  1550049   1642002
24:  1568876   1601915
25:  1560239   1683607
26:  1640207   1745211
27:  1706540   1723574
28:  1638518   1722036
29:  1734309   1757447
30:  1782007   1855436
31:  1724806   1888539
32:  1717716   1944297
33:  1778716   1869118
34:  1805738   1983466
35:  1815694   2020758
36:  1893059   2035632
37:  1843406   2034653
38:  1888830   2086580
39:  1972827   2143567
40:  1877729   2181851

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Adam Belay <abelay@google.com>
Cc: Tariq Toukan <tariqt@mellanox.com>
Cc: Yuval Mintz <Yuval.Mintz@cavium.com>
Cc: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
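For reference (not part of this patch): instead of the global
/proc/sys/net/core/busy_read knob used in the test above, an application
can opt an individual socket into busy polling with the SO_BUSY_POLL
socket option. A minimal userspace sketch, assuming a UDP socket and
reusing the 70 usec budget from the test:

/* Sketch only: per-socket busy polling via SO_BUSY_POLL (microseconds). */
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int usec = 70;	/* same budget as the busy_read sysctl above */

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	if (setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, &usec, sizeof(usec)) < 0)
		perror("setsockopt(SO_BUSY_POLL)");
	/* ... bind()/recv() as usual; reads may now busy poll up to 70 usec */
	return 0;
}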
Diffstat (limited to 'net/core/dev.c')
-rw-r--r--	net/core/dev.c	102
1 file changed, 82 insertions, 20 deletions
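The heart of the change is visible in sk_busy_loop() below: rather than
taking the napi lock on every iteration, the loop samples napi->state once
and claims both NAPIF_STATE_SCHED and NAPIF_STATE_IN_BUSY_POLL with a
single cmpxchg(). A rough userspace analogue of that claiming pattern,
using C11 atomics instead of the kernel helpers (flag names here are
stand-ins, not the kernel's):

/* Illustrative sketch only: readers test the flags with a plain load and
 * attempt a single compare-and-swap only when the napi looks free, so
 * contending busy pollers avoid dirtying the cache line on every loop.
 */
#include <stdatomic.h>
#include <stdbool.h>

#define STATE_SCHED        (1UL << 0)	/* stand-ins for NAPIF_STATE_* */
#define STATE_DISABLE      (1UL << 1)
#define STATE_IN_BUSY_POLL (1UL << 2)

static bool try_claim(atomic_ulong *state)
{
	unsigned long val = atomic_load_explicit(state, memory_order_relaxed);

	/* Someone else owns or is disabling this napi: do not touch it. */
	if (val & (STATE_DISABLE | STATE_SCHED | STATE_IN_BUSY_POLL))
		return false;

	/* One atomic RMW claims both SCHED and IN_BUSY_POLL together. */
	return atomic_compare_exchange_strong(state, &val,
					      val | STATE_SCHED |
						    STATE_IN_BUSY_POLL);
}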
diff --git a/net/core/dev.c b/net/core/dev.c
index 6deba68ad9e4..369dcc8efc01 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4902,6 +4902,12 @@ void __napi_complete(struct napi_struct *n)
 {
 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
 
+	/* Some drivers call us directly, instead of calling
+	 * napi_complete_done().
+	 */
+	if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
+		return;
+
 	list_del_init(&n->poll_list);
 	smp_mb__before_atomic();
 	clear_bit(NAPI_STATE_SCHED, &n->state);
@@ -4913,10 +4919,13 @@ void napi_complete_done(struct napi_struct *n, int work_done)
 	unsigned long flags;
 
 	/*
-	 * don't let napi dequeue from the cpu poll list
-	 * just in case its running on a different cpu
+	 * 1) Don't let napi dequeue from the cpu poll list
+	 *    just in case its running on a different cpu.
+	 * 2) If we are busy polling, do nothing here, we have
+	 *    the guarantee we will be called later.
 	 */
-	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
+	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
+				 NAPIF_STATE_IN_BUSY_POLL)))
 		return;
 
 	if (n->gro_list) {
@@ -4956,13 +4965,41 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
 }
 
 #if defined(CONFIG_NET_RX_BUSY_POLL)
+
 #define BUSY_POLL_BUDGET 8
+
+static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
+{
+	int rc;
+
+	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
+
+	local_bh_disable();
+
+	/* All we really want here is to re-enable device interrupts.
+	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
+	 */
+	rc = napi->poll(napi, BUSY_POLL_BUDGET);
+	netpoll_poll_unlock(have_poll_lock);
+	if (rc == BUSY_POLL_BUDGET)
+		__napi_schedule(napi);
+	local_bh_enable();
+	if (local_softirq_pending())
+		do_softirq();
+}
+
 bool sk_busy_loop(struct sock *sk, int nonblock)
 {
 	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
+	int (*napi_poll)(struct napi_struct *napi, int budget);
 	int (*busy_poll)(struct napi_struct *dev);
+	void *have_poll_lock = NULL;
 	struct napi_struct *napi;
-	int rc = false;
+	int rc;
+
+restart:
+	rc = false;
+	napi_poll = NULL;
 
 	rcu_read_lock();
 
@@ -4973,24 +5010,33 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
 	/* Note: ndo_busy_poll method is optional in linux-4.5 */
 	busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
 
-	do {
+	preempt_disable();
+	for (;;) {
 		rc = 0;
 		local_bh_disable();
 		if (busy_poll) {
 			rc = busy_poll(napi);
-		} else if (napi_schedule_prep(napi)) {
-			void *have = netpoll_poll_lock(napi);
-
-			if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
-				rc = napi->poll(napi, BUSY_POLL_BUDGET);
-				trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
-				if (rc == BUSY_POLL_BUDGET) {
-					napi_complete_done(napi, rc);
-					napi_schedule(napi);
-				}
-			}
-			netpoll_poll_unlock(have);
+			goto count;
 		}
+		if (!napi_poll) {
+			unsigned long val = READ_ONCE(napi->state);
+
+			/* If multiple threads are competing for this napi,
+			 * we avoid dirtying napi->state as much as we can.
+			 */
+			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
+				   NAPIF_STATE_IN_BUSY_POLL))
+				goto count;
+			if (cmpxchg(&napi->state, val,
+				    val | NAPIF_STATE_IN_BUSY_POLL |
+					  NAPIF_STATE_SCHED) != val)
+				goto count;
+			have_poll_lock = netpoll_poll_lock(napi);
+			napi_poll = napi->poll;
+		}
+		rc = napi_poll(napi, BUSY_POLL_BUDGET);
+		trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
+count:
 		if (rc > 0)
 			__NET_ADD_STATS(sock_net(sk),
 					LINUX_MIB_BUSYPOLLRXPACKETS, rc);
@@ -4999,10 +5045,26 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
 		if (rc == LL_FLUSH_FAILED)
 			break; /* permanent failure */
 
-		cpu_relax();
-	} while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
-		 !need_resched() && !busy_loop_timeout(end_time));
+		if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
+		    busy_loop_timeout(end_time))
+			break;
 
+		if (unlikely(need_resched())) {
+			if (napi_poll)
+				busy_poll_stop(napi, have_poll_lock);
+			preempt_enable();
+			rcu_read_unlock();
+			cond_resched();
+			rc = !skb_queue_empty(&sk->sk_receive_queue);
+			if (rc || busy_loop_timeout(end_time))
+				return rc;
+			goto restart;
+		}
+		cpu_relax_lowlatency();
+	}
+	if (napi_poll)
+		busy_poll_stop(napi, have_poll_lock);
+	preempt_enable();
 	rc = !skb_queue_empty(&sk->sk_receive_queue);
 out:
 	rcu_read_unlock();