author     Eric Dumazet <edumazet@google.com>     2017-02-28 13:34:50 -0500
committer  David S. Miller <davem@davemloft.net>  2017-03-01 12:50:58 -0500
commit     39e6c8208d7b6fb9d2047850fb3327db567b564b (patch)
tree       3f0eb557e9d95e41ccfa5692dc513a7cd0590362
parent     b2d0fe35471d1a71471f99147ffb5986bd60e744 (diff)
net: solve a NAPI race
While playing with mlx4 hardware timestamping of RX packets, I found
that some packets were received by the TCP stack with a ~200 ms delay...

Since the timestamp was provided by the NIC, and my probe was added in
tcp_v4_rcv() while in the BH handler, I was confident it was not a
sender issue, or a drop in the network.

This would happen with a very low probability, but it hurt RPC
workloads.

A NAPI driver normally arms the IRQ after napi_complete_done(), after
NAPI_STATE_SCHED is cleared, so that the hard irq handler can grab it.

The problem is that if another point in the stack grabs the
NAPI_STATE_SCHED bit while IRQs are not disabled, we might later have
an IRQ firing and finding this bit set, right before
napi_complete_done() clears it.

This can happen with busy polling users, or if gro_flush_timeout is
used. But some other uses of napi_schedule() in drivers can cause this
as well.

thread 1                                 thread 2 (could be on same cpu, or not)

// busy polling or napi_watchdog()
napi_schedule();
...
napi->poll()

device polling:
read 2 packets from ring buffer

                                         Additional 3rd packet is
                                         available.

                                         device hard irq

                                         // does nothing because
                                         // NAPI_STATE_SCHED bit is
                                         // owned by thread 1.
                                         napi_schedule();

napi_complete_done(napi, 2);
rearm_irq();

Note that rearm_irq() will not force the device to send an additional
IRQ for the packet it already signaled (the 3rd packet in my example).

This patch adds a new NAPI_STATE_MISSED bit, which napi_schedule_prep()
can set if it could not grab NAPI_STATE_SCHED.

napi_complete_done() then properly reschedules the napi to make sure
we do not miss anything.

Since we manipulate multiple bits at once, use cmpxchg() as in
sk_busy_loop() to provide proper transactions.

In v2, I changed napi_watchdog() to use a relaxed variant of
napi_schedule_prep(): no need to set NAPI_STATE_MISSED from this point.

In v3, I added more details to the changelog and cleared
NAPI_STATE_MISSED in busy_poll_stop().

In v4, I applied the ideas given by Alexander Duyck in his v3 review.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--  include/linux/netdevice.h  29
-rw-r--r--  net/core/dev.c             76
2 files changed, 81 insertions, 24 deletions
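[Editor's note] Before reading the diff, it may help to see the
SCHED/MISSED handshake in isolation. Below is a minimal user-space
model, not kernel code: C11 atomics stand in for READ_ONCE()/cmpxchg(),
and all names (schedule_prep, complete_done, STATE_*) are invented for
illustration:

/* model_napi.c - toy model of the NAPI SCHED/MISSED handshake */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define STATE_SCHED  (1UL << 0)   /* poll is scheduled */
#define STATE_MISSED (1UL << 1)   /* a schedule attempt lost the race */

static atomic_ulong state;

/* Models napi_schedule_prep(): take SCHED if free; otherwise record
 * the lost wakeup by setting MISSED in the same transaction. */
static bool schedule_prep(void)
{
        unsigned long val, new;

        do {
                val = atomic_load(&state);
                new = val | STATE_SCHED;
                if (val & STATE_SCHED)
                        new |= STATE_MISSED;
        } while (!atomic_compare_exchange_strong(&state, &val, new));

        return !(val & STATE_SCHED);
}

/* Models napi_complete_done(): clear both bits, but if MISSED was set,
 * keep SCHED held and tell the caller to poll again (return false). */
static bool complete_done(void)
{
        unsigned long val, new;

        do {
                val = atomic_load(&state);
                new = val & ~(STATE_MISSED | STATE_SCHED);
                if (val & STATE_MISSED)
                        new |= STATE_SCHED;
        } while (!atomic_compare_exchange_strong(&state, &val, new));

        return !(val & STATE_MISSED);
}

int main(void)
{
        printf("poller grabs SCHED      -> %d\n", schedule_prep()); /* 1 */
        printf("hard irq loses the race -> %d\n", schedule_prep()); /* 0, MISSED set */
        printf("complete, MISSED seen   -> %d\n", complete_done()); /* 0, poll again */
        printf("complete, really done   -> %d\n", complete_done()); /* 1 */
        return 0;
}

The second complete_done() in the toy run corresponds to the extra
napi->poll() pass that the rescheduled napi performs; that extra pass
is exactly how the patch avoids losing the 3rd packet in the scenario
described above.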
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f40f0ab3847a..97456b2539e4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -330,6 +330,7 @@ struct napi_struct {
 
 enum {
 	NAPI_STATE_SCHED,	/* Poll is scheduled */
+	NAPI_STATE_MISSED,	/* reschedule a napi */
 	NAPI_STATE_DISABLE,	/* Disable pending */
 	NAPI_STATE_NPSVC,	/* Netpoll - don't dequeue from poll_list */
 	NAPI_STATE_HASHED,	/* In NAPI hash (busy polling possible) */
@@ -338,12 +339,13 @@ enum {
 };
 
 enum {
-	NAPIF_STATE_SCHED	 = (1UL << NAPI_STATE_SCHED),
-	NAPIF_STATE_DISABLE	 = (1UL << NAPI_STATE_DISABLE),
-	NAPIF_STATE_NPSVC	 = (1UL << NAPI_STATE_NPSVC),
-	NAPIF_STATE_HASHED	 = (1UL << NAPI_STATE_HASHED),
-	NAPIF_STATE_NO_BUSY_POLL = (1UL << NAPI_STATE_NO_BUSY_POLL),
-	NAPIF_STATE_IN_BUSY_POLL = (1UL << NAPI_STATE_IN_BUSY_POLL),
+	NAPIF_STATE_SCHED	 = BIT(NAPI_STATE_SCHED),
+	NAPIF_STATE_MISSED	 = BIT(NAPI_STATE_MISSED),
+	NAPIF_STATE_DISABLE	 = BIT(NAPI_STATE_DISABLE),
+	NAPIF_STATE_NPSVC	 = BIT(NAPI_STATE_NPSVC),
+	NAPIF_STATE_HASHED	 = BIT(NAPI_STATE_HASHED),
+	NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
+	NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
 };
 
 enum gro_result {
@@ -414,20 +416,7 @@ static inline bool napi_disable_pending(struct napi_struct *n)
 	return test_bit(NAPI_STATE_DISABLE, &n->state);
 }
 
-/**
- *	napi_schedule_prep - check if NAPI can be scheduled
- *	@n: NAPI context
- *
- * Test if NAPI routine is already running, and if not mark
- * it as running.  This is used as a condition variable to
- * insure only one NAPI poll instance runs.  We also make
- * sure there is no pending NAPI disable.
- */
-static inline bool napi_schedule_prep(struct napi_struct *n)
-{
-	return !napi_disable_pending(n) &&
-		!test_and_set_bit(NAPI_STATE_SCHED, &n->state);
-}
+bool napi_schedule_prep(struct napi_struct *n);
 
 /**
  *	napi_schedule - schedule NAPI poll
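[Editor's note] The conversion to BIT() above is purely cosmetic:
BIT(nr) is the kernel's shift macro (from include/linux/bitops.h), so
every NAPIF_STATE_* value is unchanged; the rewrite only keeps the
initializers aligned once NAPIF_STATE_MISSED is inserted.

#define BIT(nr) (1UL << (nr))	/* from include/linux/bitops.h */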
diff --git a/net/core/dev.c b/net/core/dev.c
index 304f2deae5f9..e63bf61b19be 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4884,6 +4884,39 @@ void __napi_schedule(struct napi_struct *n)
 EXPORT_SYMBOL(__napi_schedule);
 
 /**
+ *	napi_schedule_prep - check if napi can be scheduled
+ *	@n: napi context
+ *
+ * Test if NAPI routine is already running, and if not mark
+ * it as running.  This is used as a condition variable to
+ * insure only one NAPI poll instance runs.  We also make
+ * sure there is no pending NAPI disable.
+ */
+bool napi_schedule_prep(struct napi_struct *n)
+{
+	unsigned long val, new;
+
+	do {
+		val = READ_ONCE(n->state);
+		if (unlikely(val & NAPIF_STATE_DISABLE))
+			return false;
+		new = val | NAPIF_STATE_SCHED;
+
+		/* Sets STATE_MISSED bit if STATE_SCHED was already set
+		 * This was suggested by Alexander Duyck, as compiler
+		 * emits better code than :
+		 *	if (val & NAPIF_STATE_SCHED)
+		 *		new |= NAPIF_STATE_MISSED;
+		 */
+		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
+						   NAPIF_STATE_MISSED;
+	} while (cmpxchg(&n->state, val, new) != val);
+
+	return !(val & NAPIF_STATE_SCHED);
+}
+EXPORT_SYMBOL(napi_schedule_prep);
+
+/**
  *	__napi_schedule_irqoff - schedule for receive
  *	@n: entry to schedule
  *
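[Editor's note] The division/multiplication at the end of the loop
above is a branchless way to translate one flag into another:
(val & NAPIF_STATE_SCHED) is either 0 or NAPIF_STATE_SCHED, so dividing
by NAPIF_STATE_SCHED yields 0 or 1, and multiplying by
NAPIF_STATE_MISSED yields 0 or NAPIF_STATE_MISSED. A stand-alone check
of the identity (the flag values here are illustrative, not the
kernel's):

#include <assert.h>

#define F_SCHED  (1UL << 0)
#define F_MISSED (1UL << 1)

int main(void)
{
        unsigned long val;

        for (val = 0; val < 4; val++) {
                /* branchless: 0 or F_SCHED -> 0 or 1 -> 0 or F_MISSED */
                unsigned long a = (val & F_SCHED) / F_SCHED * F_MISSED;
                /* branchy equivalent the in-code comment refers to */
                unsigned long b = (val & F_SCHED) ? F_MISSED : 0;

                assert(a == b);
        }
        return 0;
}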
@@ -4897,7 +4930,7 @@ EXPORT_SYMBOL(__napi_schedule_irqoff);
 
 bool napi_complete_done(struct napi_struct *n, int work_done)
 {
-	unsigned long flags;
+	unsigned long flags, val, new;
 
 	/*
 	 * 1) Don't let napi dequeue from the cpu poll list
@@ -4927,7 +4960,27 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
 		list_del_init(&n->poll_list);
 		local_irq_restore(flags);
 	}
-	WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
+
+	do {
+		val = READ_ONCE(n->state);
+
+		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
+
+		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
+
+		/* If STATE_MISSED was set, leave STATE_SCHED set,
+		 * because we will call napi->poll() one more time.
+		 * This C code was suggested by Alexander Duyck to help gcc.
+		 */
+		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
+						    NAPIF_STATE_SCHED;
+	} while (cmpxchg(&n->state, val, new) != val);
+
+	if (unlikely(val & NAPIF_STATE_MISSED)) {
+		__napi_schedule(n);
+		return false;
+	}
+
 	return true;
 }
 EXPORT_SYMBOL(napi_complete_done);
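[Editor's note] With napi_complete_done() now able to return false, a
driver's poll routine should re-arm the device IRQ only when the return
value is true; false means the napi was rescheduled and another poll
pass is coming. A sketch of that driver-side pattern follows as a
kernel-code fragment; the foo_* driver, its ring structure, and its
helpers are invented for illustration:

static int foo_poll(struct napi_struct *napi, int budget)
{
	struct foo_ring *ring = container_of(napi, struct foo_ring, napi);
	int work_done = foo_clean_rx_ring(ring, budget);

	if (work_done < budget) {
		/* Re-arm the hardware IRQ only if NAPI_STATE_SCHED was
		 * really released; on false, polling will run once more. */
		if (napi_complete_done(napi, work_done))
			foo_arm_irq(ring);
	}

	return work_done;
}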
@@ -4953,6 +5006,16 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
 {
 	int rc;
 
+	/* Busy polling means there is a high chance device driver hard irq
+	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
+	 * set in napi_schedule_prep().
+	 * Since we are about to call napi->poll() once more, we can safely
+	 * clear NAPI_STATE_MISSED.
+	 *
+	 * Note: x86 could use a single "lock and ..." instruction
+	 * to perform these two clear_bit()
+	 */
+	clear_bit(NAPI_STATE_MISSED, &napi->state);
 	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
 
 	local_bh_disable();
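[Editor's note] The in-code remark about a single "lock and ..."
instruction refers to folding the two clear_bit() calls into one atomic
AND of the complemented mask. Sketched with C11 atomics rather than
kernel bitops, and with illustrative bit positions, the idea is:

#include <stdatomic.h>

#define STATE_MISSED       (1UL << 1)
#define STATE_IN_BUSY_POLL (1UL << 6)

/* One atomic read-modify-write clears both flags; with the result
 * unused, an x86 compiler can emit a single "lock and" instruction. */
static void clear_busy_poll_bits(atomic_ulong *state)
{
        atomic_fetch_and(state, ~(STATE_MISSED | STATE_IN_BUSY_POLL));
}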
@@ -5088,8 +5151,13 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 	struct napi_struct *napi;
 
 	napi = container_of(timer, struct napi_struct, timer);
-	if (napi->gro_list)
-		napi_schedule_irqoff(napi);
+
+	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
+	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
+	 */
+	if (napi->gro_list && !napi_disable_pending(napi) &&
+	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
+		__napi_schedule_irqoff(napi);
 
 	return HRTIMER_NORESTART;
 }