diff options
author | Eric Dumazet <edumazet@google.com> | 2017-02-28 13:34:50 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2017-03-01 12:50:58 -0500 |
commit | 39e6c8208d7b6fb9d2047850fb3327db567b564b (patch) | |
tree | 3f0eb557e9d95e41ccfa5692dc513a7cd0590362 | |
parent | b2d0fe35471d1a71471f99147ffb5986bd60e744 (diff) |
net: solve a NAPI race
While playing with mlx4 hardware timestamping of RX packets, I found
that some packets were received by TCP stack with a ~200 ms delay...
Since the timestamp was provided by the NIC, and my probe was added
in tcp_v4_rcv() while in BH handler, I was confident it was not
a sender issue, or a drop in the network.
This would happen with a very low probability, but would hurt RPC
workloads.
A NAPI driver normally arms the IRQ after the napi_complete_done(),
after NAPI_STATE_SCHED is cleared, so that the hard irq handler can grab
it.
Problem is that if another point in the stack grabs NAPI_STATE_SCHED bit
while IRQ are not disabled, we might have later an IRQ firing and
finding this bit set, right before napi_complete_done() clears it.
This can happen with busy polling users, or if gro_flush_timeout is
used. But some other uses of napi_schedule() in drivers can cause this
as well.
thread 1 thread 2 (could be on same cpu, or not)
// busy polling or napi_watchdog()
napi_schedule();
...
napi->poll()
device polling:
read 2 packets from ring buffer
Additional 3rd packet is
available.
device hard irq
// does nothing because
NAPI_STATE_SCHED bit is owned by thread 1
napi_schedule();
napi_complete_done(napi, 2);
rearm_irq();
Note that rearm_irq() will not force the device to send an additional
IRQ for the packet it already signaled (3rd packet in my example)
This patch adds a new NAPI_STATE_MISSED bit that napi_schedule_prep()
can set if it could not grab NAPI_STATE_SCHED.
Then napi_complete_done() properly reschedules the napi to make sure
we do not miss something.
Since we manipulate multiple bits at once, use cmpxchg() like in
sk_busy_loop() to provide proper transactions.
In v2, I changed napi_watchdog() to use a relaxed variant of
napi_schedule_prep(): no need to set NAPI_STATE_MISSED from this point.
In v3, I added more details in the changelog and cleared
NAPI_STATE_MISSED in busy_poll_stop().
In v4, I added the ideas given by Alexander Duyck in v3 review
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/linux/netdevice.h | 29 | ||||
-rw-r--r-- | net/core/dev.c | 76 |
2 files changed, 81 insertions, 24 deletions
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f40f0ab3847a..97456b2539e4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h | |||
@@ -330,6 +330,7 @@ struct napi_struct { | |||
330 | 330 | ||
331 | enum { | 331 | enum { |
332 | NAPI_STATE_SCHED, /* Poll is scheduled */ | 332 | NAPI_STATE_SCHED, /* Poll is scheduled */ |
333 | NAPI_STATE_MISSED, /* reschedule a napi */ | ||
333 | NAPI_STATE_DISABLE, /* Disable pending */ | 334 | NAPI_STATE_DISABLE, /* Disable pending */ |
334 | NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ | 335 | NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ |
335 | NAPI_STATE_HASHED, /* In NAPI hash (busy polling possible) */ | 336 | NAPI_STATE_HASHED, /* In NAPI hash (busy polling possible) */ |
@@ -338,12 +339,13 @@ enum { | |||
338 | }; | 339 | }; |
339 | 340 | ||
340 | enum { | 341 | enum { |
341 | NAPIF_STATE_SCHED = (1UL << NAPI_STATE_SCHED), | 342 | NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED), |
342 | NAPIF_STATE_DISABLE = (1UL << NAPI_STATE_DISABLE), | 343 | NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED), |
343 | NAPIF_STATE_NPSVC = (1UL << NAPI_STATE_NPSVC), | 344 | NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE), |
344 | NAPIF_STATE_HASHED = (1UL << NAPI_STATE_HASHED), | 345 | NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC), |
345 | NAPIF_STATE_NO_BUSY_POLL = (1UL << NAPI_STATE_NO_BUSY_POLL), | 346 | NAPIF_STATE_HASHED = BIT(NAPI_STATE_HASHED), |
346 | NAPIF_STATE_IN_BUSY_POLL = (1UL << NAPI_STATE_IN_BUSY_POLL), | 347 | NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), |
348 | NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), | ||
347 | }; | 349 | }; |
348 | 350 | ||
349 | enum gro_result { | 351 | enum gro_result { |
@@ -414,20 +416,7 @@ static inline bool napi_disable_pending(struct napi_struct *n) | |||
414 | return test_bit(NAPI_STATE_DISABLE, &n->state); | 416 | return test_bit(NAPI_STATE_DISABLE, &n->state); |
415 | } | 417 | } |
416 | 418 | ||
417 | /** | 419 | bool napi_schedule_prep(struct napi_struct *n); |
418 | * napi_schedule_prep - check if NAPI can be scheduled | ||
419 | * @n: NAPI context | ||
420 | * | ||
421 | * Test if NAPI routine is already running, and if not mark | ||
422 | * it as running. This is used as a condition variable to | ||
423 | * insure only one NAPI poll instance runs. We also make | ||
424 | * sure there is no pending NAPI disable. | ||
425 | */ | ||
426 | static inline bool napi_schedule_prep(struct napi_struct *n) | ||
427 | { | ||
428 | return !napi_disable_pending(n) && | ||
429 | !test_and_set_bit(NAPI_STATE_SCHED, &n->state); | ||
430 | } | ||
431 | 420 | ||
432 | /** | 421 | /** |
433 | * napi_schedule - schedule NAPI poll | 422 | * napi_schedule - schedule NAPI poll |
diff --git a/net/core/dev.c b/net/core/dev.c index 304f2deae5f9..e63bf61b19be 100644 --- a/net/core/dev.c +++ b/net/core/dev.c | |||
@@ -4884,6 +4884,39 @@ void __napi_schedule(struct napi_struct *n) | |||
4884 | EXPORT_SYMBOL(__napi_schedule); | 4884 | EXPORT_SYMBOL(__napi_schedule); |
4885 | 4885 | ||
4886 | /** | 4886 | /** |
4887 | * napi_schedule_prep - check if napi can be scheduled | ||
4888 | * @n: napi context | ||
4889 | * | ||
4890 | * Test if NAPI routine is already running, and if not mark | ||
4891 | * it as running. This is used as a condition variable | ||
4892 | * insure only one NAPI poll instance runs. We also make | ||
4893 | * sure there is no pending NAPI disable. | ||
4894 | */ | ||
4895 | bool napi_schedule_prep(struct napi_struct *n) | ||
4896 | { | ||
4897 | unsigned long val, new; | ||
4898 | |||
4899 | do { | ||
4900 | val = READ_ONCE(n->state); | ||
4901 | if (unlikely(val & NAPIF_STATE_DISABLE)) | ||
4902 | return false; | ||
4903 | new = val | NAPIF_STATE_SCHED; | ||
4904 | |||
4905 | /* Sets STATE_MISSED bit if STATE_SCHED was already set | ||
4906 | * This was suggested by Alexander Duyck, as compiler | ||
4907 | * emits better code than : | ||
4908 | * if (val & NAPIF_STATE_SCHED) | ||
4909 | * new |= NAPIF_STATE_MISSED; | ||
4910 | */ | ||
4911 | new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * | ||
4912 | NAPIF_STATE_MISSED; | ||
4913 | } while (cmpxchg(&n->state, val, new) != val); | ||
4914 | |||
4915 | return !(val & NAPIF_STATE_SCHED); | ||
4916 | } | ||
4917 | EXPORT_SYMBOL(napi_schedule_prep); | ||
4918 | |||
4919 | /** | ||
4887 | * __napi_schedule_irqoff - schedule for receive | 4920 | * __napi_schedule_irqoff - schedule for receive |
4888 | * @n: entry to schedule | 4921 | * @n: entry to schedule |
4889 | * | 4922 | * |
@@ -4897,7 +4930,7 @@ EXPORT_SYMBOL(__napi_schedule_irqoff); | |||
4897 | 4930 | ||
4898 | bool napi_complete_done(struct napi_struct *n, int work_done) | 4931 | bool napi_complete_done(struct napi_struct *n, int work_done) |
4899 | { | 4932 | { |
4900 | unsigned long flags; | 4933 | unsigned long flags, val, new; |
4901 | 4934 | ||
4902 | /* | 4935 | /* |
4903 | * 1) Don't let napi dequeue from the cpu poll list | 4936 | * 1) Don't let napi dequeue from the cpu poll list |
@@ -4927,7 +4960,27 @@ bool napi_complete_done(struct napi_struct *n, int work_done) | |||
4927 | list_del_init(&n->poll_list); | 4960 | list_del_init(&n->poll_list); |
4928 | local_irq_restore(flags); | 4961 | local_irq_restore(flags); |
4929 | } | 4962 | } |
4930 | WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state)); | 4963 | |
4964 | do { | ||
4965 | val = READ_ONCE(n->state); | ||
4966 | |||
4967 | WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); | ||
4968 | |||
4969 | new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED); | ||
4970 | |||
4971 | /* If STATE_MISSED was set, leave STATE_SCHED set, | ||
4972 | * because we will call napi->poll() one more time. | ||
4973 | * This C code was suggested by Alexander Duyck to help gcc. | ||
4974 | */ | ||
4975 | new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED * | ||
4976 | NAPIF_STATE_SCHED; | ||
4977 | } while (cmpxchg(&n->state, val, new) != val); | ||
4978 | |||
4979 | if (unlikely(val & NAPIF_STATE_MISSED)) { | ||
4980 | __napi_schedule(n); | ||
4981 | return false; | ||
4982 | } | ||
4983 | |||
4931 | return true; | 4984 | return true; |
4932 | } | 4985 | } |
4933 | EXPORT_SYMBOL(napi_complete_done); | 4986 | EXPORT_SYMBOL(napi_complete_done); |
@@ -4953,6 +5006,16 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) | |||
4953 | { | 5006 | { |
4954 | int rc; | 5007 | int rc; |
4955 | 5008 | ||
5009 | /* Busy polling means there is a high chance device driver hard irq | ||
5010 | * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was | ||
5011 | * set in napi_schedule_prep(). | ||
5012 | * Since we are about to call napi->poll() once more, we can safely | ||
5013 | * clear NAPI_STATE_MISSED. | ||
5014 | * | ||
5015 | * Note: x86 could use a single "lock and ..." instruction | ||
5016 | * to perform these two clear_bit() | ||
5017 | */ | ||
5018 | clear_bit(NAPI_STATE_MISSED, &napi->state); | ||
4956 | clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); | 5019 | clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); |
4957 | 5020 | ||
4958 | local_bh_disable(); | 5021 | local_bh_disable(); |
@@ -5088,8 +5151,13 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) | |||
5088 | struct napi_struct *napi; | 5151 | struct napi_struct *napi; |
5089 | 5152 | ||
5090 | napi = container_of(timer, struct napi_struct, timer); | 5153 | napi = container_of(timer, struct napi_struct, timer); |
5091 | if (napi->gro_list) | 5154 | |
5092 | napi_schedule_irqoff(napi); | 5155 | /* Note : we use a relaxed variant of napi_schedule_prep() not setting |
5156 | * NAPI_STATE_MISSED, since we do not react to a device IRQ. | ||
5157 | */ | ||
5158 | if (napi->gro_list && !napi_disable_pending(napi) && | ||
5159 | !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) | ||
5160 | __napi_schedule_irqoff(napi); | ||
5093 | 5161 | ||
5094 | return HRTIMER_NORESTART; | 5162 | return HRTIMER_NORESTART; |
5095 | } | 5163 | } |