author     Ingo Molnar <mingo@kernel.org>   2012-09-25 03:30:58 -0400
committer  Ingo Molnar <mingo@kernel.org>   2012-09-26 03:46:10 -0400
commit     a9b86fab4b0a36fc4cd2712a07259c2c0e769742 (patch)
tree       e4db58440018a52089e8d6b39160f753ab10df99
parent     9b20aa63b8fc9a6a3b6831f4eae3621755e51211 (diff)
parent     593d1006cdf710ab3469c0c37c184fea0bc3da97 (diff)
Merge branch 'rcu/next' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu into core/rcu
Pull v3.7 RCU commits from Paul E. McKenney:

" 0. A fix for a latent bug that has been in RCU ever since the addition
     of CPU stall warnings.  This bug results in false-positive stall
     warnings, but thus far only on embedded systems with severely
     cut-down userspace configurations.  This fix is located on an
     rcu/urgent branch, with the rest of the commits based on top of it.
     This commit CCs stable.  Given that the merge window is coming quite
     soon and given the small number of affected users, I do -not-
     recommend pushing it to 3.6, but the separate branch makes it easy
     to find if someone needs it.

  1. Further reductions in latency spikes for huge systems, along with
     additional boot-time adaptation to the actual hardware.  This is a
     large change, as it moves RCU grace-period initialization and
     cleanup, along with quiescent-state forcing, from softirq to a
     kthread.  However, it appears to be in quite good shape (famous
     last words).  Posted to LKML at https://lkml.org/lkml/2012/9/20/427.

  2. Updates to documentation and rcutorture, the latter category
     including keeping statistics on CPU-hotplug latencies and fixing
     some initialization-time races.  Posted to LKML at
     https://lkml.org/lkml/2012/8/30/193.

  3. Miscellaneous fixes and improvements, posted to LKML at
     https://lkml.org/lkml/2012/8/30/199.

  4. CPU-hotplug fixes and improvements, posted to LKML at
     https://lkml.org/lkml/2012/8/30/292 for first three and at
     https://lkml.org/lkml/2012/8/3/416.

  5. Idle-loop fixes that were omitted on an earlier submission, posted
     to LKML at https://lkml.org/lkml/2012/8/30/251. "

Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--  Documentation/RCU/checklist.txt          |    6
-rw-r--r--  Documentation/RCU/stallwarn.txt          |   16
-rw-r--r--  Documentation/RCU/trace.txt              |   43
-rw-r--r--  Documentation/RCU/whatisRCU.txt          |    9
-rw-r--r--  Documentation/kernel-parameters.txt      |   11
-rw-r--r--  arch/alpha/kernel/process.c              |    6
-rw-r--r--  arch/alpha/kernel/smp.c                  |    1
-rw-r--r--  arch/cris/kernel/process.c               |    3
-rw-r--r--  arch/frv/kernel/process.c                |    3
-rw-r--r--  arch/h8300/kernel/process.c              |    3
-rw-r--r--  arch/ia64/kernel/process.c               |    3
-rw-r--r--  arch/m32r/kernel/process.c               |    3
-rw-r--r--  arch/m68k/kernel/process.c               |    3
-rw-r--r--  arch/mn10300/kernel/process.c            |    3
-rw-r--r--  arch/parisc/kernel/process.c             |    3
-rw-r--r--  arch/score/kernel/process.c              |    4
-rw-r--r--  arch/x86/kernel/cpuid.c                  |    5
-rw-r--r--  arch/x86/kernel/msr.c                    |    5
-rw-r--r--  arch/xtensa/kernel/process.c             |    3
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_irq.c    |  250
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_irq.h    |    6
-rw-r--r--  include/linux/interrupt.h                |    2
-rw-r--r--  include/linux/kthread.h                  |   11
-rw-r--r--  include/linux/rcupdate.h                 |    6
-rw-r--r--  include/linux/smpboot.h                  |   43
-rw-r--r--  kernel/Makefile                          |    3
-rw-r--r--  kernel/cpu.c                             |   10
-rw-r--r--  kernel/kthread.c                         |  185
-rw-r--r--  kernel/rcupdate.c                        |    4
-rw-r--r--  kernel/rcutiny.c                         |   33
-rw-r--r--  kernel/rcutiny_plugin.h                  |   10
-rw-r--r--  kernel/rcutorture.c                      |  159
-rw-r--r--  kernel/rcutree.c                         |  706
-rw-r--r--  kernel/rcutree.h                         |   46
-rw-r--r--  kernel/rcutree_plugin.h                  |  577
-rw-r--r--  kernel/rcutree_trace.c                   |   22
-rw-r--r--  kernel/sched/core.c                      |    2
-rw-r--r--  kernel/smpboot.c                         |  233
-rw-r--r--  kernel/smpboot.h                         |    4
-rw-r--r--  kernel/softirq.c                         |  111
-rw-r--r--  kernel/time/tick-sched.c                 |    3
-rw-r--r--  kernel/watchdog.c                        |  263
-rw-r--r--  lib/Kconfig.debug                        |   14
-rw-r--r--  mm/kmemleak.c                            |    6
44 files changed, 1507 insertions, 1335 deletions
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index fc103d7a0474..cdb20d41a44a 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -310,6 +310,12 @@ over a rather long period of time, but improvements are always welcome!
310 code under the influence of preempt_disable(), you instead 310 code under the influence of preempt_disable(), you instead
311 need to use synchronize_irq() or synchronize_sched(). 311 need to use synchronize_irq() or synchronize_sched().
312 312
313 This same limitation also applies to synchronize_rcu_bh()
314 and synchronize_srcu(), as well as to the asynchronous and
315 expedited forms of the three primitives, namely call_rcu(),
316 call_rcu_bh(), call_srcu(), synchronize_rcu_expedited(),
317 synchronize_rcu_bh_expedited(), and synchronize_srcu_expedited().
318
31312. Any lock acquired by an RCU callback must be acquired elsewhere 31912. Any lock acquired by an RCU callback must be acquired elsewhere
314 with softirq disabled, e.g., via spin_lock_irqsave(), 320 with softirq disabled, e.g., via spin_lock_irqsave(),
315 spin_lock_bh(), etc. Failing to disable irq on a given 321 spin_lock_bh(), etc. Failing to disable irq on a given
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index 523364e4e1f1..1927151b386b 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -99,7 +99,7 @@ In kernels with CONFIG_RCU_FAST_NO_HZ, even more information is
99printed: 99printed:
100 100
101 INFO: rcu_preempt detected stall on CPU 101 INFO: rcu_preempt detected stall on CPU
102 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 drain=0 . timer=-1 102 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 drain=0 . timer not pending
103 (t=65000 jiffies) 103 (t=65000 jiffies)
104 104
105The "(64628 ticks this GP)" indicates that this CPU has taken more 105The "(64628 ticks this GP)" indicates that this CPU has taken more
@@ -116,13 +116,13 @@ number between the two "/"s is the value of the nesting, which will
116be a small positive number if in the idle loop and a very large positive 116be a small positive number if in the idle loop and a very large positive
117number (as shown above) otherwise. 117number (as shown above) otherwise.
118 118
119For CONFIG_RCU_FAST_NO_HZ kernels, the "drain=0" indicates that the 119For CONFIG_RCU_FAST_NO_HZ kernels, the "drain=0" indicates that the CPU is
120CPU is not in the process of trying to force itself into dyntick-idle 120not in the process of trying to force itself into dyntick-idle state, the
121state, the "." indicates that the CPU has not given up forcing RCU 121"." indicates that the CPU has not given up forcing RCU into dyntick-idle
122into dyntick-idle mode (it would be "H" otherwise), and the "timer=-1" 122mode (it would be "H" otherwise), and the "timer not pending" indicates
123indicates that the CPU has not recented forced RCU into dyntick-idle 123that the CPU has not recently forced RCU into dyntick-idle mode (it
124mode (it would otherwise indicate the number of microseconds remaining 124would otherwise indicate the number of microseconds remaining in this
125in this forced state). 125forced state).
126 126
127 127
128Multiple Warnings From One Stall 128Multiple Warnings From One Stall
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index f6f15ce39903..672d19083252 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -333,23 +333,23 @@ o Each element of the form "1/1 0:127 ^0" represents one struct
333The output of "cat rcu/rcu_pending" looks as follows: 333The output of "cat rcu/rcu_pending" looks as follows:
334 334
335rcu_sched: 335rcu_sched:
336 0 np=255892 qsp=53936 rpq=85 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741 336 0 np=255892 qsp=53936 rpq=85 cbr=0 cng=14417 gpc=10033 gps=24320 nn=146741
337 1 np=261224 qsp=54638 rpq=33 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792 337 1 np=261224 qsp=54638 rpq=33 cbr=0 cng=25723 gpc=16310 gps=2849 nn=155792
338 2 np=237496 qsp=49664 rpq=23 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629 338 2 np=237496 qsp=49664 rpq=23 cbr=0 cng=2762 gpc=45478 gps=1762 nn=136629
339 3 np=236249 qsp=48766 rpq=98 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723 339 3 np=236249 qsp=48766 rpq=98 cbr=0 cng=286 gpc=48049 gps=1218 nn=137723
340 4 np=221310 qsp=46850 rpq=7 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110 340 4 np=221310 qsp=46850 rpq=7 cbr=0 cng=26 gpc=43161 gps=4634 nn=123110
341 5 np=237332 qsp=48449 rpq=9 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456 341 5 np=237332 qsp=48449 rpq=9 cbr=0 cng=54 gpc=47920 gps=3252 nn=137456
342 6 np=219995 qsp=46718 rpq=12 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834 342 6 np=219995 qsp=46718 rpq=12 cbr=0 cng=50 gpc=42098 gps=6093 nn=120834
343 7 np=249893 qsp=49390 rpq=42 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888 343 7 np=249893 qsp=49390 rpq=42 cbr=0 cng=72 gpc=38400 gps=17102 nn=144888
344rcu_bh: 344rcu_bh:
345 0 np=146741 qsp=1419 rpq=6 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314 345 0 np=146741 qsp=1419 rpq=6 cbr=0 cng=6 gpc=0 gps=0 nn=145314
346 1 np=155792 qsp=12597 rpq=3 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180 346 1 np=155792 qsp=12597 rpq=3 cbr=0 cng=0 gpc=4 gps=8 nn=143180
347 2 np=136629 qsp=18680 rpq=1 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936 347 2 np=136629 qsp=18680 rpq=1 cbr=0 cng=0 gpc=7 gps=6 nn=117936
348 3 np=137723 qsp=2843 rpq=0 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863 348 3 np=137723 qsp=2843 rpq=0 cbr=0 cng=0 gpc=10 gps=7 nn=134863
349 4 np=123110 qsp=12433 rpq=0 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671 349 4 np=123110 qsp=12433 rpq=0 cbr=0 cng=0 gpc=4 gps=2 nn=110671
350 5 np=137456 qsp=4210 rpq=1 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235 350 5 np=137456 qsp=4210 rpq=1 cbr=0 cng=0 gpc=6 gps=5 nn=133235
351 6 np=120834 qsp=9902 rpq=2 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921 351 6 np=120834 qsp=9902 rpq=2 cbr=0 cng=0 gpc=6 gps=3 nn=110921
352 7 np=144888 qsp=26336 rpq=0 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542 352 7 np=144888 qsp=26336 rpq=0 cbr=0 cng=0 gpc=8 gps=2 nn=118542
353 353
354As always, this is once again split into "rcu_sched" and "rcu_bh" 354As always, this is once again split into "rcu_sched" and "rcu_bh"
355portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional 355portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional
@@ -377,17 +377,6 @@ o "gpc" is the number of times that an old grace period had
377o "gps" is the number of times that a new grace period had started, 377o "gps" is the number of times that a new grace period had started,
378 but this CPU was not yet aware of it. 378 but this CPU was not yet aware of it.
379 379
380o "nf" is the number of times that this CPU suspected that the
381 current grace period had run for too long, and thus needed to
382 be forced.
383
384 Please note that "forcing" consists of sending resched IPIs
385 to holdout CPUs. If that CPU really still is in an old RCU
386 read-side critical section, then we really do have to wait for it.
387 The assumption behing "forcing" is that the CPU is not still in
388 an old RCU read-side critical section, but has not yet responded
389 for some other reason.
390
391o "nn" is the number of times that this CPU needed nothing. Alert 380o "nn" is the number of times that this CPU needed nothing. Alert
392 readers will note that the rcu "nn" number for a given CPU very 381 readers will note that the rcu "nn" number for a given CPU very
393 closely matches the rcu_bh "np" number for that same CPU. This 382 closely matches the rcu_bh "np" number for that same CPU. This
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 69ee188515e7..bf0f6de2aa00 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -873,7 +873,7 @@ d. Do you need to treat NMI handlers, hardirq handlers,
873 and code segments with preemption disabled (whether 873 and code segments with preemption disabled (whether
874 via preempt_disable(), local_irq_save(), local_bh_disable(), 874 via preempt_disable(), local_irq_save(), local_bh_disable(),
875 or some other mechanism) as if they were explicit RCU readers? 875 or some other mechanism) as if they were explicit RCU readers?
876 If so, you need RCU-sched. 876 If so, RCU-sched is the only choice that will work for you.
877 877
878e. Do you need RCU grace periods to complete even in the face 878e. Do you need RCU grace periods to complete even in the face
879 of softirq monopolization of one or more of the CPUs? For 879 of softirq monopolization of one or more of the CPUs? For
@@ -884,7 +884,12 @@ f. Is your workload too update-intensive for normal use of
884 RCU, but inappropriate for other synchronization mechanisms? 884 RCU, but inappropriate for other synchronization mechanisms?
885 If so, consider SLAB_DESTROY_BY_RCU. But please be careful! 885 If so, consider SLAB_DESTROY_BY_RCU. But please be careful!
886 886
887g. Otherwise, use RCU. 887g. Do you need read-side critical sections that are respected
888 even though they are in the middle of the idle loop, during
889 user-mode execution, or on an offlined CPU? If so, SRCU is the
890 only choice that will work for you.
891
892h. Otherwise, use RCU.
888 893
889Of course, this all assumes that you have determined that RCU is in fact 894Of course, this all assumes that you have determined that RCU is in fact
890the right tool for your job. 895the right tool for your job.
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index ad7e2e5088c1..55ada0471f93 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2385,6 +2385,17 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
2385 rcutree.rcu_cpu_stall_timeout= [KNL,BOOT] 2385 rcutree.rcu_cpu_stall_timeout= [KNL,BOOT]
2386 Set timeout for RCU CPU stall warning messages. 2386 Set timeout for RCU CPU stall warning messages.
2387 2387
2388 rcutree.jiffies_till_first_fqs= [KNL,BOOT]
2389 Set delay from grace-period initialization to
2390 first attempt to force quiescent states.
2391 Units are jiffies, minimum value is zero,
2392 and maximum value is HZ.
2393
2394 rcutree.jiffies_till_next_fqs= [KNL,BOOT]
2395 Set delay between subsequent attempts to force
2396 quiescent states. Units are jiffies, minimum
2397 value is one, and maximum value is HZ.
2398
2388 rcutorture.fqs_duration= [KNL,BOOT] 2399 rcutorture.fqs_duration= [KNL,BOOT]
2389 Set duration of force_quiescent_state bursts. 2400 Set duration of force_quiescent_state bursts.
2390 2401
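As a purely illustrative example of the two new knobs documented above, booting with "rcutree.jiffies_till_first_fqs=3 rcutree.jiffies_till_next_fqs=1" on the kernel command line would have RCU wait three jiffies after grace-period initialization before its first attempt to force quiescent states, and one jiffy between subsequent attempts.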
diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index d6fde98b74b3..83638aa096d5 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -28,6 +28,7 @@
28#include <linux/tty.h> 28#include <linux/tty.h>
29#include <linux/console.h> 29#include <linux/console.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/rcupdate.h>
31 32
32#include <asm/reg.h> 33#include <asm/reg.h>
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
@@ -54,9 +55,12 @@ cpu_idle(void)
54 /* FIXME -- EV6 and LCA45 know how to power down 55 /* FIXME -- EV6 and LCA45 know how to power down
55 the CPU. */ 56 the CPU. */
56 57
58 rcu_idle_enter();
57 while (!need_resched()) 59 while (!need_resched())
58 cpu_relax(); 60 cpu_relax();
59 schedule(); 61
62 rcu_idle_exit();
63 schedule_preempt_disabled();
60 } 64 }
61} 65}
62 66
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index 35ddc02bfa4a..a41ad90a97a6 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -166,6 +166,7 @@ smp_callin(void)
166 DBGS(("smp_callin: commencing CPU %d current %p active_mm %p\n", 166 DBGS(("smp_callin: commencing CPU %d current %p active_mm %p\n",
167 cpuid, current, current->active_mm)); 167 cpuid, current, current->active_mm));
168 168
169 preempt_disable();
169 /* Do nothing. */ 170 /* Do nothing. */
170 cpu_idle(); 171 cpu_idle();
171} 172}
diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c
index 66fd01728790..7f65be6f7f17 100644
--- a/arch/cris/kernel/process.c
+++ b/arch/cris/kernel/process.c
@@ -25,6 +25,7 @@
25#include <linux/elfcore.h> 25#include <linux/elfcore.h>
26#include <linux/mqueue.h> 26#include <linux/mqueue.h>
27#include <linux/reboot.h> 27#include <linux/reboot.h>
28#include <linux/rcupdate.h>
28 29
29//#define DEBUG 30//#define DEBUG
30 31
@@ -74,6 +75,7 @@ void cpu_idle (void)
74{ 75{
75 /* endless idle loop with no priority at all */ 76 /* endless idle loop with no priority at all */
76 while (1) { 77 while (1) {
78 rcu_idle_enter();
77 while (!need_resched()) { 79 while (!need_resched()) {
78 void (*idle)(void); 80 void (*idle)(void);
79 /* 81 /*
@@ -86,6 +88,7 @@ void cpu_idle (void)
86 idle = default_idle; 88 idle = default_idle;
87 idle(); 89 idle();
88 } 90 }
91 rcu_idle_exit();
89 schedule_preempt_disabled(); 92 schedule_preempt_disabled();
90 } 93 }
91} 94}
diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c
index ff95f50efea5..2eb7fa5bf9d8 100644
--- a/arch/frv/kernel/process.c
+++ b/arch/frv/kernel/process.c
@@ -25,6 +25,7 @@
25#include <linux/reboot.h> 25#include <linux/reboot.h>
26#include <linux/interrupt.h> 26#include <linux/interrupt.h>
27#include <linux/pagemap.h> 27#include <linux/pagemap.h>
28#include <linux/rcupdate.h>
28 29
29#include <asm/asm-offsets.h> 30#include <asm/asm-offsets.h>
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
@@ -69,12 +70,14 @@ void cpu_idle(void)
69{ 70{
70 /* endless idle loop with no priority at all */ 71 /* endless idle loop with no priority at all */
71 while (1) { 72 while (1) {
73 rcu_idle_enter();
72 while (!need_resched()) { 74 while (!need_resched()) {
73 check_pgt_cache(); 75 check_pgt_cache();
74 76
75 if (!frv_dma_inprogress && idle) 77 if (!frv_dma_inprogress && idle)
76 idle(); 78 idle();
77 } 79 }
80 rcu_idle_exit();
78 81
79 schedule_preempt_disabled(); 82 schedule_preempt_disabled();
80 } 83 }
diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c
index 0e9c315be104..f153ed1a4c08 100644
--- a/arch/h8300/kernel/process.c
+++ b/arch/h8300/kernel/process.c
@@ -36,6 +36,7 @@
36#include <linux/reboot.h> 36#include <linux/reboot.h>
37#include <linux/fs.h> 37#include <linux/fs.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/rcupdate.h>
39 40
40#include <asm/uaccess.h> 41#include <asm/uaccess.h>
41#include <asm/traps.h> 42#include <asm/traps.h>
@@ -78,8 +79,10 @@ void (*idle)(void) = default_idle;
78void cpu_idle(void) 79void cpu_idle(void)
79{ 80{
80 while (1) { 81 while (1) {
82 rcu_idle_enter();
81 while (!need_resched()) 83 while (!need_resched())
82 idle(); 84 idle();
85 rcu_idle_exit();
83 schedule_preempt_disabled(); 86 schedule_preempt_disabled();
84 } 87 }
85} 88}
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index dd6fc1449741..3e316ec0b835 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -29,6 +29,7 @@
29#include <linux/kdebug.h> 29#include <linux/kdebug.h>
30#include <linux/utsname.h> 30#include <linux/utsname.h>
31#include <linux/tracehook.h> 31#include <linux/tracehook.h>
32#include <linux/rcupdate.h>
32 33
33#include <asm/cpu.h> 34#include <asm/cpu.h>
34#include <asm/delay.h> 35#include <asm/delay.h>
@@ -279,6 +280,7 @@ cpu_idle (void)
279 280
280 /* endless idle loop with no priority at all */ 281 /* endless idle loop with no priority at all */
281 while (1) { 282 while (1) {
283 rcu_idle_enter();
282 if (can_do_pal_halt) { 284 if (can_do_pal_halt) {
283 current_thread_info()->status &= ~TS_POLLING; 285 current_thread_info()->status &= ~TS_POLLING;
284 /* 286 /*
@@ -309,6 +311,7 @@ cpu_idle (void)
309 normal_xtp(); 311 normal_xtp();
310#endif 312#endif
311 } 313 }
314 rcu_idle_exit();
312 schedule_preempt_disabled(); 315 schedule_preempt_disabled();
313 check_pgt_cache(); 316 check_pgt_cache();
314 if (cpu_is_offline(cpu)) 317 if (cpu_is_offline(cpu))
diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c
index 3a4a32b27208..384e63f3a4c4 100644
--- a/arch/m32r/kernel/process.c
+++ b/arch/m32r/kernel/process.c
@@ -26,6 +26,7 @@
26#include <linux/ptrace.h> 26#include <linux/ptrace.h>
27#include <linux/unistd.h> 27#include <linux/unistd.h>
28#include <linux/hardirq.h> 28#include <linux/hardirq.h>
29#include <linux/rcupdate.h>
29 30
30#include <asm/io.h> 31#include <asm/io.h>
31#include <asm/uaccess.h> 32#include <asm/uaccess.h>
@@ -82,6 +83,7 @@ void cpu_idle (void)
82{ 83{
83 /* endless idle loop with no priority at all */ 84 /* endless idle loop with no priority at all */
84 while (1) { 85 while (1) {
86 rcu_idle_enter();
85 while (!need_resched()) { 87 while (!need_resched()) {
86 void (*idle)(void) = pm_idle; 88 void (*idle)(void) = pm_idle;
87 89
@@ -90,6 +92,7 @@ void cpu_idle (void)
90 92
91 idle(); 93 idle();
92 } 94 }
95 rcu_idle_exit();
93 schedule_preempt_disabled(); 96 schedule_preempt_disabled();
94 } 97 }
95} 98}
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index c488e3cfab53..ac2892e49c7c 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -25,6 +25,7 @@
25#include <linux/reboot.h> 25#include <linux/reboot.h>
26#include <linux/init_task.h> 26#include <linux/init_task.h>
27#include <linux/mqueue.h> 27#include <linux/mqueue.h>
28#include <linux/rcupdate.h>
28 29
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
30#include <asm/traps.h> 31#include <asm/traps.h>
@@ -75,8 +76,10 @@ void cpu_idle(void)
75{ 76{
76 /* endless idle loop with no priority at all */ 77 /* endless idle loop with no priority at all */
77 while (1) { 78 while (1) {
79 rcu_idle_enter();
78 while (!need_resched()) 80 while (!need_resched())
79 idle(); 81 idle();
82 rcu_idle_exit();
80 schedule_preempt_disabled(); 83 schedule_preempt_disabled();
81 } 84 }
82} 85}
diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c
index 7dab0cd36466..e9cceba193b6 100644
--- a/arch/mn10300/kernel/process.c
+++ b/arch/mn10300/kernel/process.c
@@ -25,6 +25,7 @@
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/rcupdate.h>
28#include <asm/uaccess.h> 29#include <asm/uaccess.h>
29#include <asm/pgtable.h> 30#include <asm/pgtable.h>
30#include <asm/io.h> 31#include <asm/io.h>
@@ -107,6 +108,7 @@ void cpu_idle(void)
107{ 108{
108 /* endless idle loop with no priority at all */ 109 /* endless idle loop with no priority at all */
109 for (;;) { 110 for (;;) {
111 rcu_idle_enter();
110 while (!need_resched()) { 112 while (!need_resched()) {
111 void (*idle)(void); 113 void (*idle)(void);
112 114
@@ -121,6 +123,7 @@ void cpu_idle(void)
121 } 123 }
122 idle(); 124 idle();
123 } 125 }
126 rcu_idle_exit();
124 127
125 schedule_preempt_disabled(); 128 schedule_preempt_disabled();
126 } 129 }
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index 2c05a9292a81..8c6b6b6561f0 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -48,6 +48,7 @@
48#include <linux/unistd.h> 48#include <linux/unistd.h>
49#include <linux/kallsyms.h> 49#include <linux/kallsyms.h>
50#include <linux/uaccess.h> 50#include <linux/uaccess.h>
51#include <linux/rcupdate.h>
51 52
52#include <asm/io.h> 53#include <asm/io.h>
53#include <asm/asm-offsets.h> 54#include <asm/asm-offsets.h>
@@ -69,8 +70,10 @@ void cpu_idle(void)
69 70
70 /* endless idle loop with no priority at all */ 71 /* endless idle loop with no priority at all */
71 while (1) { 72 while (1) {
73 rcu_idle_enter();
72 while (!need_resched()) 74 while (!need_resched())
73 barrier(); 75 barrier();
76 rcu_idle_exit();
74 schedule_preempt_disabled(); 77 schedule_preempt_disabled();
75 check_pgt_cache(); 78 check_pgt_cache();
76 } 79 }
diff --git a/arch/score/kernel/process.c b/arch/score/kernel/process.c
index 2707023c7563..637970cfd3f4 100644
--- a/arch/score/kernel/process.c
+++ b/arch/score/kernel/process.c
@@ -27,6 +27,7 @@
27#include <linux/reboot.h> 27#include <linux/reboot.h>
28#include <linux/elfcore.h> 28#include <linux/elfcore.h>
29#include <linux/pm.h> 29#include <linux/pm.h>
30#include <linux/rcupdate.h>
30 31
31void (*pm_power_off)(void); 32void (*pm_power_off)(void);
32EXPORT_SYMBOL(pm_power_off); 33EXPORT_SYMBOL(pm_power_off);
@@ -50,9 +51,10 @@ void __noreturn cpu_idle(void)
50{ 51{
51 /* endless idle loop with no priority at all */ 52 /* endless idle loop with no priority at all */
52 while (1) { 53 while (1) {
54 rcu_idle_enter();
53 while (!need_resched()) 55 while (!need_resched())
54 barrier(); 56 barrier();
55 57 rcu_idle_exit();
56 schedule_preempt_disabled(); 58 schedule_preempt_disabled();
57 } 59 }
58} 60}
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 39472dd2323f..60c78917190c 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -199,12 +199,14 @@ static int __init cpuid_init(void)
199 goto out_chrdev; 199 goto out_chrdev;
200 } 200 }
201 cpuid_class->devnode = cpuid_devnode; 201 cpuid_class->devnode = cpuid_devnode;
202 get_online_cpus();
202 for_each_online_cpu(i) { 203 for_each_online_cpu(i) {
203 err = cpuid_device_create(i); 204 err = cpuid_device_create(i);
204 if (err != 0) 205 if (err != 0)
205 goto out_class; 206 goto out_class;
206 } 207 }
207 register_hotcpu_notifier(&cpuid_class_cpu_notifier); 208 register_hotcpu_notifier(&cpuid_class_cpu_notifier);
209 put_online_cpus();
208 210
209 err = 0; 211 err = 0;
210 goto out; 212 goto out;
@@ -214,6 +216,7 @@ out_class:
214 for_each_online_cpu(i) { 216 for_each_online_cpu(i) {
215 cpuid_device_destroy(i); 217 cpuid_device_destroy(i);
216 } 218 }
219 put_online_cpus();
217 class_destroy(cpuid_class); 220 class_destroy(cpuid_class);
218out_chrdev: 221out_chrdev:
219 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); 222 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
@@ -225,11 +228,13 @@ static void __exit cpuid_exit(void)
225{ 228{
226 int cpu = 0; 229 int cpu = 0;
227 230
231 get_online_cpus();
228 for_each_online_cpu(cpu) 232 for_each_online_cpu(cpu)
229 cpuid_device_destroy(cpu); 233 cpuid_device_destroy(cpu);
230 class_destroy(cpuid_class); 234 class_destroy(cpuid_class);
231 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); 235 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
232 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); 236 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
237 put_online_cpus();
233} 238}
234 239
235module_init(cpuid_init); 240module_init(cpuid_init);
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index eb113693f043..a7c5661f8496 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -257,12 +257,14 @@ static int __init msr_init(void)
257 goto out_chrdev; 257 goto out_chrdev;
258 } 258 }
259 msr_class->devnode = msr_devnode; 259 msr_class->devnode = msr_devnode;
260 get_online_cpus();
260 for_each_online_cpu(i) { 261 for_each_online_cpu(i) {
261 err = msr_device_create(i); 262 err = msr_device_create(i);
262 if (err != 0) 263 if (err != 0)
263 goto out_class; 264 goto out_class;
264 } 265 }
265 register_hotcpu_notifier(&msr_class_cpu_notifier); 266 register_hotcpu_notifier(&msr_class_cpu_notifier);
267 put_online_cpus();
266 268
267 err = 0; 269 err = 0;
268 goto out; 270 goto out;
@@ -271,6 +273,7 @@ out_class:
271 i = 0; 273 i = 0;
272 for_each_online_cpu(i) 274 for_each_online_cpu(i)
273 msr_device_destroy(i); 275 msr_device_destroy(i);
276 put_online_cpus();
274 class_destroy(msr_class); 277 class_destroy(msr_class);
275out_chrdev: 278out_chrdev:
276 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); 279 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
@@ -281,11 +284,13 @@ out:
281static void __exit msr_exit(void) 284static void __exit msr_exit(void)
282{ 285{
283 int cpu = 0; 286 int cpu = 0;
287 get_online_cpus();
284 for_each_online_cpu(cpu) 288 for_each_online_cpu(cpu)
285 msr_device_destroy(cpu); 289 msr_device_destroy(cpu);
286 class_destroy(msr_class); 290 class_destroy(msr_class);
287 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); 291 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
288 unregister_hotcpu_notifier(&msr_class_cpu_notifier); 292 unregister_hotcpu_notifier(&msr_class_cpu_notifier);
293 put_online_cpus();
289} 294}
290 295
291module_init(msr_init); 296module_init(msr_init);
diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c
index 2c8d6a3d250a..bc44311aa18c 100644
--- a/arch/xtensa/kernel/process.c
+++ b/arch/xtensa/kernel/process.c
@@ -31,6 +31,7 @@
31#include <linux/mqueue.h> 31#include <linux/mqueue.h>
32#include <linux/fs.h> 32#include <linux/fs.h>
33#include <linux/slab.h> 33#include <linux/slab.h>
34#include <linux/rcupdate.h>
34 35
35#include <asm/pgtable.h> 36#include <asm/pgtable.h>
36#include <asm/uaccess.h> 37#include <asm/uaccess.h>
@@ -110,8 +111,10 @@ void cpu_idle(void)
110 111
111 /* endless idle loop with no priority at all */ 112 /* endless idle loop with no priority at all */
112 while (1) { 113 while (1) {
114 rcu_idle_enter();
113 while (!need_resched()) 115 while (!need_resched())
114 platform_idle(); 116 platform_idle();
117 rcu_idle_exit();
115 schedule_preempt_disabled(); 118 schedule_preempt_disabled();
116 } 119 }
117} 120}
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c
index 53589000fd07..8615d7cf7e01 100644
--- a/drivers/infiniband/hw/ehca/ehca_irq.c
+++ b/drivers/infiniband/hw/ehca/ehca_irq.c
@@ -42,6 +42,7 @@
42 */ 42 */
43 43
44#include <linux/slab.h> 44#include <linux/slab.h>
45#include <linux/smpboot.h>
45 46
46#include "ehca_classes.h" 47#include "ehca_classes.h"
47#include "ehca_irq.h" 48#include "ehca_irq.h"
@@ -652,7 +653,7 @@ void ehca_tasklet_eq(unsigned long data)
652 ehca_process_eq((struct ehca_shca*)data, 1); 653 ehca_process_eq((struct ehca_shca*)data, 1);
653} 654}
654 655
655static inline int find_next_online_cpu(struct ehca_comp_pool *pool) 656static int find_next_online_cpu(struct ehca_comp_pool *pool)
656{ 657{
657 int cpu; 658 int cpu;
658 unsigned long flags; 659 unsigned long flags;
@@ -662,17 +663,20 @@ static inline int find_next_online_cpu(struct ehca_comp_pool *pool)
662 ehca_dmp(cpu_online_mask, cpumask_size(), ""); 663 ehca_dmp(cpu_online_mask, cpumask_size(), "");
663 664
664 spin_lock_irqsave(&pool->last_cpu_lock, flags); 665 spin_lock_irqsave(&pool->last_cpu_lock, flags);
665 cpu = cpumask_next(pool->last_cpu, cpu_online_mask); 666 do {
666 if (cpu >= nr_cpu_ids) 667 cpu = cpumask_next(pool->last_cpu, cpu_online_mask);
667 cpu = cpumask_first(cpu_online_mask); 668 if (cpu >= nr_cpu_ids)
668 pool->last_cpu = cpu; 669 cpu = cpumask_first(cpu_online_mask);
670 pool->last_cpu = cpu;
671 } while (!per_cpu_ptr(pool->cpu_comp_tasks, cpu)->active);
669 spin_unlock_irqrestore(&pool->last_cpu_lock, flags); 672 spin_unlock_irqrestore(&pool->last_cpu_lock, flags);
670 673
671 return cpu; 674 return cpu;
672} 675}
673 676
674static void __queue_comp_task(struct ehca_cq *__cq, 677static void __queue_comp_task(struct ehca_cq *__cq,
675 struct ehca_cpu_comp_task *cct) 678 struct ehca_cpu_comp_task *cct,
679 struct task_struct *thread)
676{ 680{
677 unsigned long flags; 681 unsigned long flags;
678 682
@@ -683,7 +687,7 @@ static void __queue_comp_task(struct ehca_cq *__cq,
683 __cq->nr_callbacks++; 687 __cq->nr_callbacks++;
684 list_add_tail(&__cq->entry, &cct->cq_list); 688 list_add_tail(&__cq->entry, &cct->cq_list);
685 cct->cq_jobs++; 689 cct->cq_jobs++;
686 wake_up(&cct->wait_queue); 690 wake_up_process(thread);
687 } else 691 } else
688 __cq->nr_callbacks++; 692 __cq->nr_callbacks++;
689 693
@@ -695,6 +699,7 @@ static void queue_comp_task(struct ehca_cq *__cq)
695{ 699{
696 int cpu_id; 700 int cpu_id;
697 struct ehca_cpu_comp_task *cct; 701 struct ehca_cpu_comp_task *cct;
702 struct task_struct *thread;
698 int cq_jobs; 703 int cq_jobs;
699 unsigned long flags; 704 unsigned long flags;
700 705
@@ -702,7 +707,8 @@ static void queue_comp_task(struct ehca_cq *__cq)
702 BUG_ON(!cpu_online(cpu_id)); 707 BUG_ON(!cpu_online(cpu_id));
703 708
704 cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id); 709 cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
705 BUG_ON(!cct); 710 thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
711 BUG_ON(!cct || !thread);
706 712
707 spin_lock_irqsave(&cct->task_lock, flags); 713 spin_lock_irqsave(&cct->task_lock, flags);
708 cq_jobs = cct->cq_jobs; 714 cq_jobs = cct->cq_jobs;
@@ -710,28 +716,25 @@ static void queue_comp_task(struct ehca_cq *__cq)
710 if (cq_jobs > 0) { 716 if (cq_jobs > 0) {
711 cpu_id = find_next_online_cpu(pool); 717 cpu_id = find_next_online_cpu(pool);
712 cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id); 718 cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
713 BUG_ON(!cct); 719 thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
720 BUG_ON(!cct || !thread);
714 } 721 }
715 722 __queue_comp_task(__cq, cct, thread);
716 __queue_comp_task(__cq, cct);
717} 723}
718 724
719static void run_comp_task(struct ehca_cpu_comp_task *cct) 725static void run_comp_task(struct ehca_cpu_comp_task *cct)
720{ 726{
721 struct ehca_cq *cq; 727 struct ehca_cq *cq;
722 unsigned long flags;
723
724 spin_lock_irqsave(&cct->task_lock, flags);
725 728
726 while (!list_empty(&cct->cq_list)) { 729 while (!list_empty(&cct->cq_list)) {
727 cq = list_entry(cct->cq_list.next, struct ehca_cq, entry); 730 cq = list_entry(cct->cq_list.next, struct ehca_cq, entry);
728 spin_unlock_irqrestore(&cct->task_lock, flags); 731 spin_unlock_irq(&cct->task_lock);
729 732
730 comp_event_callback(cq); 733 comp_event_callback(cq);
731 if (atomic_dec_and_test(&cq->nr_events)) 734 if (atomic_dec_and_test(&cq->nr_events))
732 wake_up(&cq->wait_completion); 735 wake_up(&cq->wait_completion);
733 736
734 spin_lock_irqsave(&cct->task_lock, flags); 737 spin_lock_irq(&cct->task_lock);
735 spin_lock(&cq->task_lock); 738 spin_lock(&cq->task_lock);
736 cq->nr_callbacks--; 739 cq->nr_callbacks--;
737 if (!cq->nr_callbacks) { 740 if (!cq->nr_callbacks) {
@@ -740,159 +743,76 @@ static void run_comp_task(struct ehca_cpu_comp_task *cct)
740 } 743 }
741 spin_unlock(&cq->task_lock); 744 spin_unlock(&cq->task_lock);
742 } 745 }
743
744 spin_unlock_irqrestore(&cct->task_lock, flags);
745} 746}
746 747
747static int comp_task(void *__cct) 748static void comp_task_park(unsigned int cpu)
748{ 749{
749 struct ehca_cpu_comp_task *cct = __cct; 750 struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
750 int cql_empty; 751 struct ehca_cpu_comp_task *target;
751 DECLARE_WAITQUEUE(wait, current); 752 struct task_struct *thread;
752 753 struct ehca_cq *cq, *tmp;
753 set_current_state(TASK_INTERRUPTIBLE); 754 LIST_HEAD(list);
754 while (!kthread_should_stop()) {
755 add_wait_queue(&cct->wait_queue, &wait);
756
757 spin_lock_irq(&cct->task_lock);
758 cql_empty = list_empty(&cct->cq_list);
759 spin_unlock_irq(&cct->task_lock);
760 if (cql_empty)
761 schedule();
762 else
763 __set_current_state(TASK_RUNNING);
764
765 remove_wait_queue(&cct->wait_queue, &wait);
766 755
767 spin_lock_irq(&cct->task_lock); 756 spin_lock_irq(&cct->task_lock);
768 cql_empty = list_empty(&cct->cq_list); 757 cct->cq_jobs = 0;
769 spin_unlock_irq(&cct->task_lock); 758 cct->active = 0;
770 if (!cql_empty) 759 list_splice_init(&cct->cq_list, &list);
771 run_comp_task(__cct); 760 spin_unlock_irq(&cct->task_lock);
772 761
773 set_current_state(TASK_INTERRUPTIBLE); 762 cpu = find_next_online_cpu(pool);
763 target = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
764 thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu);
765 spin_lock_irq(&target->task_lock);
766 list_for_each_entry_safe(cq, tmp, &list, entry) {
767 list_del(&cq->entry);
768 __queue_comp_task(cq, target, thread);
774 } 769 }
775 __set_current_state(TASK_RUNNING); 770 spin_unlock_irq(&target->task_lock);
776
777 return 0;
778}
779
780static struct task_struct *create_comp_task(struct ehca_comp_pool *pool,
781 int cpu)
782{
783 struct ehca_cpu_comp_task *cct;
784
785 cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
786 spin_lock_init(&cct->task_lock);
787 INIT_LIST_HEAD(&cct->cq_list);
788 init_waitqueue_head(&cct->wait_queue);
789 cct->task = kthread_create_on_node(comp_task, cct, cpu_to_node(cpu),
790 "ehca_comp/%d", cpu);
791
792 return cct->task;
793} 771}
794 772
795static void destroy_comp_task(struct ehca_comp_pool *pool, 773static void comp_task_stop(unsigned int cpu, bool online)
796 int cpu)
797{ 774{
798 struct ehca_cpu_comp_task *cct; 775 struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
799 struct task_struct *task;
800 unsigned long flags_cct;
801
802 cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
803
804 spin_lock_irqsave(&cct->task_lock, flags_cct);
805 776
806 task = cct->task; 777 spin_lock_irq(&cct->task_lock);
807 cct->task = NULL;
808 cct->cq_jobs = 0; 778 cct->cq_jobs = 0;
809 779 cct->active = 0;
810 spin_unlock_irqrestore(&cct->task_lock, flags_cct); 780 WARN_ON(!list_empty(&cct->cq_list));
811 781 spin_unlock_irq(&cct->task_lock);
812 if (task)
813 kthread_stop(task);
814} 782}
815 783
816static void __cpuinit take_over_work(struct ehca_comp_pool *pool, int cpu) 784static int comp_task_should_run(unsigned int cpu)
817{ 785{
818 struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu); 786 struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
819 LIST_HEAD(list);
820 struct ehca_cq *cq;
821 unsigned long flags_cct;
822
823 spin_lock_irqsave(&cct->task_lock, flags_cct);
824
825 list_splice_init(&cct->cq_list, &list);
826
827 while (!list_empty(&list)) {
828 cq = list_entry(cct->cq_list.next, struct ehca_cq, entry);
829
830 list_del(&cq->entry);
831 __queue_comp_task(cq, this_cpu_ptr(pool->cpu_comp_tasks));
832 }
833
834 spin_unlock_irqrestore(&cct->task_lock, flags_cct);
835 787
788 return cct->cq_jobs;
836} 789}
837 790
838static int __cpuinit comp_pool_callback(struct notifier_block *nfb, 791static void comp_task(unsigned int cpu)
839 unsigned long action,
840 void *hcpu)
841{ 792{
842 unsigned int cpu = (unsigned long)hcpu; 793 struct ehca_cpu_comp_task *cct = this_cpu_ptr(pool->cpu_comp_tasks);
843 struct ehca_cpu_comp_task *cct; 794 int cql_empty;
844 795
845 switch (action) { 796 spin_lock_irq(&cct->task_lock);
846 case CPU_UP_PREPARE: 797 cql_empty = list_empty(&cct->cq_list);
847 case CPU_UP_PREPARE_FROZEN: 798 if (!cql_empty) {
848 ehca_gen_dbg("CPU: %x (CPU_PREPARE)", cpu); 799 __set_current_state(TASK_RUNNING);
849 if (!create_comp_task(pool, cpu)) { 800 run_comp_task(cct);
850 ehca_gen_err("Can't create comp_task for cpu: %x", cpu);
851 return notifier_from_errno(-ENOMEM);
852 }
853 break;
854 case CPU_UP_CANCELED:
855 case CPU_UP_CANCELED_FROZEN:
856 ehca_gen_dbg("CPU: %x (CPU_CANCELED)", cpu);
857 cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
858 kthread_bind(cct->task, cpumask_any(cpu_online_mask));
859 destroy_comp_task(pool, cpu);
860 break;
861 case CPU_ONLINE:
862 case CPU_ONLINE_FROZEN:
863 ehca_gen_dbg("CPU: %x (CPU_ONLINE)", cpu);
864 cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
865 kthread_bind(cct->task, cpu);
866 wake_up_process(cct->task);
867 break;
868 case CPU_DOWN_PREPARE:
869 case CPU_DOWN_PREPARE_FROZEN:
870 ehca_gen_dbg("CPU: %x (CPU_DOWN_PREPARE)", cpu);
871 break;
872 case CPU_DOWN_FAILED:
873 case CPU_DOWN_FAILED_FROZEN:
874 ehca_gen_dbg("CPU: %x (CPU_DOWN_FAILED)", cpu);
875 break;
876 case CPU_DEAD:
877 case CPU_DEAD_FROZEN:
878 ehca_gen_dbg("CPU: %x (CPU_DEAD)", cpu);
879 destroy_comp_task(pool, cpu);
880 take_over_work(pool, cpu);
881 break;
882 } 801 }
883 802 spin_unlock_irq(&cct->task_lock);
884 return NOTIFY_OK;
885} 803}
886 804
887static struct notifier_block comp_pool_callback_nb __cpuinitdata = { 805static struct smp_hotplug_thread comp_pool_threads = {
888 .notifier_call = comp_pool_callback, 806 .thread_should_run = comp_task_should_run,
889 .priority = 0, 807 .thread_fn = comp_task,
808 .thread_comm = "ehca_comp/%u",
809 .cleanup = comp_task_stop,
810 .park = comp_task_park,
890}; 811};
891 812
892int ehca_create_comp_pool(void) 813int ehca_create_comp_pool(void)
893{ 814{
894 int cpu; 815 int cpu, ret = -ENOMEM;
895 struct task_struct *task;
896 816
897 if (!ehca_scaling_code) 817 if (!ehca_scaling_code)
898 return 0; 818 return 0;
@@ -905,38 +825,46 @@ int ehca_create_comp_pool(void)
905 pool->last_cpu = cpumask_any(cpu_online_mask); 825 pool->last_cpu = cpumask_any(cpu_online_mask);
906 826
907 pool->cpu_comp_tasks = alloc_percpu(struct ehca_cpu_comp_task); 827 pool->cpu_comp_tasks = alloc_percpu(struct ehca_cpu_comp_task);
908 if (pool->cpu_comp_tasks == NULL) { 828 if (!pool->cpu_comp_tasks)
909 kfree(pool); 829 goto out_pool;
910 return -EINVAL;
911 }
912 830
913 for_each_online_cpu(cpu) { 831 pool->cpu_comp_threads = alloc_percpu(struct task_struct *);
914 task = create_comp_task(pool, cpu); 832 if (!pool->cpu_comp_threads)
915 if (task) { 833 goto out_tasks;
916 kthread_bind(task, cpu); 834
917 wake_up_process(task); 835 for_each_present_cpu(cpu) {
918 } 836 struct ehca_cpu_comp_task *cct;
837
838 cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
839 spin_lock_init(&cct->task_lock);
840 INIT_LIST_HEAD(&cct->cq_list);
919 } 841 }
920 842
921 register_hotcpu_notifier(&comp_pool_callback_nb); 843 comp_pool_threads.store = pool->cpu_comp_threads;
844 ret = smpboot_register_percpu_thread(&comp_pool_threads);
845 if (ret)
846 goto out_threads;
922 847
923 printk(KERN_INFO "eHCA scaling code enabled\n"); 848 pr_info("eHCA scaling code enabled\n");
849 return ret;
924 850
925 return 0; 851out_threads:
852 free_percpu(pool->cpu_comp_threads);
853out_tasks:
854 free_percpu(pool->cpu_comp_tasks);
855out_pool:
856 kfree(pool);
857 return ret;
926} 858}
927 859
928void ehca_destroy_comp_pool(void) 860void ehca_destroy_comp_pool(void)
929{ 861{
930 int i;
931
932 if (!ehca_scaling_code) 862 if (!ehca_scaling_code)
933 return; 863 return;
934 864
935 unregister_hotcpu_notifier(&comp_pool_callback_nb); 865 smpboot_unregister_percpu_thread(&comp_pool_threads);
936
937 for_each_online_cpu(i)
938 destroy_comp_task(pool, i);
939 866
867 free_percpu(pool->cpu_comp_threads);
940 free_percpu(pool->cpu_comp_tasks); 868 free_percpu(pool->cpu_comp_tasks);
941 kfree(pool); 869 kfree(pool);
942} 870}
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.h b/drivers/infiniband/hw/ehca/ehca_irq.h
index 3346cb06cea6..5370199f08c7 100644
--- a/drivers/infiniband/hw/ehca/ehca_irq.h
+++ b/drivers/infiniband/hw/ehca/ehca_irq.h
@@ -58,15 +58,15 @@ void ehca_tasklet_eq(unsigned long data);
58void ehca_process_eq(struct ehca_shca *shca, int is_irq); 58void ehca_process_eq(struct ehca_shca *shca, int is_irq);
59 59
60struct ehca_cpu_comp_task { 60struct ehca_cpu_comp_task {
61 wait_queue_head_t wait_queue;
62 struct list_head cq_list; 61 struct list_head cq_list;
63 struct task_struct *task;
64 spinlock_t task_lock; 62 spinlock_t task_lock;
65 int cq_jobs; 63 int cq_jobs;
64 int active;
66}; 65};
67 66
68struct ehca_comp_pool { 67struct ehca_comp_pool {
69 struct ehca_cpu_comp_task *cpu_comp_tasks; 68 struct ehca_cpu_comp_task __percpu *cpu_comp_tasks;
69 struct task_struct * __percpu *cpu_comp_threads;
70 int last_cpu; 70 int last_cpu;
71 spinlock_t last_cpu_lock; 71 spinlock_t last_cpu_lock;
72}; 72};
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index c5f856a040b9..5e4e6170f43a 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -430,6 +430,8 @@ enum
430 NR_SOFTIRQS 430 NR_SOFTIRQS
431}; 431};
432 432
433#define SOFTIRQ_STOP_IDLE_MASK (~(1 << RCU_SOFTIRQ))
434
433/* map softirq index to softirq name. update 'softirq_to_name' in 435/* map softirq index to softirq name. update 'softirq_to_name' in
434 * kernel/softirq.c when adding a new softirq. 436 * kernel/softirq.c when adding a new softirq.
435 */ 437 */
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 22ccf9dee177..8d816646f766 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -14,6 +14,11 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
14 kthread_create_on_node(threadfn, data, -1, namefmt, ##arg) 14 kthread_create_on_node(threadfn, data, -1, namefmt, ##arg)
15 15
16 16
17struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
18 void *data,
19 unsigned int cpu,
20 const char *namefmt);
21
17/** 22/**
18 * kthread_run - create and wake a thread. 23 * kthread_run - create and wake a thread.
19 * @threadfn: the function to run until signal_pending(current). 24 * @threadfn: the function to run until signal_pending(current).
@@ -34,9 +39,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
34 39
35void kthread_bind(struct task_struct *k, unsigned int cpu); 40void kthread_bind(struct task_struct *k, unsigned int cpu);
36int kthread_stop(struct task_struct *k); 41int kthread_stop(struct task_struct *k);
37int kthread_should_stop(void); 42bool kthread_should_stop(void);
43bool kthread_should_park(void);
38bool kthread_freezable_should_stop(bool *was_frozen); 44bool kthread_freezable_should_stop(bool *was_frozen);
39void *kthread_data(struct task_struct *k); 45void *kthread_data(struct task_struct *k);
46int kthread_park(struct task_struct *k);
47void kthread_unpark(struct task_struct *k);
48void kthread_parkme(void);
40 49
41int kthreadd(void *unused); 50int kthreadd(void *unused);
42extern struct task_struct *kthreadd_task; 51extern struct task_struct *kthreadd_task;
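The declarations above add a park/unpark life cycle alongside the existing stop machinery. A minimal sketch of how a thread function and its creator might use it (my_thread_fn, my_start and the HZ-based sleep are illustrative placeholders, not part of this patch):

	#include <linux/kthread.h>
	#include <linux/err.h>

	static int my_thread_fn(void *data)
	{
		while (!kthread_should_stop()) {
			if (kthread_should_park()) {
				/* Sleep here until kthread_unpark() is called. */
				kthread_parkme();
				continue;
			}
			/* ... do one round of per-CPU work ... */
			schedule_timeout_interruptible(HZ);
		}
		return 0;
	}

	/* Creation side: the thread returned by kthread_create_on_cpu() is
	 * already parked; kthread_unpark() binds it to @cpu and lets it run. */
	static struct task_struct *my_start(unsigned int cpu)
	{
		struct task_struct *t;

		t = kthread_create_on_cpu(my_thread_fn, NULL, cpu, "my_thread/%u");
		if (!IS_ERR(t))
			kthread_unpark(t);
		return t;
	}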
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 115ead2b5155..0fbbd52e01f9 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -210,14 +210,12 @@ extern void exit_rcu(void);
210 * to nest RCU_NONIDLE() wrappers, but the nesting level is currently 210 * to nest RCU_NONIDLE() wrappers, but the nesting level is currently
211 * quite limited. If deeper nesting is required, it will be necessary 211 * quite limited. If deeper nesting is required, it will be necessary
212 * to adjust DYNTICK_TASK_NESTING_VALUE accordingly. 212 * to adjust DYNTICK_TASK_NESTING_VALUE accordingly.
213 *
214 * This macro may be used from process-level code only.
215 */ 213 */
216#define RCU_NONIDLE(a) \ 214#define RCU_NONIDLE(a) \
217 do { \ 215 do { \
218 rcu_idle_exit(); \ 216 rcu_irq_enter(); \
219 do { a; } while (0); \ 217 do { a; } while (0); \
220 rcu_idle_enter(); \ 218 rcu_irq_exit(); \
221 } while (0) 219 } while (0)
222 220
223/* 221/*
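With RCU_NONIDLE() now built on rcu_irq_enter()/rcu_irq_exit(), the old "process-level code only" caveat in its comment is dropped. A hypothetical use from an architecture's idle path (trace_my_idle_event() is an illustrative placeholder):

	/* Make RCU pay attention for the duration of the statement, even
	 * though the CPU is otherwise idle from RCU's point of view. */
	RCU_NONIDLE(trace_my_idle_event());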
diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
new file mode 100644
index 000000000000..e0106d8581d3
--- /dev/null
+++ b/include/linux/smpboot.h
@@ -0,0 +1,43 @@
1#ifndef _LINUX_SMPBOOT_H
2#define _LINUX_SMPBOOT_H
3
4#include <linux/types.h>
5
6struct task_struct;
7/* Cookie handed to the thread_fn*/
8struct smpboot_thread_data;
9
10/**
11 * struct smp_hotplug_thread - CPU hotplug related thread descriptor
12 * @store: Pointer to per cpu storage for the task pointers
13 * @list: List head for core management
14 * @thread_should_run: Check whether the thread should run or not. Called with
15 * preemption disabled.
16 * @thread_fn: The associated thread function
17 * @setup: Optional setup function, called when the thread gets
18 * operational the first time
19 * @cleanup: Optional cleanup function, called when the thread
20 * should stop (module exit)
21 * @park: Optional park function, called when the thread is
22 * parked (cpu offline)
23 * @unpark: Optional unpark function, called when the thread is
24 * unparked (cpu online)
25 * @thread_comm: The base name of the thread
26 */
27struct smp_hotplug_thread {
28 struct task_struct __percpu **store;
29 struct list_head list;
30 int (*thread_should_run)(unsigned int cpu);
31 void (*thread_fn)(unsigned int cpu);
32 void (*setup)(unsigned int cpu);
33 void (*cleanup)(unsigned int cpu, bool online);
34 void (*park)(unsigned int cpu);
35 void (*unpark)(unsigned int cpu);
36 const char *thread_comm;
37};
38
39int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread);
40void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
41int smpboot_thread_schedule(void);
42
43#endif
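This interface replaces hand-rolled CPU-hotplug notifiers for per-CPU kernel threads, as the ehca conversion earlier in this merge illustrates. A minimal registration sketch (every name prefixed my_ is an illustrative placeholder, not part of this patch):

	#include <linux/smpboot.h>
	#include <linux/percpu.h>

	static DEFINE_PER_CPU(struct task_struct *, my_threads);

	/* Called with preemption disabled; nonzero means "run my_work_fn()". */
	static int my_should_run(unsigned int cpu)
	{
		return 0;	/* e.g. test a per-CPU "work pending" flag here */
	}

	/* Runs in this CPU's thread whenever my_should_run() said yes. */
	static void my_work_fn(unsigned int cpu)
	{
		/* ... drain this CPU's pending work ... */
	}

	static struct smp_hotplug_thread my_hotplug_thread = {
		.store			= &my_threads,
		.thread_should_run	= my_should_run,
		.thread_fn		= my_work_fn,
		.thread_comm		= "my_thread/%u",
	};

	static int __init my_init(void)
	{
		/* Creates one bound thread per CPU; the core parks and
		 * unparks them automatically across CPU hotplug. */
		return smpboot_register_percpu_thread(&my_hotplug_thread);
	}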
diff --git a/kernel/Makefile b/kernel/Makefile
index c0cc67ad764c..e5602d32acb3 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o cred.o \ 12 notifier.o ksysfs.o cred.o \
13 async.o range.o groups.o lglock.o 13 async.o range.o groups.o lglock.o smpboot.o
14 14
15ifdef CONFIG_FUNCTION_TRACER 15ifdef CONFIG_FUNCTION_TRACER
16# Do not trace debug files and internal ftrace files 16# Do not trace debug files and internal ftrace files
@@ -46,7 +46,6 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
46obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o 46obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
47obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 47obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
48obj-$(CONFIG_SMP) += smp.o 48obj-$(CONFIG_SMP) += smp.o
49obj-$(CONFIG_SMP) += smpboot.o
50ifneq ($(CONFIG_SMP),y) 49ifneq ($(CONFIG_SMP),y)
51obj-y += up.o 50obj-y += up.o
52endif 51endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 14d32588cccd..e615dfbcf794 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -280,12 +280,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
280 __func__, cpu); 280 __func__, cpu);
281 goto out_release; 281 goto out_release;
282 } 282 }
283 smpboot_park_threads(cpu);
283 284
284 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 285 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
285 if (err) { 286 if (err) {
286 /* CPU didn't die: tell everyone. Can't complain. */ 287 /* CPU didn't die: tell everyone. Can't complain. */
288 smpboot_unpark_threads(cpu);
287 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); 289 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
288
289 goto out_release; 290 goto out_release;
290 } 291 }
291 BUG_ON(cpu_online(cpu)); 292 BUG_ON(cpu_online(cpu));
@@ -354,6 +355,10 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
354 goto out; 355 goto out;
355 } 356 }
356 357
358 ret = smpboot_create_threads(cpu);
359 if (ret)
360 goto out;
361
357 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); 362 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
358 if (ret) { 363 if (ret) {
359 nr_calls--; 364 nr_calls--;
@@ -368,6 +373,9 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
368 goto out_notify; 373 goto out_notify;
369 BUG_ON(!cpu_online(cpu)); 374 BUG_ON(!cpu_online(cpu));
370 375
376 /* Wake the per cpu threads */
377 smpboot_unpark_threads(cpu);
378
371 /* Now call notifier in preparation. */ 379 /* Now call notifier in preparation. */
372 cpu_notify(CPU_ONLINE | mod, hcpu); 380 cpu_notify(CPU_ONLINE | mod, hcpu);
373 381
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b579af57ea10..146a6fa96825 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -37,11 +37,20 @@ struct kthread_create_info
37}; 37};
38 38
39struct kthread { 39struct kthread {
40 int should_stop; 40 unsigned long flags;
41 unsigned int cpu;
41 void *data; 42 void *data;
43 struct completion parked;
42 struct completion exited; 44 struct completion exited;
43}; 45};
44 46
47enum KTHREAD_BITS {
48 KTHREAD_IS_PER_CPU = 0,
49 KTHREAD_SHOULD_STOP,
50 KTHREAD_SHOULD_PARK,
51 KTHREAD_IS_PARKED,
52};
53
45#define to_kthread(tsk) \ 54#define to_kthread(tsk) \
46 container_of((tsk)->vfork_done, struct kthread, exited) 55 container_of((tsk)->vfork_done, struct kthread, exited)
47 56
@@ -52,13 +61,29 @@ struct kthread {
52 * and this will return true. You should then return, and your return 61 * and this will return true. You should then return, and your return
53 * value will be passed through to kthread_stop(). 62 * value will be passed through to kthread_stop().
54 */ 63 */
55int kthread_should_stop(void) 64bool kthread_should_stop(void)
56{ 65{
57 return to_kthread(current)->should_stop; 66 return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags);
58} 67}
59EXPORT_SYMBOL(kthread_should_stop); 68EXPORT_SYMBOL(kthread_should_stop);
60 69
61/** 70/**
71 * kthread_should_park - should this kthread park now?
72 *
73 * When someone calls kthread_park() on your kthread, it will be woken
74 * and this will return true. You should then do the necessary
75 * cleanup and call kthread_parkme()
76 *
77 * Similar to kthread_should_stop(), but this keeps the thread alive
78 * and in a park position. kthread_unpark() "restarts" the thread and
79 * calls the thread function again.
80 */
81bool kthread_should_park(void)
82{
83 return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags);
84}
85
86/**
62 * kthread_freezable_should_stop - should this freezable kthread return now? 87 * kthread_freezable_should_stop - should this freezable kthread return now?
63 * @was_frozen: optional out parameter, indicates whether %current was frozen 88 * @was_frozen: optional out parameter, indicates whether %current was frozen
64 * 89 *
@@ -96,6 +121,24 @@ void *kthread_data(struct task_struct *task)
96 return to_kthread(task)->data; 121 return to_kthread(task)->data;
97} 122}
98 123
124static void __kthread_parkme(struct kthread *self)
125{
126 __set_current_state(TASK_INTERRUPTIBLE);
127 while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
128 if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
129 complete(&self->parked);
130 schedule();
131 __set_current_state(TASK_INTERRUPTIBLE);
132 }
133 clear_bit(KTHREAD_IS_PARKED, &self->flags);
134 __set_current_state(TASK_RUNNING);
135}
136
137void kthread_parkme(void)
138{
139 __kthread_parkme(to_kthread(current));
140}
141
99static int kthread(void *_create) 142static int kthread(void *_create)
100{ 143{
101 /* Copy data: it's on kthread's stack */ 144 /* Copy data: it's on kthread's stack */
@@ -105,9 +148,10 @@ static int kthread(void *_create)
105 struct kthread self; 148 struct kthread self;
106 int ret; 149 int ret;
107 150
108 self.should_stop = 0; 151 self.flags = 0;
109 self.data = data; 152 self.data = data;
110 init_completion(&self.exited); 153 init_completion(&self.exited);
154 init_completion(&self.parked);
111 current->vfork_done = &self.exited; 155 current->vfork_done = &self.exited;
112 156
113 /* OK, tell user we're spawned, wait for stop or wakeup */ 157 /* OK, tell user we're spawned, wait for stop or wakeup */
@@ -117,9 +161,11 @@ static int kthread(void *_create)
117 schedule(); 161 schedule();
118 162
119 ret = -EINTR; 163 ret = -EINTR;
120 if (!self.should_stop)
121 ret = threadfn(data);
122 164
165 if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) {
166 __kthread_parkme(&self);
167 ret = threadfn(data);
168 }
123 /* we can't just return, we must preserve "self" on stack */ 169 /* we can't just return, we must preserve "self" on stack */
124 do_exit(ret); 170 do_exit(ret);
125} 171}
@@ -172,8 +218,7 @@ static void create_kthread(struct kthread_create_info *create)
172 * Returns a task_struct or ERR_PTR(-ENOMEM). 218 * Returns a task_struct or ERR_PTR(-ENOMEM).
173 */ 219 */
174struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), 220struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
175 void *data, 221 void *data, int node,
176 int node,
177 const char namefmt[], 222 const char namefmt[],
178 ...) 223 ...)
179{ 224{
@@ -210,6 +255,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
210} 255}
211EXPORT_SYMBOL(kthread_create_on_node); 256EXPORT_SYMBOL(kthread_create_on_node);
212 257
258static void __kthread_bind(struct task_struct *p, unsigned int cpu)
259{
260 /* It's safe because the task is inactive. */
261 do_set_cpus_allowed(p, cpumask_of(cpu));
262 p->flags |= PF_THREAD_BOUND;
263}
264
213/** 265/**
214 * kthread_bind - bind a just-created kthread to a cpu. 266 * kthread_bind - bind a just-created kthread to a cpu.
215 * @p: thread created by kthread_create(). 267 * @p: thread created by kthread_create().
@@ -226,14 +278,112 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
226 WARN_ON(1); 278 WARN_ON(1);
227 return; 279 return;
228 } 280 }
229 281 __kthread_bind(p, cpu);
230 /* It's safe because the task is inactive. */
231 do_set_cpus_allowed(p, cpumask_of(cpu));
232 p->flags |= PF_THREAD_BOUND;
233} 282}
234EXPORT_SYMBOL(kthread_bind); 283EXPORT_SYMBOL(kthread_bind);
235 284
236/** 285/**
286 * kthread_create_on_cpu - Create a cpu bound kthread
287 * @threadfn: the function to run until signal_pending(current).
288 * @data: data ptr for @threadfn.
289 * @cpu: The cpu on which the thread should be bound,
290 * @namefmt: printf-style name for the thread. Format is restricted
291 * to "name.*%u". Code fills in cpu number.
292 *
293 * Description: This helper function creates and names a kernel thread.
294 * The thread will be woken and put into park mode.
295 */
296struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
297 void *data, unsigned int cpu,
298 const char *namefmt)
299{
300 struct task_struct *p;
301
302 p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
303 cpu);
304 if (IS_ERR(p))
305 return p;
306 set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
307 to_kthread(p)->cpu = cpu;
308 /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */
309 kthread_park(p);
310 return p;
311}
312
313static struct kthread *task_get_live_kthread(struct task_struct *k)
314{
315 struct kthread *kthread;
316
317 get_task_struct(k);
318 kthread = to_kthread(k);
319 /* It might have exited */
320 barrier();
321 if (k->vfork_done != NULL)
322 return kthread;
323 return NULL;
324}
325
326/**
327 * kthread_unpark - unpark a thread created by kthread_create().
328 * @k: thread created by kthread_create().
329 *
330 * Sets kthread_should_park() for @k to return false and wakes it.
331 * If the thread is marked percpu then it is bound to the cpu
332 * again.
333 */
334void kthread_unpark(struct task_struct *k)
335{
336 struct kthread *kthread = task_get_live_kthread(k);
337
338 if (kthread) {
339 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
340 /*
341 * We clear the IS_PARKED bit here as we don't wait
342 * until the task has left the park code. So if we'd
343 * park before that happens we'd see the IS_PARKED bit
344 * which might be about to be cleared.
345 */
346 if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
347 if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
348 __kthread_bind(k, kthread->cpu);
349 wake_up_process(k);
350 }
351 }
352 put_task_struct(k);
353}
354
355/**
356 * kthread_park - park a thread created by kthread_create().
357 * @k: thread created by kthread_create().
358 *
359 * Sets kthread_should_park() for @k to return true, wakes it, and
360 * waits for it to return. This can also be called after kthread_create()
361 * instead of calling wake_up_process(): the thread will park without
362 * calling threadfn().
363 *
364 * Returns 0 if the thread is parked, -ENOSYS if the thread exited.
365 * If called by the kthread itself just the park bit is set.
366 */
367int kthread_park(struct task_struct *k)
368{
369 struct kthread *kthread = task_get_live_kthread(k);
370 int ret = -ENOSYS;
371
372 if (kthread) {
373 if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
374 set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
375 if (k != current) {
376 wake_up_process(k);
377 wait_for_completion(&kthread->parked);
378 }
379 }
380 ret = 0;
381 }
382 put_task_struct(k);
383 return ret;
384}
385
386/**
237 * kthread_stop - stop a thread created by kthread_create(). 387 * kthread_stop - stop a thread created by kthread_create().
238 * @k: thread created by kthread_create(). 388 * @k: thread created by kthread_create().
239 * 389 *
@@ -250,16 +400,13 @@ EXPORT_SYMBOL(kthread_bind);
250 */ 400 */
251int kthread_stop(struct task_struct *k) 401int kthread_stop(struct task_struct *k)
252{ 402{
253 struct kthread *kthread; 403 struct kthread *kthread = task_get_live_kthread(k);
254 int ret; 404 int ret;
255 405
256 trace_sched_kthread_stop(k); 406 trace_sched_kthread_stop(k);
257 get_task_struct(k); 407 if (kthread) {
258 408 set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
259 kthread = to_kthread(k); 409 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
260 barrier(); /* it might have exited */
261 if (k->vfork_done != NULL) {
262 kthread->should_stop = 1;
263 wake_up_process(k); 410 wake_up_process(k);
264 wait_for_completion(&kthread->exited); 411 wait_for_completion(&kthread->exited);
265 } 412 }
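
The kthread.c hunks above add a parking facility -- kthread_create_on_cpu(), kthread_park(), kthread_unpark() and kthread_parkme() -- alongside the existing stop machinery, so a per-CPU thread can be parked across CPU-hotplug transitions instead of being destroyed and recreated. What follows is only a rough sketch of how a caller might use it; the names my_percpu_thread(), my_task and my_start_on() are invented for illustration and are not part of this series.

#include <linux/kthread.h>
#include <linux/err.h>
#include <linux/sched.h>

/* Sketch only: a per-CPU worker built on the new parking API.
 * "my_percpu_thread" and friends are made-up names. */
static int my_percpu_thread(void *unused)
{
	while (!kthread_should_stop()) {
		if (kthread_should_park()) {
			kthread_parkme();	/* sit out hotplug transitions */
			continue;
		}
		/* ... do the per-CPU work here ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}

static struct task_struct *my_task;

static int my_start_on(unsigned int cpu)
{
	/* Comes back parked and bound to @cpu ... */
	my_task = kthread_create_on_cpu(my_percpu_thread, NULL, cpu,
					"my_thread/%u");
	if (IS_ERR(my_task))
		return PTR_ERR(my_task);
	kthread_unpark(my_task);	/* ... and starts running here. */
	return 0;
}

A hotplug callback would then call kthread_park(my_task) before the CPU goes down and kthread_unpark(my_task) once it is back, leaving the thread and its state intact in between.
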
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 4e6a61b15e86..29ca1c6da594 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -45,6 +45,7 @@
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/export.h> 46#include <linux/export.h>
47#include <linux/hardirq.h> 47#include <linux/hardirq.h>
48#include <linux/delay.h>
48 49
49#define CREATE_TRACE_POINTS 50#define CREATE_TRACE_POINTS
50#include <trace/events/rcu.h> 51#include <trace/events/rcu.h>
@@ -81,6 +82,9 @@ void __rcu_read_unlock(void)
81 } else { 82 } else {
82 barrier(); /* critical section before exit code. */ 83 barrier(); /* critical section before exit code. */
83 t->rcu_read_lock_nesting = INT_MIN; 84 t->rcu_read_lock_nesting = INT_MIN;
85#ifdef CONFIG_PROVE_RCU_DELAY
86 udelay(10); /* Make preemption more probable. */
87#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
84 barrier(); /* assign before ->rcu_read_unlock_special load */ 88 barrier(); /* assign before ->rcu_read_unlock_special load */
85 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 89 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
86 rcu_read_unlock_special(t); 90 rcu_read_unlock_special(t);
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 547b1fe5b052..e4c6a598d6f7 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -56,25 +56,28 @@ static void __call_rcu(struct rcu_head *head,
56static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 56static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
57 57
58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ 58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
59static void rcu_idle_enter_common(long long oldval) 59static void rcu_idle_enter_common(long long newval)
60{ 60{
61 if (rcu_dynticks_nesting) { 61 if (newval) {
62 RCU_TRACE(trace_rcu_dyntick("--=", 62 RCU_TRACE(trace_rcu_dyntick("--=",
63 oldval, rcu_dynticks_nesting)); 63 rcu_dynticks_nesting, newval));
64 rcu_dynticks_nesting = newval;
64 return; 65 return;
65 } 66 }
66 RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting)); 67 RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval));
67 if (!is_idle_task(current)) { 68 if (!is_idle_task(current)) {
68 struct task_struct *idle = idle_task(smp_processor_id()); 69 struct task_struct *idle = idle_task(smp_processor_id());
69 70
70 RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", 71 RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task",
71 oldval, rcu_dynticks_nesting)); 72 rcu_dynticks_nesting, newval));
72 ftrace_dump(DUMP_ALL); 73 ftrace_dump(DUMP_ALL);
73 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 74 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
74 current->pid, current->comm, 75 current->pid, current->comm,
75 idle->pid, idle->comm); /* must be idle task! */ 76 idle->pid, idle->comm); /* must be idle task! */
76 } 77 }
77 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ 78 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
79 barrier();
80 rcu_dynticks_nesting = newval;
78} 81}
79 82
80/* 83/*
@@ -84,17 +87,16 @@ static void rcu_idle_enter_common(long long oldval)
84void rcu_idle_enter(void) 87void rcu_idle_enter(void)
85{ 88{
86 unsigned long flags; 89 unsigned long flags;
87 long long oldval; 90 long long newval;
88 91
89 local_irq_save(flags); 92 local_irq_save(flags);
90 oldval = rcu_dynticks_nesting;
91 WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); 93 WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
92 if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 94 if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) ==
93 DYNTICK_TASK_NEST_VALUE) 95 DYNTICK_TASK_NEST_VALUE)
94 rcu_dynticks_nesting = 0; 96 newval = 0;
95 else 97 else
96 rcu_dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; 98 newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE;
97 rcu_idle_enter_common(oldval); 99 rcu_idle_enter_common(newval);
98 local_irq_restore(flags); 100 local_irq_restore(flags);
99} 101}
100EXPORT_SYMBOL_GPL(rcu_idle_enter); 102EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -105,15 +107,15 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
105void rcu_irq_exit(void) 107void rcu_irq_exit(void)
106{ 108{
107 unsigned long flags; 109 unsigned long flags;
108 long long oldval; 110 long long newval;
109 111
110 local_irq_save(flags); 112 local_irq_save(flags);
111 oldval = rcu_dynticks_nesting; 113 newval = rcu_dynticks_nesting - 1;
112 rcu_dynticks_nesting--; 114 WARN_ON_ONCE(newval < 0);
113 WARN_ON_ONCE(rcu_dynticks_nesting < 0); 115 rcu_idle_enter_common(newval);
114 rcu_idle_enter_common(oldval);
115 local_irq_restore(flags); 116 local_irq_restore(flags);
116} 117}
118EXPORT_SYMBOL_GPL(rcu_irq_exit);
117 119
118/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */ 120/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */
119static void rcu_idle_exit_common(long long oldval) 121static void rcu_idle_exit_common(long long oldval)
@@ -171,6 +173,7 @@ void rcu_irq_enter(void)
171 rcu_idle_exit_common(oldval); 173 rcu_idle_exit_common(oldval);
172 local_irq_restore(flags); 174 local_irq_restore(flags);
173} 175}
176EXPORT_SYMBOL_GPL(rcu_irq_enter);
174 177
175#ifdef CONFIG_DEBUG_LOCK_ALLOC 178#ifdef CONFIG_DEBUG_LOCK_ALLOC
176 179
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 918fd1e8509c..3d0190282204 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -278,7 +278,7 @@ static int rcu_boost(void)
278 rcu_preempt_ctrlblk.exp_tasks == NULL) 278 rcu_preempt_ctrlblk.exp_tasks == NULL)
279 return 0; /* Nothing to boost. */ 279 return 0; /* Nothing to boost. */
280 280
281 raw_local_irq_save(flags); 281 local_irq_save(flags);
282 282
283 /* 283 /*
284 * Recheck with irqs disabled: all tasks in need of boosting 284 * Recheck with irqs disabled: all tasks in need of boosting
@@ -287,7 +287,7 @@ static int rcu_boost(void)
287 */ 287 */
288 if (rcu_preempt_ctrlblk.boost_tasks == NULL && 288 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
289 rcu_preempt_ctrlblk.exp_tasks == NULL) { 289 rcu_preempt_ctrlblk.exp_tasks == NULL) {
290 raw_local_irq_restore(flags); 290 local_irq_restore(flags);
291 return 0; 291 return 0;
292 } 292 }
293 293
@@ -317,7 +317,7 @@ static int rcu_boost(void)
317 t = container_of(tb, struct task_struct, rcu_node_entry); 317 t = container_of(tb, struct task_struct, rcu_node_entry);
318 rt_mutex_init_proxy_locked(&mtx, t); 318 rt_mutex_init_proxy_locked(&mtx, t);
319 t->rcu_boost_mutex = &mtx; 319 t->rcu_boost_mutex = &mtx;
320 raw_local_irq_restore(flags); 320 local_irq_restore(flags);
321 rt_mutex_lock(&mtx); 321 rt_mutex_lock(&mtx);
322 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 322 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
323 323
@@ -991,9 +991,9 @@ static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
991{ 991{
992 unsigned long flags; 992 unsigned long flags;
993 993
994 raw_local_irq_save(flags); 994 local_irq_save(flags);
995 rcp->qlen -= n; 995 rcp->qlen -= n;
996 raw_local_irq_restore(flags); 996 local_irq_restore(flags);
997} 997}
998 998
999/* 999/*
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 25b15033c61f..aaa7b9f3532a 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -53,10 +53,11 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@fre
53 53
54static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ 54static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
55static int nfakewriters = 4; /* # fake writer threads */ 55static int nfakewriters = 4; /* # fake writer threads */
56static int stat_interval; /* Interval between stats, in seconds. */ 56static int stat_interval = 60; /* Interval between stats, in seconds. */
57 /* Defaults to "only at end of test". */ 57 /* Zero means "only at end of test". */
58static bool verbose; /* Print more debug info. */ 58static bool verbose; /* Print more debug info. */
59static bool test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ 59static bool test_no_idle_hz = true;
60 /* Test RCU support for tickless idle CPUs. */
60static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
61static int stutter = 5; /* Start/stop testing interval (in sec) */ 62static int stutter = 5; /* Start/stop testing interval (in sec) */
62static int irqreader = 1; /* RCU readers from irq (timers). */ 63static int irqreader = 1; /* RCU readers from irq (timers). */
@@ -119,11 +120,11 @@ MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
119 120
120#define TORTURE_FLAG "-torture:" 121#define TORTURE_FLAG "-torture:"
121#define PRINTK_STRING(s) \ 122#define PRINTK_STRING(s) \
122 do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) 123 do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0)
123#define VERBOSE_PRINTK_STRING(s) \ 124#define VERBOSE_PRINTK_STRING(s) \
124 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) 125 do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0)
125#define VERBOSE_PRINTK_ERRSTRING(s) \ 126#define VERBOSE_PRINTK_ERRSTRING(s) \
126 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) 127 do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
127 128
128static char printk_buf[4096]; 129static char printk_buf[4096];
129 130
@@ -176,8 +177,14 @@ static long n_rcu_torture_boosts;
176static long n_rcu_torture_timers; 177static long n_rcu_torture_timers;
177static long n_offline_attempts; 178static long n_offline_attempts;
178static long n_offline_successes; 179static long n_offline_successes;
180static unsigned long sum_offline;
181static int min_offline = -1;
182static int max_offline;
179static long n_online_attempts; 183static long n_online_attempts;
180static long n_online_successes; 184static long n_online_successes;
185static unsigned long sum_online;
186static int min_online = -1;
187static int max_online;
181static long n_barrier_attempts; 188static long n_barrier_attempts;
182static long n_barrier_successes; 189static long n_barrier_successes;
183static struct list_head rcu_torture_removed; 190static struct list_head rcu_torture_removed;
@@ -235,7 +242,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1,
235 if (fullstop == FULLSTOP_DONTSTOP) 242 if (fullstop == FULLSTOP_DONTSTOP)
236 fullstop = FULLSTOP_SHUTDOWN; 243 fullstop = FULLSTOP_SHUTDOWN;
237 else 244 else
238 printk(KERN_WARNING /* but going down anyway, so... */ 245 pr_warn(/* but going down anyway, so... */
239 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); 246 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
240 mutex_unlock(&fullstop_mutex); 247 mutex_unlock(&fullstop_mutex);
241 return NOTIFY_DONE; 248 return NOTIFY_DONE;
@@ -248,7 +255,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1,
248static void rcutorture_shutdown_absorb(char *title) 255static void rcutorture_shutdown_absorb(char *title)
249{ 256{
250 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { 257 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
251 printk(KERN_NOTICE 258 pr_notice(
252 "rcutorture thread %s parking due to system shutdown\n", 259 "rcutorture thread %s parking due to system shutdown\n",
253 title); 260 title);
254 schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); 261 schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
@@ -1214,11 +1221,13 @@ rcu_torture_printk(char *page)
1214 n_rcu_torture_boost_failure, 1221 n_rcu_torture_boost_failure,
1215 n_rcu_torture_boosts, 1222 n_rcu_torture_boosts,
1216 n_rcu_torture_timers); 1223 n_rcu_torture_timers);
1217 cnt += sprintf(&page[cnt], "onoff: %ld/%ld:%ld/%ld ", 1224 cnt += sprintf(&page[cnt],
1218 n_online_successes, 1225 "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
1219 n_online_attempts, 1226 n_online_successes, n_online_attempts,
1220 n_offline_successes, 1227 n_offline_successes, n_offline_attempts,
1221 n_offline_attempts); 1228 min_online, max_online,
1229 min_offline, max_offline,
1230 sum_online, sum_offline, HZ);
1222 cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", 1231 cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld",
1223 n_barrier_successes, 1232 n_barrier_successes,
1224 n_barrier_attempts, 1233 n_barrier_attempts,
@@ -1267,7 +1276,7 @@ rcu_torture_stats_print(void)
1267 int cnt; 1276 int cnt;
1268 1277
1269 cnt = rcu_torture_printk(printk_buf); 1278 cnt = rcu_torture_printk(printk_buf);
1270 printk(KERN_ALERT "%s", printk_buf); 1279 pr_alert("%s", printk_buf);
1271} 1280}
1272 1281
1273/* 1282/*
@@ -1380,20 +1389,20 @@ rcu_torture_stutter(void *arg)
1380static inline void 1389static inline void
1381rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) 1390rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1382{ 1391{
1383 printk(KERN_ALERT "%s" TORTURE_FLAG 1392 pr_alert("%s" TORTURE_FLAG
1384 "--- %s: nreaders=%d nfakewriters=%d " 1393 "--- %s: nreaders=%d nfakewriters=%d "
1385 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1394 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1386 "shuffle_interval=%d stutter=%d irqreader=%d " 1395 "shuffle_interval=%d stutter=%d irqreader=%d "
1387 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " 1396 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1388 "test_boost=%d/%d test_boost_interval=%d " 1397 "test_boost=%d/%d test_boost_interval=%d "
1389 "test_boost_duration=%d shutdown_secs=%d " 1398 "test_boost_duration=%d shutdown_secs=%d "
1390 "onoff_interval=%d onoff_holdoff=%d\n", 1399 "onoff_interval=%d onoff_holdoff=%d\n",
1391 torture_type, tag, nrealreaders, nfakewriters, 1400 torture_type, tag, nrealreaders, nfakewriters,
1392 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1401 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1393 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, 1402 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1394 test_boost, cur_ops->can_boost, 1403 test_boost, cur_ops->can_boost,
1395 test_boost_interval, test_boost_duration, shutdown_secs, 1404 test_boost_interval, test_boost_duration, shutdown_secs,
1396 onoff_interval, onoff_holdoff); 1405 onoff_interval, onoff_holdoff);
1397} 1406}
1398 1407
1399static struct notifier_block rcutorture_shutdown_nb = { 1408static struct notifier_block rcutorture_shutdown_nb = {
@@ -1460,9 +1469,9 @@ rcu_torture_shutdown(void *arg)
1460 !kthread_should_stop()) { 1469 !kthread_should_stop()) {
1461 delta = shutdown_time - jiffies_snap; 1470 delta = shutdown_time - jiffies_snap;
1462 if (verbose) 1471 if (verbose)
1463 printk(KERN_ALERT "%s" TORTURE_FLAG 1472 pr_alert("%s" TORTURE_FLAG
1464 "rcu_torture_shutdown task: %lu jiffies remaining\n", 1473 "rcu_torture_shutdown task: %lu jiffies remaining\n",
1465 torture_type, delta); 1474 torture_type, delta);
1466 schedule_timeout_interruptible(delta); 1475 schedule_timeout_interruptible(delta);
1467 jiffies_snap = ACCESS_ONCE(jiffies); 1476 jiffies_snap = ACCESS_ONCE(jiffies);
1468 } 1477 }
@@ -1490,8 +1499,10 @@ static int __cpuinit
1490rcu_torture_onoff(void *arg) 1499rcu_torture_onoff(void *arg)
1491{ 1500{
1492 int cpu; 1501 int cpu;
1502 unsigned long delta;
1493 int maxcpu = -1; 1503 int maxcpu = -1;
1494 DEFINE_RCU_RANDOM(rand); 1504 DEFINE_RCU_RANDOM(rand);
1505 unsigned long starttime;
1495 1506
1496 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); 1507 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
1497 for_each_online_cpu(cpu) 1508 for_each_online_cpu(cpu)
@@ -1506,29 +1517,51 @@ rcu_torture_onoff(void *arg)
1506 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); 1517 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);
1507 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { 1518 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
1508 if (verbose) 1519 if (verbose)
1509 printk(KERN_ALERT "%s" TORTURE_FLAG 1520 pr_alert("%s" TORTURE_FLAG
1510 "rcu_torture_onoff task: offlining %d\n", 1521 "rcu_torture_onoff task: offlining %d\n",
1511 torture_type, cpu); 1522 torture_type, cpu);
1523 starttime = jiffies;
1512 n_offline_attempts++; 1524 n_offline_attempts++;
1513 if (cpu_down(cpu) == 0) { 1525 if (cpu_down(cpu) == 0) {
1514 if (verbose) 1526 if (verbose)
1515 printk(KERN_ALERT "%s" TORTURE_FLAG 1527 pr_alert("%s" TORTURE_FLAG
1516 "rcu_torture_onoff task: offlined %d\n", 1528 "rcu_torture_onoff task: offlined %d\n",
1517 torture_type, cpu); 1529 torture_type, cpu);
1518 n_offline_successes++; 1530 n_offline_successes++;
1531 delta = jiffies - starttime;
1532 sum_offline += delta;
1533 if (min_offline < 0) {
1534 min_offline = delta;
1535 max_offline = delta;
1536 }
1537 if (min_offline > delta)
1538 min_offline = delta;
1539 if (max_offline < delta)
1540 max_offline = delta;
1519 } 1541 }
1520 } else if (cpu_is_hotpluggable(cpu)) { 1542 } else if (cpu_is_hotpluggable(cpu)) {
1521 if (verbose) 1543 if (verbose)
1522 printk(KERN_ALERT "%s" TORTURE_FLAG 1544 pr_alert("%s" TORTURE_FLAG
1523 "rcu_torture_onoff task: onlining %d\n", 1545 "rcu_torture_onoff task: onlining %d\n",
1524 torture_type, cpu); 1546 torture_type, cpu);
1547 starttime = jiffies;
1525 n_online_attempts++; 1548 n_online_attempts++;
1526 if (cpu_up(cpu) == 0) { 1549 if (cpu_up(cpu) == 0) {
1527 if (verbose) 1550 if (verbose)
1528 printk(KERN_ALERT "%s" TORTURE_FLAG 1551 pr_alert("%s" TORTURE_FLAG
1529 "rcu_torture_onoff task: onlined %d\n", 1552 "rcu_torture_onoff task: onlined %d\n",
1530 torture_type, cpu); 1553 torture_type, cpu);
1531 n_online_successes++; 1554 n_online_successes++;
1555 delta = jiffies - starttime;
1556 sum_online += delta;
1557 if (min_online < 0) {
1558 min_online = delta;
1559 max_online = delta;
1560 }
1561 if (min_online > delta)
1562 min_online = delta;
1563 if (max_online < delta)
1564 max_online = delta;
1532 } 1565 }
1533 } 1566 }
1534 schedule_timeout_interruptible(onoff_interval * HZ); 1567 schedule_timeout_interruptible(onoff_interval * HZ);
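
The two hotplug branches above open-code the same latency bookkeeping twice, once for offlining and once for onlining. Purely as a reading aid (the patch itself keeps the open-coded form), the pattern reduces to a small helper along these lines; struct hotplug_lat and hotplug_lat_record() are invented names:

#include <linux/jiffies.h>

/* Sketch only: the min/max/sum accounting from the hunk above,
 * factored into one helper. */
struct hotplug_lat {
	unsigned long sum;	/* total latency, in jiffies */
	int min;		/* -1 until the first sample arrives */
	int max;
};

static void hotplug_lat_record(struct hotplug_lat *lat, unsigned long start)
{
	unsigned long delta = jiffies - start;

	lat->sum += delta;
	if (lat->min < 0) {	/* first sample initializes both bounds */
		lat->min = delta;
		lat->max = delta;
	}
	if (lat->min > delta)
		lat->min = delta;
	if (lat->max < delta)
		lat->max = delta;
}
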
@@ -1593,14 +1626,14 @@ static int __cpuinit rcu_torture_stall(void *args)
1593 if (!kthread_should_stop()) { 1626 if (!kthread_should_stop()) {
1594 stop_at = get_seconds() + stall_cpu; 1627 stop_at = get_seconds() + stall_cpu;
1595 /* RCU CPU stall is expected behavior in following code. */ 1628 /* RCU CPU stall is expected behavior in following code. */
1596 printk(KERN_ALERT "rcu_torture_stall start.\n"); 1629 pr_alert("rcu_torture_stall start.\n");
1597 rcu_read_lock(); 1630 rcu_read_lock();
1598 preempt_disable(); 1631 preempt_disable();
1599 while (ULONG_CMP_LT(get_seconds(), stop_at)) 1632 while (ULONG_CMP_LT(get_seconds(), stop_at))
1600 continue; /* Induce RCU CPU stall warning. */ 1633 continue; /* Induce RCU CPU stall warning. */
1601 preempt_enable(); 1634 preempt_enable();
1602 rcu_read_unlock(); 1635 rcu_read_unlock();
1603 printk(KERN_ALERT "rcu_torture_stall end.\n"); 1636 pr_alert("rcu_torture_stall end.\n");
1604 } 1637 }
1605 rcutorture_shutdown_absorb("rcu_torture_stall"); 1638 rcutorture_shutdown_absorb("rcu_torture_stall");
1606 while (!kthread_should_stop()) 1639 while (!kthread_should_stop())
@@ -1716,12 +1749,12 @@ static int rcu_torture_barrier_init(void)
1716 if (n_barrier_cbs == 0) 1749 if (n_barrier_cbs == 0)
1717 return 0; 1750 return 0;
1718 if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { 1751 if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) {
1719 printk(KERN_ALERT "%s" TORTURE_FLAG 1752 pr_alert("%s" TORTURE_FLAG
1720 " Call or barrier ops missing for %s,\n", 1753 " Call or barrier ops missing for %s,\n",
1721 torture_type, cur_ops->name); 1754 torture_type, cur_ops->name);
1722 printk(KERN_ALERT "%s" TORTURE_FLAG 1755 pr_alert("%s" TORTURE_FLAG
1723 " RCU barrier testing omitted from run.\n", 1756 " RCU barrier testing omitted from run.\n",
1724 torture_type); 1757 torture_type);
1725 return 0; 1758 return 0;
1726 } 1759 }
1727 atomic_set(&barrier_cbs_count, 0); 1760 atomic_set(&barrier_cbs_count, 0);
@@ -1814,7 +1847,7 @@ rcu_torture_cleanup(void)
1814 mutex_lock(&fullstop_mutex); 1847 mutex_lock(&fullstop_mutex);
1815 rcutorture_record_test_transition(); 1848 rcutorture_record_test_transition();
1816 if (fullstop == FULLSTOP_SHUTDOWN) { 1849 if (fullstop == FULLSTOP_SHUTDOWN) {
1817 printk(KERN_WARNING /* but going down anyway, so... */ 1850 pr_warn(/* but going down anyway, so... */
1818 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); 1851 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
1819 mutex_unlock(&fullstop_mutex); 1852 mutex_unlock(&fullstop_mutex);
1820 schedule_timeout_uninterruptible(10); 1853 schedule_timeout_uninterruptible(10);
@@ -1938,17 +1971,17 @@ rcu_torture_init(void)
1938 break; 1971 break;
1939 } 1972 }
1940 if (i == ARRAY_SIZE(torture_ops)) { 1973 if (i == ARRAY_SIZE(torture_ops)) {
1941 printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n", 1974 pr_alert("rcu-torture: invalid torture type: \"%s\"\n",
1942 torture_type); 1975 torture_type);
1943 printk(KERN_ALERT "rcu-torture types:"); 1976 pr_alert("rcu-torture types:");
1944 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) 1977 for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
1945 printk(KERN_ALERT " %s", torture_ops[i]->name); 1978 pr_alert(" %s", torture_ops[i]->name);
1946 printk(KERN_ALERT "\n"); 1979 pr_alert("\n");
1947 mutex_unlock(&fullstop_mutex); 1980 mutex_unlock(&fullstop_mutex);
1948 return -EINVAL; 1981 return -EINVAL;
1949 } 1982 }
1950 if (cur_ops->fqs == NULL && fqs_duration != 0) { 1983 if (cur_ops->fqs == NULL && fqs_duration != 0) {
1951 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); 1984 pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n");
1952 fqs_duration = 0; 1985 fqs_duration = 0;
1953 } 1986 }
1954 if (cur_ops->init) 1987 if (cur_ops->init)
@@ -1996,14 +2029,15 @@ rcu_torture_init(void)
1996 /* Start up the kthreads. */ 2029 /* Start up the kthreads. */
1997 2030
1998 VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); 2031 VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task");
1999 writer_task = kthread_run(rcu_torture_writer, NULL, 2032 writer_task = kthread_create(rcu_torture_writer, NULL,
2000 "rcu_torture_writer"); 2033 "rcu_torture_writer");
2001 if (IS_ERR(writer_task)) { 2034 if (IS_ERR(writer_task)) {
2002 firsterr = PTR_ERR(writer_task); 2035 firsterr = PTR_ERR(writer_task);
2003 VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); 2036 VERBOSE_PRINTK_ERRSTRING("Failed to create writer");
2004 writer_task = NULL; 2037 writer_task = NULL;
2005 goto unwind; 2038 goto unwind;
2006 } 2039 }
2040 wake_up_process(writer_task);
2007 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), 2041 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
2008 GFP_KERNEL); 2042 GFP_KERNEL);
2009 if (fakewriter_tasks == NULL) { 2043 if (fakewriter_tasks == NULL) {
@@ -2118,14 +2152,15 @@ rcu_torture_init(void)
2118 } 2152 }
2119 if (shutdown_secs > 0) { 2153 if (shutdown_secs > 0) {
2120 shutdown_time = jiffies + shutdown_secs * HZ; 2154 shutdown_time = jiffies + shutdown_secs * HZ;
2121 shutdown_task = kthread_run(rcu_torture_shutdown, NULL, 2155 shutdown_task = kthread_create(rcu_torture_shutdown, NULL,
2122 "rcu_torture_shutdown"); 2156 "rcu_torture_shutdown");
2123 if (IS_ERR(shutdown_task)) { 2157 if (IS_ERR(shutdown_task)) {
2124 firsterr = PTR_ERR(shutdown_task); 2158 firsterr = PTR_ERR(shutdown_task);
2125 VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); 2159 VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown");
2126 shutdown_task = NULL; 2160 shutdown_task = NULL;
2127 goto unwind; 2161 goto unwind;
2128 } 2162 }
2163 wake_up_process(shutdown_task);
2129 } 2164 }
2130 i = rcu_torture_onoff_init(); 2165 i = rcu_torture_onoff_init();
2131 if (i != 0) { 2166 if (i != 0) {
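
rcu_torture_writer and rcu_torture_shutdown are now created with kthread_create() and woken explicitly where kthread_run() previously did both at once. kthread_run() is simply kthread_create() followed by wake_up_process(), so splitting it gives rcu_torture_init() a window to finish its bookkeeping around the new task (storing the pointer, handling IS_ERR()) before that task can start running, presumably to keep the new thread from racing with the rest of initialization. A stripped-down sketch of the deferred-start form; my_fn() and start_deferred() are made-up names:

#include <linux/kthread.h>
#include <linux/err.h>
#include <linux/sched.h>

/* Sketch only; "my_fn" is a made-up thread function. */
static int my_fn(void *unused)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

static struct task_struct *start_deferred(void)
{
	struct task_struct *t;

	/* kthread_run(my_fn, NULL, "my_kthread") is equivalent to: */
	t = kthread_create(my_fn, NULL, "my_kthread");
	if (!IS_ERR(t))
		wake_up_process(t);	/* nothing runs until this call */
	return t;
}
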
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index f280e542e3e9..7387e46009d9 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -52,6 +52,7 @@
52#include <linux/prefetch.h> 52#include <linux/prefetch.h>
53#include <linux/delay.h> 53#include <linux/delay.h>
54#include <linux/stop_machine.h> 54#include <linux/stop_machine.h>
55#include <linux/random.h>
55 56
56#include "rcutree.h" 57#include "rcutree.h"
57#include <trace/events/rcu.h> 58#include <trace/events/rcu.h>
@@ -61,6 +62,7 @@
61/* Data structures. */ 62/* Data structures. */
62 63
63static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 64static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
65static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
64 66
65#define RCU_STATE_INITIALIZER(sname, cr) { \ 67#define RCU_STATE_INITIALIZER(sname, cr) { \
66 .level = { &sname##_state.node[0] }, \ 68 .level = { &sname##_state.node[0] }, \
@@ -72,7 +74,6 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
72 .orphan_nxttail = &sname##_state.orphan_nxtlist, \ 74 .orphan_nxttail = &sname##_state.orphan_nxtlist, \
73 .orphan_donetail = &sname##_state.orphan_donelist, \ 75 .orphan_donetail = &sname##_state.orphan_donelist, \
74 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
75 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \
76 .name = #sname, \ 77 .name = #sname, \
77} 78}
78 79
@@ -88,7 +89,7 @@ LIST_HEAD(rcu_struct_flavors);
88 89
89/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ 90/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
90static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; 91static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
91module_param(rcu_fanout_leaf, int, 0); 92module_param(rcu_fanout_leaf, int, 0444);
92int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; 93int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
93static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ 94static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */
94 NUM_RCU_LVL_0, 95 NUM_RCU_LVL_0,
@@ -133,13 +134,12 @@ static int rcu_scheduler_fully_active __read_mostly;
133 */ 134 */
134static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); 135static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
135DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); 136DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
136DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
137DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); 137DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
138DEFINE_PER_CPU(char, rcu_cpu_has_work); 138DEFINE_PER_CPU(char, rcu_cpu_has_work);
139 139
140#endif /* #ifdef CONFIG_RCU_BOOST */ 140#endif /* #ifdef CONFIG_RCU_BOOST */
141 141
142static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); 142static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
143static void invoke_rcu_core(void); 143static void invoke_rcu_core(void);
144static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 144static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
145 145
@@ -175,8 +175,6 @@ void rcu_sched_qs(int cpu)
175{ 175{
176 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); 176 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
177 177
178 rdp->passed_quiesce_gpnum = rdp->gpnum;
179 barrier();
180 if (rdp->passed_quiesce == 0) 178 if (rdp->passed_quiesce == 0)
181 trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); 179 trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs");
182 rdp->passed_quiesce = 1; 180 rdp->passed_quiesce = 1;
@@ -186,8 +184,6 @@ void rcu_bh_qs(int cpu)
186{ 184{
187 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 185 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
188 186
189 rdp->passed_quiesce_gpnum = rdp->gpnum;
190 barrier();
191 if (rdp->passed_quiesce == 0) 187 if (rdp->passed_quiesce == 0)
192 trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); 188 trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs");
193 rdp->passed_quiesce = 1; 189 rdp->passed_quiesce = 1;
@@ -216,9 +212,9 @@ static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */
216static int qhimark = 10000; /* If this many pending, ignore blimit. */ 212static int qhimark = 10000; /* If this many pending, ignore blimit. */
217static int qlowmark = 100; /* Once only this many pending, use blimit. */ 213static int qlowmark = 100; /* Once only this many pending, use blimit. */
218 214
219module_param(blimit, int, 0); 215module_param(blimit, int, 0444);
220module_param(qhimark, int, 0); 216module_param(qhimark, int, 0444);
221module_param(qlowmark, int, 0); 217module_param(qlowmark, int, 0444);
222 218
223int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ 219int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
224int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; 220int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
@@ -226,7 +222,14 @@ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
226module_param(rcu_cpu_stall_suppress, int, 0644); 222module_param(rcu_cpu_stall_suppress, int, 0644);
227module_param(rcu_cpu_stall_timeout, int, 0644); 223module_param(rcu_cpu_stall_timeout, int, 0644);
228 224
229static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 225static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS;
226static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;
227
228module_param(jiffies_till_first_fqs, ulong, 0644);
229module_param(jiffies_till_next_fqs, ulong, 0644);
230
231static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *));
232static void force_quiescent_state(struct rcu_state *rsp);
230static int rcu_pending(int cpu); 233static int rcu_pending(int cpu);
231 234
232/* 235/*
@@ -252,7 +255,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
252 */ 255 */
253void rcu_bh_force_quiescent_state(void) 256void rcu_bh_force_quiescent_state(void)
254{ 257{
255 force_quiescent_state(&rcu_bh_state, 0); 258 force_quiescent_state(&rcu_bh_state);
256} 259}
257EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); 260EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
258 261
@@ -286,7 +289,7 @@ EXPORT_SYMBOL_GPL(rcutorture_record_progress);
286 */ 289 */
287void rcu_sched_force_quiescent_state(void) 290void rcu_sched_force_quiescent_state(void)
288{ 291{
289 force_quiescent_state(&rcu_sched_state, 0); 292 force_quiescent_state(&rcu_sched_state);
290} 293}
291EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); 294EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
292 295
@@ -305,7 +308,9 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
305static int 308static int
306cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 309cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
307{ 310{
308 return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp); 311 return *rdp->nxttail[RCU_DONE_TAIL +
312 ACCESS_ONCE(rsp->completed) != rdp->completed] &&
313 !rcu_gp_in_progress(rsp);
309} 314}
310 315
311/* 316/*
@@ -317,35 +322,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
317} 322}
318 323
319/* 324/*
320 * If the specified CPU is offline, tell the caller that it is in
321 * a quiescent state. Otherwise, whack it with a reschedule IPI.
322 * Grace periods can end up waiting on an offline CPU when that
323 * CPU is in the process of coming online -- it will be added to the
324 * rcu_node bitmasks before it actually makes it online. The same thing
325 * can happen while a CPU is in the process of coming online. Because this
326 * race is quite rare, we check for it after detecting that the grace
327 * period has been delayed rather than checking each and every CPU
328 * each and every time we start a new grace period.
329 */
330static int rcu_implicit_offline_qs(struct rcu_data *rdp)
331{
332 /*
333 * If the CPU is offline for more than a jiffy, it is in a quiescent
334 * state. We can trust its state not to change because interrupts
335 * are disabled. The reason for the jiffy's worth of slack is to
336 * handle CPUs initializing on the way up and finding their way
337 * to the idle loop on the way down.
338 */
339 if (cpu_is_offline(rdp->cpu) &&
340 ULONG_CMP_LT(rdp->rsp->gp_start + 2, jiffies)) {
341 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
342 rdp->offline_fqs++;
343 return 1;
344 }
345 return 0;
346}
347
348/*
349 * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle 325 * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle
350 * 326 *
351 * If the new value of the ->dynticks_nesting counter now is zero, 327 * If the new value of the ->dynticks_nesting counter now is zero,
@@ -673,7 +649,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
673 * Return true if the specified CPU has passed through a quiescent 649 * Return true if the specified CPU has passed through a quiescent
674 * state by virtue of being in or having passed through a dynticks 650 * state by virtue of being in or having passed through a dynticks
675 * idle state since the last call to dyntick_save_progress_counter() 651 * idle state since the last call to dyntick_save_progress_counter()
676 * for this same CPU. 652 * for this same CPU, or by virtue of having been offline.
677 */ 653 */
678static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 654static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
679{ 655{
@@ -697,8 +673,26 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
697 return 1; 673 return 1;
698 } 674 }
699 675
700 /* Go check for the CPU being offline. */ 676 /*
701 return rcu_implicit_offline_qs(rdp); 677 * Check for the CPU being offline, but only if the grace period
678 * is old enough. We don't need to worry about the CPU changing
679 * state: If we see it offline even once, it has been through a
680 * quiescent state.
681 *
682 * The reason for insisting that the grace period be at least
683 * one jiffy old is that CPUs that are not quite online, as well as
684 * CPUs that have just gone offline, can still execute RCU read-side
685 * critical sections.
686 */
687 if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies))
688 return 0; /* Grace period is not old enough. */
689 barrier();
690 if (cpu_is_offline(rdp->cpu)) {
691 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
692 rdp->offline_fqs++;
693 return 1;
694 }
695 return 0;
702} 696}
703 697
704static int jiffies_till_stall_check(void) 698static int jiffies_till_stall_check(void)
@@ -755,14 +749,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
755 rcu_for_each_leaf_node(rsp, rnp) { 749 rcu_for_each_leaf_node(rsp, rnp) {
756 raw_spin_lock_irqsave(&rnp->lock, flags); 750 raw_spin_lock_irqsave(&rnp->lock, flags);
757 ndetected += rcu_print_task_stall(rnp); 751 ndetected += rcu_print_task_stall(rnp);
752 if (rnp->qsmask != 0) {
753 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
754 if (rnp->qsmask & (1UL << cpu)) {
755 print_cpu_stall_info(rsp,
756 rnp->grplo + cpu);
757 ndetected++;
758 }
759 }
758 raw_spin_unlock_irqrestore(&rnp->lock, flags); 760 raw_spin_unlock_irqrestore(&rnp->lock, flags);
759 if (rnp->qsmask == 0)
760 continue;
761 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
762 if (rnp->qsmask & (1UL << cpu)) {
763 print_cpu_stall_info(rsp, rnp->grplo + cpu);
764 ndetected++;
765 }
766 } 761 }
767 762
768 /* 763 /*
@@ -782,11 +777,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
782 else if (!trigger_all_cpu_backtrace()) 777 else if (!trigger_all_cpu_backtrace())
783 dump_stack(); 778 dump_stack();
784 779
785 /* If so configured, complain about tasks blocking the grace period. */ 780 /* Complain about tasks blocking the grace period. */
786 781
787 rcu_print_detail_task_stall(rsp); 782 rcu_print_detail_task_stall(rsp);
788 783
789 force_quiescent_state(rsp, 0); /* Kick them all. */ 784 force_quiescent_state(rsp); /* Kick them all. */
790} 785}
791 786
792static void print_cpu_stall(struct rcu_state *rsp) 787static void print_cpu_stall(struct rcu_state *rsp)
@@ -827,7 +822,8 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
827 j = ACCESS_ONCE(jiffies); 822 j = ACCESS_ONCE(jiffies);
828 js = ACCESS_ONCE(rsp->jiffies_stall); 823 js = ACCESS_ONCE(rsp->jiffies_stall);
829 rnp = rdp->mynode; 824 rnp = rdp->mynode;
830 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { 825 if (rcu_gp_in_progress(rsp) &&
826 (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
831 827
832 /* We haven't checked in, so go dump stack. */ 828 /* We haven't checked in, so go dump stack. */
833 print_cpu_stall(rsp); 829 print_cpu_stall(rsp);
@@ -889,12 +885,8 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
889 */ 885 */
890 rdp->gpnum = rnp->gpnum; 886 rdp->gpnum = rnp->gpnum;
891 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); 887 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
892 if (rnp->qsmask & rdp->grpmask) { 888 rdp->passed_quiesce = 0;
893 rdp->qs_pending = 1; 889 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
894 rdp->passed_quiesce = 0;
895 } else {
896 rdp->qs_pending = 0;
897 }
898 zero_cpu_stall_ticks(rdp); 890 zero_cpu_stall_ticks(rdp);
899 } 891 }
900} 892}
@@ -974,10 +966,13 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
974 * our behalf. Catch up with this state to avoid noting 966 * our behalf. Catch up with this state to avoid noting
975 * spurious new grace periods. If another grace period 967 * spurious new grace periods. If another grace period
976 * has started, then rnp->gpnum will have advanced, so 968 * has started, then rnp->gpnum will have advanced, so
977 * we will detect this later on. 969 * we will detect this later on. Of course, any quiescent
970 * states we found for the old GP are now invalid.
978 */ 971 */
979 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) 972 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) {
980 rdp->gpnum = rdp->completed; 973 rdp->gpnum = rdp->completed;
974 rdp->passed_quiesce = 0;
975 }
981 976
982 /* 977 /*
983 * If RCU does not need a quiescent state from this CPU, 978 * If RCU does not need a quiescent state from this CPU,
@@ -1021,97 +1016,56 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
1021 /* Prior grace period ended, so advance callbacks for current CPU. */ 1016 /* Prior grace period ended, so advance callbacks for current CPU. */
1022 __rcu_process_gp_end(rsp, rnp, rdp); 1017 __rcu_process_gp_end(rsp, rnp, rdp);
1023 1018
1024 /*
1025 * Because this CPU just now started the new grace period, we know
1026 * that all of its callbacks will be covered by this upcoming grace
1027 * period, even the ones that were registered arbitrarily recently.
1028 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
1029 *
1030 * Other CPUs cannot be sure exactly when the grace period started.
1031 * Therefore, their recently registered callbacks must pass through
1032 * an additional RCU_NEXT_READY stage, so that they will be handled
1033 * by the next RCU grace period.
1034 */
1035 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1036 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1037
1038 /* Set state so that this CPU will detect the next quiescent state. */ 1019 /* Set state so that this CPU will detect the next quiescent state. */
1039 __note_new_gpnum(rsp, rnp, rdp); 1020 __note_new_gpnum(rsp, rnp, rdp);
1040} 1021}
1041 1022
1042/* 1023/*
1043 * Start a new RCU grace period if warranted, re-initializing the hierarchy 1024 * Initialize a new grace period.
1044 * in preparation for detecting the next grace period. The caller must hold
1045 * the root node's ->lock, which is released before return. Hard irqs must
1046 * be disabled.
1047 *
1048 * Note that it is legal for a dying CPU (which is marked as offline) to
1049 * invoke this function. This can happen when the dying CPU reports its
1050 * quiescent state.
1051 */ 1025 */
1052static void 1026static int rcu_gp_init(struct rcu_state *rsp)
1053rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
1054 __releases(rcu_get_root(rsp)->lock)
1055{ 1027{
1056 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1028 struct rcu_data *rdp;
1057 struct rcu_node *rnp = rcu_get_root(rsp); 1029 struct rcu_node *rnp = rcu_get_root(rsp);
1058 1030
1059 if (!rcu_scheduler_fully_active || 1031 raw_spin_lock_irq(&rnp->lock);
1060 !cpu_needs_another_gp(rsp, rdp)) { 1032 rsp->gp_flags = 0; /* Clear all flags: New grace period. */
1061 /*
1062 * Either the scheduler hasn't yet spawned the first
1063 * non-idle task or this CPU does not need another
1064 * grace period. Either way, don't start a new grace
1065 * period.
1066 */
1067 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1068 return;
1069 }
1070 1033
1071 if (rsp->fqs_active) { 1034 if (rcu_gp_in_progress(rsp)) {
1072 /* 1035 /* Grace period already in progress, don't start another. */
1073 * This CPU needs a grace period, but force_quiescent_state() 1036 raw_spin_unlock_irq(&rnp->lock);
1074 * is running. Tell it to start one on this CPU's behalf. 1037 return 0;
1075 */
1076 rsp->fqs_need_gp = 1;
1077 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1078 return;
1079 } 1038 }
1080 1039
1081 /* Advance to a new grace period and initialize state. */ 1040 /* Advance to a new grace period and initialize state. */
1082 rsp->gpnum++; 1041 rsp->gpnum++;
1083 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); 1042 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
1084 WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT);
1085 rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */
1086 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1087 record_gp_stall_check_time(rsp); 1043 record_gp_stall_check_time(rsp);
1088 raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ 1044 raw_spin_unlock_irq(&rnp->lock);
1089 1045
1090 /* Exclude any concurrent CPU-hotplug operations. */ 1046 /* Exclude any concurrent CPU-hotplug operations. */
1091 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1047 get_online_cpus();
1092 1048
1093 /* 1049 /*
1094 * Set the quiescent-state-needed bits in all the rcu_node 1050 * Set the quiescent-state-needed bits in all the rcu_node
1095 * structures for all currently online CPUs in breadth-first 1051 * structures for all currently online CPUs in breadth-first order,
1096 * order, starting from the root rcu_node structure. This 1052 * starting from the root rcu_node structure, relying on the layout
1097 * operation relies on the layout of the hierarchy within the 1053 * of the tree within the rsp->node[] array. Note that other CPUs
1098 * rsp->node[] array. Note that other CPUs will access only 1054 * will access only the leaves of the hierarchy, thus seeing that no
1099 * the leaves of the hierarchy, which still indicate that no
1100 * grace period is in progress, at least until the corresponding 1055 * grace period is in progress, at least until the corresponding
1101 * leaf node has been initialized. In addition, we have excluded 1056 * leaf node has been initialized. In addition, we have excluded
1102 * CPU-hotplug operations. 1057 * CPU-hotplug operations.
1103 * 1058 *
1104 * Note that the grace period cannot complete until we finish 1059 * The grace period cannot complete until the initialization
1105 * the initialization process, as there will be at least one 1060 * process finishes, because this kthread handles both.
1106 * qsmask bit set in the root node until that time, namely the
1107 * one corresponding to this CPU, due to the fact that we have
1108 * irqs disabled.
1109 */ 1061 */
1110 rcu_for_each_node_breadth_first(rsp, rnp) { 1062 rcu_for_each_node_breadth_first(rsp, rnp) {
1111 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1063 raw_spin_lock_irq(&rnp->lock);
1064 rdp = this_cpu_ptr(rsp->rda);
1112 rcu_preempt_check_blocked_tasks(rnp); 1065 rcu_preempt_check_blocked_tasks(rnp);
1113 rnp->qsmask = rnp->qsmaskinit; 1066 rnp->qsmask = rnp->qsmaskinit;
1114 rnp->gpnum = rsp->gpnum; 1067 rnp->gpnum = rsp->gpnum;
1068 WARN_ON_ONCE(rnp->completed != rsp->completed);
1115 rnp->completed = rsp->completed; 1069 rnp->completed = rsp->completed;
1116 if (rnp == rdp->mynode) 1070 if (rnp == rdp->mynode)
1117 rcu_start_gp_per_cpu(rsp, rnp, rdp); 1071 rcu_start_gp_per_cpu(rsp, rnp, rdp);
@@ -1119,37 +1073,54 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
1119 trace_rcu_grace_period_init(rsp->name, rnp->gpnum, 1073 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
1120 rnp->level, rnp->grplo, 1074 rnp->level, rnp->grplo,
1121 rnp->grphi, rnp->qsmask); 1075 rnp->grphi, rnp->qsmask);
1122 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1076 raw_spin_unlock_irq(&rnp->lock);
1077#ifdef CONFIG_PROVE_RCU_DELAY
1078 if ((random32() % (rcu_num_nodes * 8)) == 0)
1079 schedule_timeout_uninterruptible(2);
1080#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
1081 cond_resched();
1123 } 1082 }
1124 1083
1125 rnp = rcu_get_root(rsp); 1084 put_online_cpus();
1126 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1085 return 1;
1127 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
1128 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1129 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1130} 1086}
1131 1087
1132/* 1088/*
1133 * Report a full set of quiescent states to the specified rcu_state 1089 * Do one round of quiescent-state forcing.
1134 * data structure. This involves cleaning up after the prior grace
1135 * period and letting rcu_start_gp() start up the next grace period
1136 * if one is needed. Note that the caller must hold rnp->lock, as
1137 * required by rcu_start_gp(), which will release it.
1138 */ 1090 */
1139static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 1091int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1140 __releases(rcu_get_root(rsp)->lock)
1141{ 1092{
1142 unsigned long gp_duration; 1093 int fqs_state = fqs_state_in;
1143 struct rcu_node *rnp = rcu_get_root(rsp); 1094 struct rcu_node *rnp = rcu_get_root(rsp);
1144 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1145 1095
1146 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 1096 rsp->n_force_qs++;
1097 if (fqs_state == RCU_SAVE_DYNTICK) {
1098 /* Collect dyntick-idle snapshots. */
1099 force_qs_rnp(rsp, dyntick_save_progress_counter);
1100 fqs_state = RCU_FORCE_QS;
1101 } else {
1102 /* Handle dyntick-idle and offline CPUs. */
1103 force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
1104 }
1105 /* Clear flag to prevent immediate re-entry. */
1106 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
1107 raw_spin_lock_irq(&rnp->lock);
1108 rsp->gp_flags &= ~RCU_GP_FLAG_FQS;
1109 raw_spin_unlock_irq(&rnp->lock);
1110 }
1111 return fqs_state;
1112}
1147 1113
1148 /* 1114/*
1149 * Ensure that all grace-period and pre-grace-period activity 1115 * Clean up after the old grace period.
1150 * is seen before the assignment to rsp->completed. 1116 */
1151 */ 1117static void rcu_gp_cleanup(struct rcu_state *rsp)
1152 smp_mb(); /* See above block comment. */ 1118{
1119 unsigned long gp_duration;
1120 struct rcu_data *rdp;
1121 struct rcu_node *rnp = rcu_get_root(rsp);
1122
1123 raw_spin_lock_irq(&rnp->lock);
1153 gp_duration = jiffies - rsp->gp_start; 1124 gp_duration = jiffies - rsp->gp_start;
1154 if (gp_duration > rsp->gp_max) 1125 if (gp_duration > rsp->gp_max)
1155 rsp->gp_max = gp_duration; 1126 rsp->gp_max = gp_duration;
@@ -1161,35 +1132,149 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
1161 * they can do to advance the grace period. It is therefore 1132 * they can do to advance the grace period. It is therefore
1162 * safe for us to drop the lock in order to mark the grace 1133 * safe for us to drop the lock in order to mark the grace
1163 * period as completed in all of the rcu_node structures. 1134 * period as completed in all of the rcu_node structures.
1164 *
1165 * But if this CPU needs another grace period, it will take
1166 * care of this while initializing the next grace period.
1167 * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL
1168 * because the callbacks have not yet been advanced: Those
1169 * callbacks are waiting on the grace period that just now
1170 * completed.
1171 */ 1135 */
1172 if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { 1136 raw_spin_unlock_irq(&rnp->lock);
1173 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1174 1137
1175 /* 1138 /*
1176 * Propagate new ->completed value to rcu_node structures 1139 * Propagate new ->completed value to rcu_node structures so
1177 * so that other CPUs don't have to wait until the start 1140 * that other CPUs don't have to wait until the start of the next
1178 * of the next grace period to process their callbacks. 1141 * grace period to process their callbacks. This also avoids
1179 */ 1142 * some nasty RCU grace-period initialization races by forcing
1180 rcu_for_each_node_breadth_first(rsp, rnp) { 1143 * the end of the current grace period to be completely recorded in
1181 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1144 * all of the rcu_node structures before the beginning of the next
1182 rnp->completed = rsp->gpnum; 1145 * grace period is recorded in any of the rcu_node structures.
1183 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1146 */
1184 } 1147 rcu_for_each_node_breadth_first(rsp, rnp) {
1185 rnp = rcu_get_root(rsp); 1148 raw_spin_lock_irq(&rnp->lock);
1186 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1149 rnp->completed = rsp->gpnum;
1150 raw_spin_unlock_irq(&rnp->lock);
1151 cond_resched();
1187 } 1152 }
1153 rnp = rcu_get_root(rsp);
1154 raw_spin_lock_irq(&rnp->lock);
1188 1155
1189 rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ 1156 rsp->completed = rsp->gpnum; /* Declare grace period done. */
1190 trace_rcu_grace_period(rsp->name, rsp->completed, "end"); 1157 trace_rcu_grace_period(rsp->name, rsp->completed, "end");
1191 rsp->fqs_state = RCU_GP_IDLE; 1158 rsp->fqs_state = RCU_GP_IDLE;
1192 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 1159 rdp = this_cpu_ptr(rsp->rda);
1160 if (cpu_needs_another_gp(rsp, rdp))
1161 rsp->gp_flags = 1;
1162 raw_spin_unlock_irq(&rnp->lock);
1163}
1164
1165/*
1166 * Body of kthread that handles grace periods.
1167 */
1168static int __noreturn rcu_gp_kthread(void *arg)
1169{
1170 int fqs_state;
1171 unsigned long j;
1172 int ret;
1173 struct rcu_state *rsp = arg;
1174 struct rcu_node *rnp = rcu_get_root(rsp);
1175
1176 for (;;) {
1177
1178 /* Handle grace-period start. */
1179 for (;;) {
1180 wait_event_interruptible(rsp->gp_wq,
1181 rsp->gp_flags &
1182 RCU_GP_FLAG_INIT);
1183 if ((rsp->gp_flags & RCU_GP_FLAG_INIT) &&
1184 rcu_gp_init(rsp))
1185 break;
1186 cond_resched();
1187 flush_signals(current);
1188 }
1189
1190 /* Handle quiescent-state forcing. */
1191 fqs_state = RCU_SAVE_DYNTICK;
1192 j = jiffies_till_first_fqs;
1193 if (j > HZ) {
1194 j = HZ;
1195 jiffies_till_first_fqs = HZ;
1196 }
1197 for (;;) {
1198 rsp->jiffies_force_qs = jiffies + j;
1199 ret = wait_event_interruptible_timeout(rsp->gp_wq,
1200 (rsp->gp_flags & RCU_GP_FLAG_FQS) ||
1201 (!ACCESS_ONCE(rnp->qsmask) &&
1202 !rcu_preempt_blocked_readers_cgp(rnp)),
1203 j);
1204 /* If grace period done, leave loop. */
1205 if (!ACCESS_ONCE(rnp->qsmask) &&
1206 !rcu_preempt_blocked_readers_cgp(rnp))
1207 break;
1208 /* If time for quiescent-state forcing, do it. */
1209 if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) {
1210 fqs_state = rcu_gp_fqs(rsp, fqs_state);
1211 cond_resched();
1212 } else {
1213 /* Deal with stray signal. */
1214 cond_resched();
1215 flush_signals(current);
1216 }
1217 j = jiffies_till_next_fqs;
1218 if (j > HZ) {
1219 j = HZ;
1220 jiffies_till_next_fqs = HZ;
1221 } else if (j < 1) {
1222 j = 1;
1223 jiffies_till_next_fqs = 1;
1224 }
1225 }
1226
1227 /* Handle grace-period end. */
1228 rcu_gp_cleanup(rsp);
1229 }
1230}
1231
1232/*
1233 * Start a new RCU grace period if warranted, re-initializing the hierarchy
1234 * in preparation for detecting the next grace period. The caller must hold
1235 * the root node's ->lock, which is released before return. Hard irqs must
1236 * be disabled.
1237 *
1238 * Note that it is legal for a dying CPU (which is marked as offline) to
1239 * invoke this function. This can happen when the dying CPU reports its
1240 * quiescent state.
1241 */
1242static void
1243rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
1244 __releases(rcu_get_root(rsp)->lock)
1245{
1246 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1247 struct rcu_node *rnp = rcu_get_root(rsp);
1248
1249 if (!rsp->gp_kthread ||
1250 !cpu_needs_another_gp(rsp, rdp)) {
1251 /*
1252 * Either we have not yet spawned the grace-period
1253 * task or this CPU does not need another grace period.
1254 * Either way, don't start a new grace period.
1255 */
1256 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1257 return;
1258 }
1259
1260 rsp->gp_flags = RCU_GP_FLAG_INIT;
1261 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1262 wake_up(&rsp->gp_wq);
1263}
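
With the kthread in place, rcu_start_gp() above shrinks to a request: record RCU_GP_FLAG_INIT under the root rcu_node lock, drop the lock, and wake the kthread. Continuing the sketch above with the same invented names, the producer half is correspondingly small:

/* Illustrative producer side of the flag-and-wake handoff; not kernel code. */
static void start_grace_period(void)
{
	pthread_mutex_lock(&gp_lock);
	gp_flags |= FLAG_INIT;            /* rsp->gp_flags = RCU_GP_FLAG_INIT */
	pthread_mutex_unlock(&gp_lock);   /* release before waking, as above */
	pthread_cond_broadcast(&gp_wq);   /* wake_up(&rsp->gp_wq) */
}

Keeping the actual initialization in the kthread is what lets rcu_gp_init() use sleeping primitives such as cond_resched() while walking the rcu_node tree.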
1264
1265/*
1266 * Report a full set of quiescent states to the specified rcu_state
1267 * data structure. This involves cleaning up after the prior grace
1268 * period and letting rcu_start_gp() start up the next grace period
1269 * if one is needed. Note that the caller must hold rnp->lock, as
1270 * required by rcu_start_gp(), which will release it.
1271 */
1272static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
1273 __releases(rcu_get_root(rsp)->lock)
1274{
1275 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
1276 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
1277 wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */
1193} 1278}
1194 1279
1195/* 1280/*
@@ -1258,7 +1343,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
1258 * based on quiescent states detected in an earlier grace period! 1343 * based on quiescent states detected in an earlier grace period!
1259 */ 1344 */
1260static void 1345static void
1261rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp) 1346rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
1262{ 1347{
1263 unsigned long flags; 1348 unsigned long flags;
1264 unsigned long mask; 1349 unsigned long mask;
@@ -1266,7 +1351,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
1266 1351
1267 rnp = rdp->mynode; 1352 rnp = rdp->mynode;
1268 raw_spin_lock_irqsave(&rnp->lock, flags); 1353 raw_spin_lock_irqsave(&rnp->lock, flags);
1269 if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) { 1354 if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum ||
1355 rnp->completed == rnp->gpnum) {
1270 1356
1271 /* 1357 /*
1272 * The grace period in which this quiescent state was 1358 * The grace period in which this quiescent state was
@@ -1325,7 +1411,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1325 * Tell RCU we are done (but rcu_report_qs_rdp() will be the 1411 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
1326 * judge of that). 1412 * judge of that).
1327 */ 1413 */
1328 rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum); 1414 rcu_report_qs_rdp(rdp->cpu, rsp, rdp);
1329} 1415}
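
Because ->passed_quiesce_gpnum is gone, rcu_report_qs_rdp() above now decides whether a recorded quiescent state is stale purely from the per-CPU and per-node grace-period numbers. Reduced to a predicate (helper name and parameter names invented), the test from the hunk reads:

/* Illustrative staleness test distilled from the hunk above. */
#include <stdbool.h>

static bool quiescent_state_is_stale(bool passed_quiesce,
				     unsigned long rdp_gpnum,
				     unsigned long rnp_gpnum,
				     unsigned long rnp_completed)
{
	return !passed_quiesce ||           /* nothing was recorded */
	       rdp_gpnum != rnp_gpnum ||    /* recorded for an older GP */
	       rnp_completed == rnp_gpnum;  /* that GP has already ended */
}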
1330 1416
1331#ifdef CONFIG_HOTPLUG_CPU 1417#ifdef CONFIG_HOTPLUG_CPU
@@ -1390,17 +1476,6 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1390 int i; 1476 int i;
1391 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 1477 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
1392 1478
1393 /*
1394 * If there is an rcu_barrier() operation in progress, then
1395 * only the task doing that operation is permitted to adopt
1396 * callbacks. To do otherwise breaks rcu_barrier() and friends
1397 * by causing them to fail to wait for the callbacks in the
1398 * orphanage.
1399 */
1400 if (rsp->rcu_barrier_in_progress &&
1401 rsp->rcu_barrier_in_progress != current)
1402 return;
1403
1404 /* Do the accounting first. */ 1479 /* Do the accounting first. */
1405 rdp->qlen_lazy += rsp->qlen_lazy; 1480 rdp->qlen_lazy += rsp->qlen_lazy;
1406 rdp->qlen += rsp->qlen; 1481 rdp->qlen += rsp->qlen;
@@ -1455,9 +1530,8 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1455 * The CPU has been completely removed, and some other CPU is reporting 1530 * The CPU has been completely removed, and some other CPU is reporting
1456 * this fact from process context. Do the remainder of the cleanup, 1531 * this fact from process context. Do the remainder of the cleanup,
1457 * including orphaning the outgoing CPU's RCU callbacks, and also 1532 * including orphaning the outgoing CPU's RCU callbacks, and also
1458 * adopting them, if there is no _rcu_barrier() instance running. 1533 * adopting them. There can only be one CPU hotplug operation at a time,
1459 * There can only be one CPU hotplug operation at a time, so no other 1534 * so no other CPU can be attempting to update rcu_cpu_kthread_task.
1460 * CPU can be attempting to update rcu_cpu_kthread_task.
1461 */ 1535 */
1462static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) 1536static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1463{ 1537{
@@ -1468,8 +1542,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1468 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ 1542 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
1469 1543
1470 /* Adjust any no-longer-needed kthreads. */ 1544 /* Adjust any no-longer-needed kthreads. */
1471 rcu_stop_cpu_kthread(cpu); 1545 rcu_boost_kthread_setaffinity(rnp, -1);
1472 rcu_node_kthread_setaffinity(rnp, -1);
1473 1546
1474 /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ 1547 /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
1475 1548
@@ -1515,14 +1588,13 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1515 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, 1588 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
1516 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", 1589 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
1517 cpu, rdp->qlen, rdp->nxtlist); 1590 cpu, rdp->qlen, rdp->nxtlist);
1591 init_callback_list(rdp);
1592 /* Disallow further callbacks on this CPU. */
1593 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
1518} 1594}
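
The lines added above give rcu_cleanup_dead_cpu() a way to lock out late callbacks: init_callback_list() resets the list, and a NULL RCU_NEXT_TAIL pointer then acts as a sentinel that __call_rcu() checks, with rcu_init_percpu_data() clearing it again when the CPU returns. A compact standalone sketch of that sentinel, with invented names:

/* Illustrative NULL-tail sentinel for a per-CPU callback list; not kernel code. */
#include <stdbool.h>
#include <stddef.h>

struct cb { struct cb *next; };

struct cb_list {
	struct cb *head;
	struct cb **tail;      /* NULL means "no further callbacks allowed" */
};

static void cb_list_init(struct cb_list *l)   /* like init_callback_list() */
{
	l->head = NULL;
	l->tail = &l->head;
}

static void cb_list_close(struct cb_list *l)  /* like rcu_cleanup_dead_cpu() */
{
	l->tail = NULL;
}

static bool cb_enqueue(struct cb_list *l, struct cb *cb) /* like __call_rcu() */
{
	if (l->tail == NULL)
		return false;  /* offline CPU: refuse the callback */
	cb->next = NULL;
	*l->tail = cb;
	l->tail = &cb->next;
	return true;
}

In the kernel version the enqueue failure path is a WARN_ON_ONCE() plus a deliberately leaked callback rather than a return code, since reaching it already indicates a bug in the caller.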
1519 1595
1520#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1596#else /* #ifdef CONFIG_HOTPLUG_CPU */
1521 1597
1522static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1523{
1524}
1525
1526static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 1598static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1527{ 1599{
1528} 1600}
@@ -1687,6 +1759,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1687 struct rcu_node *rnp; 1759 struct rcu_node *rnp;
1688 1760
1689 rcu_for_each_leaf_node(rsp, rnp) { 1761 rcu_for_each_leaf_node(rsp, rnp) {
1762 cond_resched();
1690 mask = 0; 1763 mask = 0;
1691 raw_spin_lock_irqsave(&rnp->lock, flags); 1764 raw_spin_lock_irqsave(&rnp->lock, flags);
1692 if (!rcu_gp_in_progress(rsp)) { 1765 if (!rcu_gp_in_progress(rsp)) {
@@ -1723,72 +1796,39 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1723 * Force quiescent states on reluctant CPUs, and also detect which 1796 * Force quiescent states on reluctant CPUs, and also detect which
1724 * CPUs are in dyntick-idle mode. 1797 * CPUs are in dyntick-idle mode.
1725 */ 1798 */
1726static void force_quiescent_state(struct rcu_state *rsp, int relaxed) 1799static void force_quiescent_state(struct rcu_state *rsp)
1727{ 1800{
1728 unsigned long flags; 1801 unsigned long flags;
1729 struct rcu_node *rnp = rcu_get_root(rsp); 1802 bool ret;
1730 1803 struct rcu_node *rnp;
1731 trace_rcu_utilization("Start fqs"); 1804 struct rcu_node *rnp_old = NULL;
1732 if (!rcu_gp_in_progress(rsp)) { 1805
1733 trace_rcu_utilization("End fqs"); 1806 /* Funnel through hierarchy to reduce memory contention. */
1734 return; /* No grace period in progress, nothing to force. */ 1807 rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;
1735 } 1808 for (; rnp != NULL; rnp = rnp->parent) {
1736 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { 1809 ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
1737 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ 1810 !raw_spin_trylock(&rnp->fqslock);
1738 trace_rcu_utilization("End fqs"); 1811 if (rnp_old != NULL)
1739 return; /* Someone else is already on the job. */ 1812 raw_spin_unlock(&rnp_old->fqslock);
1740 } 1813 if (ret) {
1741 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) 1814 rsp->n_force_qs_lh++;
1742 goto unlock_fqs_ret; /* no emergency and done recently. */ 1815 return;
1743 rsp->n_force_qs++; 1816 }
1744 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 1817 rnp_old = rnp;
1745 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1746	if (!rcu_gp_in_progress(rsp)) {
1747 rsp->n_force_qs_ngp++;
1748 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1749 goto unlock_fqs_ret; /* no GP in progress, time updated. */
1750 }
1751 rsp->fqs_active = 1;
1752 switch (rsp->fqs_state) {
1753 case RCU_GP_IDLE:
1754 case RCU_GP_INIT:
1755
1756 break; /* grace period idle or initializing, ignore. */
1757
1758 case RCU_SAVE_DYNTICK:
1759
1760 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1761
1762 /* Record dyntick-idle state. */
1763 force_qs_rnp(rsp, dyntick_save_progress_counter);
1764 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1765 if (rcu_gp_in_progress(rsp))
1766 rsp->fqs_state = RCU_FORCE_QS;
1767 break;
1768
1769 case RCU_FORCE_QS:
1770
1771 /* Check dyntick-idle state, send IPI to laggarts. */
1772 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1773 force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
1774
1775 /* Leave state in case more forcing is required. */
1776
1777 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1778 break;
1779 } 1818 }
1780 rsp->fqs_active = 0; 1819 /* rnp_old == rcu_get_root(rsp), rnp == NULL. */
1781 if (rsp->fqs_need_gp) { 1820
1782 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ 1821 /* Reached the root of the rcu_node tree, acquire lock. */
1783 rsp->fqs_need_gp = 0; 1822 raw_spin_lock_irqsave(&rnp_old->lock, flags);
1784 rcu_start_gp(rsp, flags); /* releases rnp->lock */ 1823 raw_spin_unlock(&rnp_old->fqslock);
1785 trace_rcu_utilization("End fqs"); 1824 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
1786 return; 1825 rsp->n_force_qs_lh++;
1826 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
1827 return; /* Someone beat us to it. */
1787 } 1828 }
1788 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 1829 rsp->gp_flags |= RCU_GP_FLAG_FQS;
1789unlock_fqs_ret: 1830 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
1790 raw_spin_unlock_irqrestore(&rsp->fqslock, flags); 1831 wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */
1791 trace_rcu_utilization("End fqs");
1792} 1832}
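
The rewritten force_quiescent_state() above replaces the single global ->fqslock with funnel locking: each caller trylocks its leaf rcu_node's ->fqslock and climbs toward the root hand over hand, dropping out as soon as a trylock fails or someone has already set RCU_GP_FLAG_FQS, so at most one caller per subtree keeps climbing. A userspace sketch of the funnel idea (two fields and one flag, all names invented):

/* Illustrative funnel-locking sketch; not the kernel implementation. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct funnel_node {
	pthread_mutex_t fqslock;
	struct funnel_node *parent;       /* NULL at the root */
};

static atomic_bool fqs_requested;         /* stands in for RCU_GP_FLAG_FQS */

static void funnel_force_qs(struct funnel_node *leaf)
{
	struct funnel_node *np, *np_old = NULL;

	for (np = leaf; np != NULL; np = np->parent) {
		bool lose = atomic_load(&fqs_requested) ||
			    pthread_mutex_trylock(&np->fqslock) != 0;
		if (np_old != NULL)
			pthread_mutex_unlock(&np_old->fqslock);  /* hand over hand */
		if (lose)
			return;           /* another caller will do the work */
		np_old = np;
	}
	/* Only the winner reaches the root with its fqslock still held. */
	atomic_store(&fqs_requested, true);
	pthread_mutex_unlock(&np_old->fqslock);
	/* The kernel version re-checks the flag under the root rcu_node's
	 * ->lock and then wakes the grace-period kthread. */
}

Per-level trylocks keep contention local: on a large machine most racing callers collide near the leaves and give up without ever touching the root's cache line.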
1793 1833
1794/* 1834/*
@@ -1805,13 +1845,6 @@ __rcu_process_callbacks(struct rcu_state *rsp)
1805 WARN_ON_ONCE(rdp->beenonline == 0); 1845 WARN_ON_ONCE(rdp->beenonline == 0);
1806 1846
1807 /* 1847 /*
1808 * If an RCU GP has gone long enough, go check for dyntick
1809 * idle CPUs and, if needed, send resched IPIs.
1810 */
1811 if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1812 force_quiescent_state(rsp, 1);
1813
1814 /*
1815 * Advance callbacks in response to end of earlier grace 1848 * Advance callbacks in response to end of earlier grace
1816 * period that some other CPU ended. 1849 * period that some other CPU ended.
1817 */ 1850 */
@@ -1838,6 +1871,8 @@ static void rcu_process_callbacks(struct softirq_action *unused)
1838{ 1871{
1839 struct rcu_state *rsp; 1872 struct rcu_state *rsp;
1840 1873
1874 if (cpu_is_offline(smp_processor_id()))
1875 return;
1841 trace_rcu_utilization("Start RCU core"); 1876 trace_rcu_utilization("Start RCU core");
1842 for_each_rcu_flavor(rsp) 1877 for_each_rcu_flavor(rsp)
1843 __rcu_process_callbacks(rsp); 1878 __rcu_process_callbacks(rsp);
@@ -1909,12 +1944,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
1909 rdp->blimit = LONG_MAX; 1944 rdp->blimit = LONG_MAX;
1910 if (rsp->n_force_qs == rdp->n_force_qs_snap && 1945 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1911 *rdp->nxttail[RCU_DONE_TAIL] != head) 1946 *rdp->nxttail[RCU_DONE_TAIL] != head)
1912 force_quiescent_state(rsp, 0); 1947 force_quiescent_state(rsp);
1913 rdp->n_force_qs_snap = rsp->n_force_qs; 1948 rdp->n_force_qs_snap = rsp->n_force_qs;
1914 rdp->qlen_last_fqs_check = rdp->qlen; 1949 rdp->qlen_last_fqs_check = rdp->qlen;
1915 } 1950 }
1916 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) 1951 }
1917 force_quiescent_state(rsp, 1);
1918} 1952}
1919 1953
1920static void 1954static void
@@ -1929,8 +1963,6 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1929 head->func = func; 1963 head->func = func;
1930 head->next = NULL; 1964 head->next = NULL;
1931 1965
1932 smp_mb(); /* Ensure RCU update seen before callback registry. */
1933
1934 /* 1966 /*
1935 * Opportunistically note grace-period endings and beginnings. 1967 * Opportunistically note grace-period endings and beginnings.
1936 * Note that we might see a beginning right after we see an 1968 * Note that we might see a beginning right after we see an
@@ -1941,6 +1973,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1941 rdp = this_cpu_ptr(rsp->rda); 1973 rdp = this_cpu_ptr(rsp->rda);
1942 1974
1943 /* Add the callback to our list. */ 1975 /* Add the callback to our list. */
1976 if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL)) {
1977 /* _call_rcu() is illegal on offline CPU; leak the callback. */
1978 WARN_ON_ONCE(1);
1979 local_irq_restore(flags);
1980 return;
1981 }
1944 ACCESS_ONCE(rdp->qlen)++; 1982 ACCESS_ONCE(rdp->qlen)++;
1945 if (lazy) 1983 if (lazy)
1946 rdp->qlen_lazy++; 1984 rdp->qlen_lazy++;
@@ -2195,17 +2233,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
2195 /* Is the RCU core waiting for a quiescent state from this CPU? */ 2233 /* Is the RCU core waiting for a quiescent state from this CPU? */
2196 if (rcu_scheduler_fully_active && 2234 if (rcu_scheduler_fully_active &&
2197 rdp->qs_pending && !rdp->passed_quiesce) { 2235 rdp->qs_pending && !rdp->passed_quiesce) {
2198
2199 /*
2200 * If force_quiescent_state() coming soon and this CPU
2201 * needs a quiescent state, and this is either RCU-sched
2202 * or RCU-bh, force a local reschedule.
2203 */
2204 rdp->n_rp_qs_pending++; 2236 rdp->n_rp_qs_pending++;
2205 if (!rdp->preemptible &&
2206 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
2207 jiffies))
2208 set_need_resched();
2209 } else if (rdp->qs_pending && rdp->passed_quiesce) { 2237 } else if (rdp->qs_pending && rdp->passed_quiesce) {
2210 rdp->n_rp_report_qs++; 2238 rdp->n_rp_report_qs++;
2211 return 1; 2239 return 1;
@@ -2235,13 +2263,6 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
2235 return 1; 2263 return 1;
2236 } 2264 }
2237 2265
2238 /* Has an RCU GP gone long enough to send resched IPIs &c? */
2239 if (rcu_gp_in_progress(rsp) &&
2240 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) {
2241 rdp->n_rp_need_fqs++;
2242 return 1;
2243 }
2244
2245 /* nothing to do */ 2266 /* nothing to do */
2246 rdp->n_rp_need_nothing++; 2267 rdp->n_rp_need_nothing++;
2247 return 0; 2268 return 0;
@@ -2326,13 +2347,10 @@ static void rcu_barrier_func(void *type)
2326static void _rcu_barrier(struct rcu_state *rsp) 2347static void _rcu_barrier(struct rcu_state *rsp)
2327{ 2348{
2328 int cpu; 2349 int cpu;
2329 unsigned long flags;
2330 struct rcu_data *rdp; 2350 struct rcu_data *rdp;
2331 struct rcu_data rd;
2332 unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done); 2351 unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
2333 unsigned long snap_done; 2352 unsigned long snap_done;
2334 2353
2335 init_rcu_head_on_stack(&rd.barrier_head);
2336 _rcu_barrier_trace(rsp, "Begin", -1, snap); 2354 _rcu_barrier_trace(rsp, "Begin", -1, snap);
2337 2355
2338 /* Take mutex to serialize concurrent rcu_barrier() requests. */ 2356 /* Take mutex to serialize concurrent rcu_barrier() requests. */
@@ -2372,70 +2390,30 @@ static void _rcu_barrier(struct rcu_state *rsp)
2372 /* 2390 /*
2373 * Initialize the count to one rather than to zero in order to 2391 * Initialize the count to one rather than to zero in order to
2374 * avoid a too-soon return to zero in case of a short grace period 2392 * avoid a too-soon return to zero in case of a short grace period
2375 * (or preemption of this task). Also flag this task as doing 2393 * (or preemption of this task). Exclude CPU-hotplug operations
2376 * an rcu_barrier(). This will prevent anyone else from adopting 2394 * to ensure that no offline CPU has callbacks queued.
2377 * orphaned callbacks, which could cause otherwise failure if a
2378 * orphaned callbacks, which could otherwise cause failure if a
2379 * consider the following sequence of events:
2380 *
2381 * 1. We cause CPU 0 to post an rcu_barrier_callback() callback.
2382 * 2. CPU 1 goes offline, orphaning its callbacks.
2383 * 3. CPU 0 adopts CPU 1's orphaned callbacks.
2384 * 4. CPU 1 comes back online.
2385 * 5. We cause CPU 1 to post an rcu_barrier_callback() callback.
2386 * 6. Both rcu_barrier_callback() callbacks are invoked, awakening
2387 * us -- but before CPU 1's orphaned callbacks are invoked!!!
2388 */ 2395 */
2389 init_completion(&rsp->barrier_completion); 2396 init_completion(&rsp->barrier_completion);
2390 atomic_set(&rsp->barrier_cpu_count, 1); 2397 atomic_set(&rsp->barrier_cpu_count, 1);
2391 raw_spin_lock_irqsave(&rsp->onofflock, flags); 2398 get_online_cpus();
2392 rsp->rcu_barrier_in_progress = current;
2393 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2394 2399
2395 /* 2400 /*
2396 * Force every CPU with callbacks to register a new callback 2401 * Force each CPU with callbacks to register a new callback.
2397 * that will tell us when all the preceding callbacks have 2402 * When that callback is invoked, we will know that all of the
2398 * been invoked. If an offline CPU has callbacks, wait for 2403 * corresponding CPU's preceding callbacks have been invoked.
2399 * it to either come back online or to finish orphaning those
2400 * callbacks.
2401 */ 2404 */
2402 for_each_possible_cpu(cpu) { 2405 for_each_online_cpu(cpu) {
2403 preempt_disable();
2404 rdp = per_cpu_ptr(rsp->rda, cpu); 2406 rdp = per_cpu_ptr(rsp->rda, cpu);
2405 if (cpu_is_offline(cpu)) { 2407 if (ACCESS_ONCE(rdp->qlen)) {
2406 _rcu_barrier_trace(rsp, "Offline", cpu,
2407 rsp->n_barrier_done);
2408 preempt_enable();
2409 while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen))
2410 schedule_timeout_interruptible(1);
2411 } else if (ACCESS_ONCE(rdp->qlen)) {
2412 _rcu_barrier_trace(rsp, "OnlineQ", cpu, 2408 _rcu_barrier_trace(rsp, "OnlineQ", cpu,
2413 rsp->n_barrier_done); 2409 rsp->n_barrier_done);
2414 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); 2410 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
2415 preempt_enable();
2416 } else { 2411 } else {
2417 _rcu_barrier_trace(rsp, "OnlineNQ", cpu, 2412 _rcu_barrier_trace(rsp, "OnlineNQ", cpu,
2418 rsp->n_barrier_done); 2413 rsp->n_barrier_done);
2419 preempt_enable();
2420 } 2414 }
2421 } 2415 }
2422 2416 put_online_cpus();
2423 /*
2424 * Now that all online CPUs have rcu_barrier_callback() callbacks
2425 * posted, we can adopt all of the orphaned callbacks and place
2426 * an rcu_barrier_callback() callback after them. When that is done,
2427 * we are guaranteed to have an rcu_barrier_callback() callback
2428 * following every callback that could possibly have been
2429 * registered before _rcu_barrier() was called.
2430 */
2431 raw_spin_lock_irqsave(&rsp->onofflock, flags);
2432 rcu_adopt_orphan_cbs(rsp);
2433 rsp->rcu_barrier_in_progress = NULL;
2434 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2435 atomic_inc(&rsp->barrier_cpu_count);
2436 smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */
2437 rd.rsp = rsp;
2438 rsp->call(&rd.barrier_head, rcu_barrier_callback);
2439 2417
2440 /* 2418 /*
2441 * Now that we have an rcu_barrier_callback() callback on each 2419 * Now that we have an rcu_barrier_callback() callback on each
@@ -2456,8 +2434,6 @@ static void _rcu_barrier(struct rcu_state *rsp)
2456 2434
2457 /* Other rcu_barrier() invocations can now safely proceed. */ 2435 /* Other rcu_barrier() invocations can now safely proceed. */
2458 mutex_unlock(&rsp->barrier_mutex); 2436 mutex_unlock(&rsp->barrier_mutex);
2459
2460 destroy_rcu_head_on_stack(&rd.barrier_head);
2461} 2437}
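
The slimmed-down _rcu_barrier() above relies on two ingredients visible in these hunks: get_online_cpus() guarantees that no offline CPU holds callbacks, so for_each_online_cpu() suffices, and the completion count starts at one so the barrier cannot fire before all per-CPU callbacks have been posted. The counting pattern in userspace terms (invented names, pthreads standing in for completions):

/* Illustrative sketch of the _rcu_barrier() counting pattern; not kernel code. */
#include <pthread.h>
#include <stdatomic.h>

static atomic_int barrier_count;
static pthread_mutex_t bmtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t bdone = PTHREAD_COND_INITIALIZER;

static void barrier_callback(void)       /* runs once per posted callback */
{
	if (atomic_fetch_sub(&barrier_count, 1) == 1) {
		pthread_mutex_lock(&bmtx);
		pthread_cond_broadcast(&bdone);  /* complete(&barrier_completion) */
		pthread_mutex_unlock(&bmtx);
	}
}

static void barrier_wait(int ncpus_with_callbacks,
			 void (*post_cb_on_cpu)(int cpu))
{
	atomic_store(&barrier_count, 1);     /* the "extra" initial reference */
	for (int cpu = 0; cpu < ncpus_with_callbacks; cpu++) {
		atomic_fetch_add(&barrier_count, 1);
		post_cb_on_cpu(cpu);         /* smp_call_function_single() */
	}
	barrier_callback();                  /* drop the initial reference */
	pthread_mutex_lock(&bmtx);
	while (atomic_load(&barrier_count) != 0)
		pthread_cond_wait(&bdone, &bmtx);
	pthread_mutex_unlock(&bmtx);
}

Dropping the initial reference only after the posting loop is what closes the race in which a short grace period finishes the first few callbacks while later CPUs are still being visited.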
2462 2438
2463/** 2439/**
@@ -2523,6 +2499,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2523 rdp->qlen_last_fqs_check = 0; 2499 rdp->qlen_last_fqs_check = 0;
2524 rdp->n_force_qs_snap = rsp->n_force_qs; 2500 rdp->n_force_qs_snap = rsp->n_force_qs;
2525 rdp->blimit = blimit; 2501 rdp->blimit = blimit;
2502 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
2526 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 2503 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
2527 atomic_set(&rdp->dynticks->dynticks, 2504 atomic_set(&rdp->dynticks->dynticks,
2528 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); 2505 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
@@ -2555,7 +2532,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2555 rdp->completed = rnp->completed; 2532 rdp->completed = rnp->completed;
2556 rdp->passed_quiesce = 0; 2533 rdp->passed_quiesce = 0;
2557 rdp->qs_pending = 0; 2534 rdp->qs_pending = 0;
2558 rdp->passed_quiesce_gpnum = rnp->gpnum - 1;
2559 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); 2535 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl");
2560 } 2536 }
2561 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ 2537 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
@@ -2594,12 +2570,10 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2594 break; 2570 break;
2595 case CPU_ONLINE: 2571 case CPU_ONLINE:
2596 case CPU_DOWN_FAILED: 2572 case CPU_DOWN_FAILED:
2597 rcu_node_kthread_setaffinity(rnp, -1); 2573 rcu_boost_kthread_setaffinity(rnp, -1);
2598 rcu_cpu_kthread_setrt(cpu, 1);
2599 break; 2574 break;
2600 case CPU_DOWN_PREPARE: 2575 case CPU_DOWN_PREPARE:
2601 rcu_node_kthread_setaffinity(rnp, cpu); 2576 rcu_boost_kthread_setaffinity(rnp, cpu);
2602 rcu_cpu_kthread_setrt(cpu, 0);
2603 break; 2577 break;
2604 case CPU_DYING: 2578 case CPU_DYING:
2605 case CPU_DYING_FROZEN: 2579 case CPU_DYING_FROZEN:
@@ -2627,6 +2601,28 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2627} 2601}
2628 2602
2629/* 2603/*
2604 * Spawn the kthread that handles this RCU flavor's grace periods.
2605 */
2606static int __init rcu_spawn_gp_kthread(void)
2607{
2608 unsigned long flags;
2609 struct rcu_node *rnp;
2610 struct rcu_state *rsp;
2611 struct task_struct *t;
2612
2613 for_each_rcu_flavor(rsp) {
2614 t = kthread_run(rcu_gp_kthread, rsp, rsp->name);
2615 BUG_ON(IS_ERR(t));
2616 rnp = rcu_get_root(rsp);
2617 raw_spin_lock_irqsave(&rnp->lock, flags);
2618 rsp->gp_kthread = t;
2619 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2620 }
2621 return 0;
2622}
2623early_initcall(rcu_spawn_gp_kthread);
2624
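
rcu_spawn_gp_kthread() above creates one grace-period kthread per RCU flavor at early_initcall() time and publishes it under the root rcu_node lock, so rcu_start_gp() can test ->gp_kthread to decide whether the wake-up path is usable yet. A minimal userspace analogue of "spawn one worker per flavor and record its handle", with invented names:

/* Illustrative per-flavor worker spawning; not kernel code. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct flavor {
	const char *name;
	pthread_t worker;          /* like rsp->gp_kthread */
};

static void *worker_fn(void *arg)
{
	struct flavor *f = arg;
	printf("worker for %s running\n", f->name);
	return NULL;
}

static void spawn_workers(struct flavor *flavors, int n)
{
	for (int i = 0; i < n; i++)
		if (pthread_create(&flavors[i].worker, NULL, worker_fn,
				   &flavors[i]) != 0)
			abort();   /* the kernel uses BUG_ON(IS_ERR(t)) */
}

int main(void)
{
	struct flavor flavors[] = { { .name = "rcu_sched" }, { .name = "rcu_bh" } };

	spawn_workers(flavors, 2);
	for (int i = 0; i < 2; i++)
		pthread_join(flavors[i].worker, NULL);
	return 0;
}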
2625/*
2630 * This function is invoked towards the end of the scheduler's initialization 2626 * This function is invoked towards the end of the scheduler's initialization
2631 * process. Before this is called, the idle task might contain 2627 * process. Before this is called, the idle task might contain
2632 * RCU read-side critical sections (during which time, this idle 2628 * RCU read-side critical sections (during which time, this idle
@@ -2661,7 +2657,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
2661 int cprv; 2657 int cprv;
2662 int i; 2658 int i;
2663 2659
2664 cprv = NR_CPUS; 2660 cprv = nr_cpu_ids;
2665 for (i = rcu_num_lvls - 1; i >= 0; i--) { 2661 for (i = rcu_num_lvls - 1; i >= 0; i--) {
2666 ccur = rsp->levelcnt[i]; 2662 ccur = rsp->levelcnt[i];
2667 rsp->levelspread[i] = (cprv + ccur - 1) / ccur; 2663 rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
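
The one-line change in the rcu_init_levelspread() hunk above seeds the computation with nr_cpu_ids instead of NR_CPUS, which matters once rcu_init_geometry() shrinks the tree to the booted hardware. The computation itself is a bottom-up ceiling division, shown here as a self-contained example with arbitrarily chosen values:

/* Illustrative level-spread computation; values chosen for demonstration. */
#include <stdio.h>

static void init_levelspread(const int *levelcnt, int *levelspread,
			     int nlevels, int ncpus)
{
	int cprv = ncpus;                       /* was NR_CPUS, now nr_cpu_ids */

	for (int i = nlevels - 1; i >= 0; i--) {
		int ccur = levelcnt[i];
		levelspread[i] = (cprv + ccur - 1) / ccur;  /* ceiling division */
		cprv = ccur;
	}
}

int main(void)
{
	int levelcnt[2] = { 1, 4 };             /* root level, then 4 leaf nodes */
	int levelspread[2];

	init_levelspread(levelcnt, levelspread, 2, 60);
	printf("root fanout %d, leaf fanout %d\n",
	       levelspread[0], levelspread[1]); /* prints 4 and 15 */
	return 0;
}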
@@ -2676,10 +2672,14 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
2676static void __init rcu_init_one(struct rcu_state *rsp, 2672static void __init rcu_init_one(struct rcu_state *rsp,
2677 struct rcu_data __percpu *rda) 2673 struct rcu_data __percpu *rda)
2678{ 2674{
2679 static char *buf[] = { "rcu_node_level_0", 2675 static char *buf[] = { "rcu_node_0",
2680 "rcu_node_level_1", 2676 "rcu_node_1",
2681 "rcu_node_level_2", 2677 "rcu_node_2",
2682 "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */ 2678 "rcu_node_3" }; /* Match MAX_RCU_LVLS */
2679 static char *fqs[] = { "rcu_node_fqs_0",
2680 "rcu_node_fqs_1",
2681 "rcu_node_fqs_2",
2682 "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
2683 int cpustride = 1; 2683 int cpustride = 1;
2684 int i; 2684 int i;
2685 int j; 2685 int j;
@@ -2704,7 +2704,11 @@ static void __init rcu_init_one(struct rcu_state *rsp,
2704 raw_spin_lock_init(&rnp->lock); 2704 raw_spin_lock_init(&rnp->lock);
2705 lockdep_set_class_and_name(&rnp->lock, 2705 lockdep_set_class_and_name(&rnp->lock,
2706 &rcu_node_class[i], buf[i]); 2706 &rcu_node_class[i], buf[i]);
2707 rnp->gpnum = 0; 2707 raw_spin_lock_init(&rnp->fqslock);
2708 lockdep_set_class_and_name(&rnp->fqslock,
2709 &rcu_fqs_class[i], fqs[i]);
2710 rnp->gpnum = rsp->gpnum;
2711 rnp->completed = rsp->completed;
2708 rnp->qsmask = 0; 2712 rnp->qsmask = 0;
2709 rnp->qsmaskinit = 0; 2713 rnp->qsmaskinit = 0;
2710 rnp->grplo = j * cpustride; 2714 rnp->grplo = j * cpustride;
@@ -2727,6 +2731,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
2727 } 2731 }
2728 2732
2729 rsp->rda = rda; 2733 rsp->rda = rda;
2734 init_waitqueue_head(&rsp->gp_wq);
2730 rnp = rsp->level[rcu_num_lvls - 1]; 2735 rnp = rsp->level[rcu_num_lvls - 1];
2731 for_each_possible_cpu(i) { 2736 for_each_possible_cpu(i) {
2732 while (i > rnp->grphi) 2737 while (i > rnp->grphi)
@@ -2750,7 +2755,8 @@ static void __init rcu_init_geometry(void)
2750 int rcu_capacity[MAX_RCU_LVLS + 1]; 2755 int rcu_capacity[MAX_RCU_LVLS + 1];
2751 2756
2752 /* If the compile-time values are accurate, just leave. */ 2757 /* If the compile-time values are accurate, just leave. */
2753 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF) 2758 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&
2759 nr_cpu_ids == NR_CPUS)
2754 return; 2760 return;
2755 2761
2756 /* 2762 /*
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4d29169f2124..7576fd4d8ce6 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -196,12 +196,7 @@ struct rcu_node {
196 /* Refused to boost: not sure why, though. */ 196 /* Refused to boost: not sure why, though. */
197 /* This can happen due to race conditions. */ 197 /* This can happen due to race conditions. */
198#endif /* #ifdef CONFIG_RCU_BOOST */ 198#endif /* #ifdef CONFIG_RCU_BOOST */
199 struct task_struct *node_kthread_task; 199 raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
200 /* kthread that takes care of this rcu_node */
201 /* structure, for example, awakening the */
202 /* per-CPU kthreads as needed. */
203 unsigned int node_kthread_status;
204 /* State of node_kthread_task for tracing. */
205} ____cacheline_internodealigned_in_smp; 200} ____cacheline_internodealigned_in_smp;
206 201
207/* 202/*
@@ -245,8 +240,6 @@ struct rcu_data {
245 /* in order to detect GP end. */ 240 /* in order to detect GP end. */
246 unsigned long gpnum; /* Highest gp number that this CPU */ 241 unsigned long gpnum; /* Highest gp number that this CPU */
247 /* is aware of having started. */ 242 /* is aware of having started. */
248 unsigned long passed_quiesce_gpnum;
249 /* gpnum at time of quiescent state. */
250 bool passed_quiesce; /* User-mode/idle loop etc. */ 243 bool passed_quiesce; /* User-mode/idle loop etc. */
251 bool qs_pending; /* Core waits for quiesc state. */ 244 bool qs_pending; /* Core waits for quiesc state. */
252 bool beenonline; /* CPU online at least once. */ 245 bool beenonline; /* CPU online at least once. */
@@ -312,11 +305,13 @@ struct rcu_data {
312 unsigned long n_rp_cpu_needs_gp; 305 unsigned long n_rp_cpu_needs_gp;
313 unsigned long n_rp_gp_completed; 306 unsigned long n_rp_gp_completed;
314 unsigned long n_rp_gp_started; 307 unsigned long n_rp_gp_started;
315 unsigned long n_rp_need_fqs;
316 unsigned long n_rp_need_nothing; 308 unsigned long n_rp_need_nothing;
317 309
318 /* 6) _rcu_barrier() callback. */ 310 /* 6) _rcu_barrier() and OOM callbacks. */
319 struct rcu_head barrier_head; 311 struct rcu_head barrier_head;
312#ifdef CONFIG_RCU_FAST_NO_HZ
313 struct rcu_head oom_head;
314#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
320 315
321 int cpu; 316 int cpu;
322 struct rcu_state *rsp; 317 struct rcu_state *rsp;
@@ -375,20 +370,17 @@ struct rcu_state {
375 370
376 u8 fqs_state ____cacheline_internodealigned_in_smp; 371 u8 fqs_state ____cacheline_internodealigned_in_smp;
377 /* Force QS state. */ 372 /* Force QS state. */
378 u8 fqs_active; /* force_quiescent_state() */
379 /* is running. */
380 u8 fqs_need_gp; /* A CPU was prevented from */
381 /* starting a new grace */
382 /* period because */
383 /* force_quiescent_state() */
384 /* was running. */
385 u8 boost; /* Subject to priority boost. */ 373 u8 boost; /* Subject to priority boost. */
386 unsigned long gpnum; /* Current gp number. */ 374 unsigned long gpnum; /* Current gp number. */
387 unsigned long completed; /* # of last completed gp. */ 375 unsigned long completed; /* # of last completed gp. */
376 struct task_struct *gp_kthread; /* Task for grace periods. */
377 wait_queue_head_t gp_wq; /* Where GP task waits. */
378 int gp_flags; /* Commands for GP task. */
388 379
389 /* End of fields guarded by root rcu_node's lock. */ 380 /* End of fields guarded by root rcu_node's lock. */
390 381
391 raw_spinlock_t onofflock; /* exclude on/offline and */ 382 raw_spinlock_t onofflock ____cacheline_internodealigned_in_smp;
383 /* exclude on/offline and */
392 /* starting new GP. */ 384 /* starting new GP. */
393 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ 385 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */
394 /* need a grace period. */ 386 /* need a grace period. */
@@ -398,16 +390,11 @@ struct rcu_state {
398 struct rcu_head **orphan_donetail; /* Tail of above. */ 390 struct rcu_head **orphan_donetail; /* Tail of above. */
399 long qlen_lazy; /* Number of lazy callbacks. */ 391 long qlen_lazy; /* Number of lazy callbacks. */
400 long qlen; /* Total number of callbacks. */ 392 long qlen; /* Total number of callbacks. */
401 struct task_struct *rcu_barrier_in_progress;
402 /* Task doing rcu_barrier(), */
403 /* or NULL if no barrier. */
404 struct mutex barrier_mutex; /* Guards barrier fields. */ 393 struct mutex barrier_mutex; /* Guards barrier fields. */
405 atomic_t barrier_cpu_count; /* # CPUs waiting on. */ 394 atomic_t barrier_cpu_count; /* # CPUs waiting on. */
406 struct completion barrier_completion; /* Wake at barrier end. */ 395 struct completion barrier_completion; /* Wake at barrier end. */
407 unsigned long n_barrier_done; /* ++ at start and end of */ 396 unsigned long n_barrier_done; /* ++ at start and end of */
408 /* _rcu_barrier(). */ 397 /* _rcu_barrier(). */
409 raw_spinlock_t fqslock; /* Only one task forcing */
410 /* quiescent states. */
411 unsigned long jiffies_force_qs; /* Time at which to invoke */ 398 unsigned long jiffies_force_qs; /* Time at which to invoke */
412 /* force_quiescent_state(). */ 399 /* force_quiescent_state(). */
413 unsigned long n_force_qs; /* Number of calls to */ 400 unsigned long n_force_qs; /* Number of calls to */
@@ -426,6 +413,10 @@ struct rcu_state {
426 struct list_head flavors; /* List of RCU flavors. */ 413 struct list_head flavors; /* List of RCU flavors. */
427}; 414};
428 415
416/* Values for rcu_state structure's gp_flags field. */
417#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */
418#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */
419
429extern struct list_head rcu_struct_flavors; 420extern struct list_head rcu_struct_flavors;
430#define for_each_rcu_flavor(rsp) \ 421#define for_each_rcu_flavor(rsp) \
431 list_for_each_entry((rsp), &rcu_struct_flavors, flavors) 422 list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
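
The rcu_state additions above (->gp_kthread, ->gp_wq, ->gp_flags) together with the two flag bits are the whole kthread interface: grace-period starters set RCU_GP_FLAG_INIT, force_quiescent_state() ORs in RCU_GP_FLAG_FQS, and the kthread tests and clears them under the root lock. A tiny self-contained reminder of how the bits compose (flag values copied from the hunk, the rest invented):

/* Tiny demonstration of the ->gp_flags bit protocol; not kernel code. */
#include <stdio.h>

#define RCU_GP_FLAG_INIT 0x1   /* need grace-period initialization */
#define RCU_GP_FLAG_FQS  0x2   /* need quiescent-state forcing */

int main(void)
{
	int gp_flags = 0;

	gp_flags = RCU_GP_FLAG_INIT;            /* rcu_start_gp() */
	gp_flags |= RCU_GP_FLAG_FQS;            /* force_quiescent_state() */

	if (gp_flags & RCU_GP_FLAG_INIT)
		puts("kthread: run grace-period initialization");
	if (gp_flags & RCU_GP_FLAG_FQS) {
		gp_flags &= ~RCU_GP_FLAG_FQS;   /* consumed by the FQS pass */
		puts("kthread: force quiescent states");
	}
	return 0;
}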
@@ -468,7 +459,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
468#ifdef CONFIG_HOTPLUG_CPU 459#ifdef CONFIG_HOTPLUG_CPU
469static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 460static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
470 unsigned long flags); 461 unsigned long flags);
471static void rcu_stop_cpu_kthread(int cpu);
472#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 462#endif /* #ifdef CONFIG_HOTPLUG_CPU */
473static void rcu_print_detail_task_stall(struct rcu_state *rsp); 463static void rcu_print_detail_task_stall(struct rcu_state *rsp);
474static int rcu_print_task_stall(struct rcu_node *rnp); 464static int rcu_print_task_stall(struct rcu_node *rnp);
@@ -491,15 +481,9 @@ static void invoke_rcu_callbacks_kthread(void);
491static bool rcu_is_callbacks_kthread(void); 481static bool rcu_is_callbacks_kthread(void);
492#ifdef CONFIG_RCU_BOOST 482#ifdef CONFIG_RCU_BOOST
493static void rcu_preempt_do_callbacks(void); 483static void rcu_preempt_do_callbacks(void);
494static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
495 cpumask_var_t cm);
496static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 484static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
497 struct rcu_node *rnp, 485 struct rcu_node *rnp);
498 int rnp_index);
499static void invoke_rcu_node_kthread(struct rcu_node *rnp);
500static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
501#endif /* #ifdef CONFIG_RCU_BOOST */ 486#endif /* #ifdef CONFIG_RCU_BOOST */
502static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
503static void __cpuinit rcu_prepare_kthreads(int cpu); 487static void __cpuinit rcu_prepare_kthreads(int cpu);
504static void rcu_prepare_for_idle_init(int cpu); 488static void rcu_prepare_for_idle_init(int cpu);
505static void rcu_cleanup_after_idle(int cpu); 489static void rcu_cleanup_after_idle(int cpu);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 7f3244c0df01..9c71c1b18e03 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,8 @@
25 */ 25 */
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/oom.h>
29#include <linux/smpboot.h>
28 30
29#define RCU_KTHREAD_PRIO 1 31#define RCU_KTHREAD_PRIO 1
30 32
@@ -118,7 +120,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
118 */ 120 */
119void rcu_force_quiescent_state(void) 121void rcu_force_quiescent_state(void)
120{ 122{
121 force_quiescent_state(&rcu_preempt_state, 0); 123 force_quiescent_state(&rcu_preempt_state);
122} 124}
123EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 125EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
124 126
@@ -136,8 +138,6 @@ static void rcu_preempt_qs(int cpu)
136{ 138{
137 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 139 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
138 140
139 rdp->passed_quiesce_gpnum = rdp->gpnum;
140 barrier();
141 if (rdp->passed_quiesce == 0) 141 if (rdp->passed_quiesce == 0)
142 trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); 142 trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs");
143 rdp->passed_quiesce = 1; 143 rdp->passed_quiesce = 1;
@@ -422,9 +422,11 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
422 unsigned long flags; 422 unsigned long flags;
423 struct task_struct *t; 423 struct task_struct *t;
424 424
425 if (!rcu_preempt_blocked_readers_cgp(rnp))
426 return;
427 raw_spin_lock_irqsave(&rnp->lock, flags); 425 raw_spin_lock_irqsave(&rnp->lock, flags);
426 if (!rcu_preempt_blocked_readers_cgp(rnp)) {
427 raw_spin_unlock_irqrestore(&rnp->lock, flags);
428 return;
429 }
428 t = list_entry(rnp->gp_tasks, 430 t = list_entry(rnp->gp_tasks,
429 struct task_struct, rcu_node_entry); 431 struct task_struct, rcu_node_entry);
430 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) 432 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
@@ -584,17 +586,23 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
584 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ 586 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
585 } 587 }
586 588
589 rnp->gp_tasks = NULL;
590 rnp->exp_tasks = NULL;
587#ifdef CONFIG_RCU_BOOST 591#ifdef CONFIG_RCU_BOOST
588 /* In case root is being boosted and leaf is not. */ 592 rnp->boost_tasks = NULL;
593 /*
594 * In case root is being boosted and leaf was not. Make sure
595 * that we boost the tasks blocking the current grace period
596 * in this case.
597 */
589 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ 598 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
590 if (rnp_root->boost_tasks != NULL && 599 if (rnp_root->boost_tasks != NULL &&
591 rnp_root->boost_tasks != rnp_root->gp_tasks) 600 rnp_root->boost_tasks != rnp_root->gp_tasks &&
601 rnp_root->boost_tasks != rnp_root->exp_tasks)
592 rnp_root->boost_tasks = rnp_root->gp_tasks; 602 rnp_root->boost_tasks = rnp_root->gp_tasks;
593 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ 603 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
594#endif /* #ifdef CONFIG_RCU_BOOST */ 604#endif /* #ifdef CONFIG_RCU_BOOST */
595 605
596 rnp->gp_tasks = NULL;
597 rnp->exp_tasks = NULL;
598 return retval; 606 return retval;
599} 607}
600 608
@@ -676,7 +684,7 @@ void synchronize_rcu(void)
676EXPORT_SYMBOL_GPL(synchronize_rcu); 684EXPORT_SYMBOL_GPL(synchronize_rcu);
677 685
678static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); 686static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
679static long sync_rcu_preempt_exp_count; 687static unsigned long sync_rcu_preempt_exp_count;
680static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); 688static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
681 689
682/* 690/*
@@ -791,7 +799,7 @@ void synchronize_rcu_expedited(void)
791 unsigned long flags; 799 unsigned long flags;
792 struct rcu_node *rnp; 800 struct rcu_node *rnp;
793 struct rcu_state *rsp = &rcu_preempt_state; 801 struct rcu_state *rsp = &rcu_preempt_state;
794 long snap; 802 unsigned long snap;
795 int trycount = 0; 803 int trycount = 0;
796 804
797 smp_mb(); /* Caller's modifications seen first by other CPUs. */ 805 smp_mb(); /* Caller's modifications seen first by other CPUs. */
@@ -799,33 +807,47 @@ void synchronize_rcu_expedited(void)
799 smp_mb(); /* Above access cannot bleed into critical section. */ 807 smp_mb(); /* Above access cannot bleed into critical section. */
800 808
801 /* 809 /*
810 * Block CPU-hotplug operations. This means that any CPU-hotplug
811 * operation that finds an rcu_node structure with tasks in the
812 * process of being boosted will know that all tasks blocking
813 * this expedited grace period will already be in the process of
814 * being boosted. This simplifies the process of moving tasks
815 * from leaf to root rcu_node structures.
816 */
817 get_online_cpus();
818
819 /*
802 * Acquire lock, falling back to synchronize_rcu() if too many 820 * Acquire lock, falling back to synchronize_rcu() if too many
803 * lock-acquisition failures. Of course, if someone does the 821 * lock-acquisition failures. Of course, if someone does the
804 * expedited grace period for us, just leave. 822 * expedited grace period for us, just leave.
805 */ 823 */
806 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { 824 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
825 if (ULONG_CMP_LT(snap,
826 ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
827 put_online_cpus();
828 goto mb_ret; /* Others did our work for us. */
829 }
807 if (trycount++ < 10) { 830 if (trycount++ < 10) {
808 udelay(trycount * num_online_cpus()); 831 udelay(trycount * num_online_cpus());
809 } else { 832 } else {
833 put_online_cpus();
810 synchronize_rcu(); 834 synchronize_rcu();
811 return; 835 return;
812 } 836 }
813 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
814 goto mb_ret; /* Others did our work for us. */
815 } 837 }
816 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) 838 if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
839 put_online_cpus();
817 goto unlock_mb_ret; /* Others did our work for us. */ 840 goto unlock_mb_ret; /* Others did our work for us. */
841 }
818 842
819 /* force all RCU readers onto ->blkd_tasks lists. */ 843 /* force all RCU readers onto ->blkd_tasks lists. */
820 synchronize_sched_expedited(); 844 synchronize_sched_expedited();
821 845
822 raw_spin_lock_irqsave(&rsp->onofflock, flags);
823
824 /* Initialize ->expmask for all non-leaf rcu_node structures. */ 846 /* Initialize ->expmask for all non-leaf rcu_node structures. */
825 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { 847 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
826 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 848 raw_spin_lock_irqsave(&rnp->lock, flags);
827 rnp->expmask = rnp->qsmaskinit; 849 rnp->expmask = rnp->qsmaskinit;
828 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 850 raw_spin_unlock_irqrestore(&rnp->lock, flags);
829 } 851 }
830 852
831 /* Snapshot current state of ->blkd_tasks lists. */ 853 /* Snapshot current state of ->blkd_tasks lists. */
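
The expedited-grace-period changes above switch sync_rcu_preempt_exp_count and its snapshot to unsigned long and compare them with ULONG_CMP_LT(), which stays correct across counter wrap. The idea, reduced to a self-contained test; the macro body here mirrors the kernel's ULONG_CMP_LT() definition as best recalled, so treat its exact form as an assumption:

/* Illustrative wrap-safe "older than" comparison. */
#include <assert.h>
#include <limits.h>

#define ulong_cmp_lt(a, b) (ULONG_MAX / 2 < (unsigned long)((a) - (b)))

int main(void)
{
	unsigned long snap = ULONG_MAX - 1;   /* snapshot taken just before wrap */
	unsigned long now = snap + 3;         /* counter has wrapped past zero */

	assert(ulong_cmp_lt(snap, now));      /* snapshot is older: others did our work */
	assert(!ulong_cmp_lt(now, snap));
	return 0;
}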
@@ -834,7 +856,7 @@ void synchronize_rcu_expedited(void)
834 if (NUM_RCU_NODES > 1) 856 if (NUM_RCU_NODES > 1)
835 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); 857 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
836 858
837 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 859 put_online_cpus();
838 860
839 /* Wait for snapshotted ->blkd_tasks lists to drain. */ 861 /* Wait for snapshotted ->blkd_tasks lists to drain. */
840 rnp = rcu_get_root(rsp); 862 rnp = rcu_get_root(rsp);
@@ -1069,6 +1091,16 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1069 1091
1070#endif /* #else #ifdef CONFIG_RCU_TRACE */ 1092#endif /* #else #ifdef CONFIG_RCU_TRACE */
1071 1093
1094static void rcu_wake_cond(struct task_struct *t, int status)
1095{
1096 /*
1097 * If the thread is yielding, only wake it when this
1098 * is invoked from idle
1099 */
1100 if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
1101 wake_up_process(t);
1102}
1103
1072/* 1104/*
1073 * Carry out RCU priority boosting on the task indicated by ->exp_tasks 1105 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1074 * or ->boost_tasks, advancing the pointer to the next task in the 1106 * or ->boost_tasks, advancing the pointer to the next task in the
@@ -1141,17 +1173,6 @@ static int rcu_boost(struct rcu_node *rnp)
1141} 1173}
1142 1174
1143/* 1175/*
1144 * Timer handler to initiate waking up of boost kthreads that
1145 * have yielded the CPU due to excessive numbers of tasks to
1146 * boost. We wake up the per-rcu_node kthread, which in turn
1147 * will wake up the booster kthread.
1148 */
1149static void rcu_boost_kthread_timer(unsigned long arg)
1150{
1151 invoke_rcu_node_kthread((struct rcu_node *)arg);
1152}
1153
1154/*
1155 * Priority-boosting kthread. One per leaf rcu_node and one for the 1176 * Priority-boosting kthread. One per leaf rcu_node and one for the
1156 * root rcu_node. 1177 * root rcu_node.
1157 */ 1178 */
@@ -1174,8 +1195,9 @@ static int rcu_boost_kthread(void *arg)
1174 else 1195 else
1175 spincnt = 0; 1196 spincnt = 0;
1176 if (spincnt > 10) { 1197 if (spincnt > 10) {
1198 rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
1177 trace_rcu_utilization("End boost kthread@rcu_yield"); 1199 trace_rcu_utilization("End boost kthread@rcu_yield");
1178 rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); 1200 schedule_timeout_interruptible(2);
1179 trace_rcu_utilization("Start boost kthread@rcu_yield"); 1201 trace_rcu_utilization("Start boost kthread@rcu_yield");
1180 spincnt = 0; 1202 spincnt = 0;
1181 } 1203 }
@@ -1191,9 +1213,9 @@ static int rcu_boost_kthread(void *arg)
1191 * kthread to start boosting them. If there is an expedited grace 1213 * kthread to start boosting them. If there is an expedited grace
1192 * period in progress, it is always time to boost. 1214 * period in progress, it is always time to boost.
1193 * 1215 *
1194 * The caller must hold rnp->lock, which this function releases, 1216 * The caller must hold rnp->lock, which this function releases.
1195 * but irqs remain disabled. The ->boost_kthread_task is immortal, 1217 * The ->boost_kthread_task is immortal, so we don't need to worry
1196 * so we don't need to worry about it going away. 1218 * about it going away.
1197 */ 1219 */
1198static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) 1220static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1199{ 1221{
@@ -1213,8 +1235,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1213 rnp->boost_tasks = rnp->gp_tasks; 1235 rnp->boost_tasks = rnp->gp_tasks;
1214 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1236 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1215 t = rnp->boost_kthread_task; 1237 t = rnp->boost_kthread_task;
1216 if (t != NULL) 1238 if (t)
1217 wake_up_process(t); 1239 rcu_wake_cond(t, rnp->boost_kthread_status);
1218 } else { 1240 } else {
1219 rcu_initiate_boost_trace(rnp); 1241 rcu_initiate_boost_trace(rnp);
1220 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1242 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1231,8 +1253,10 @@ static void invoke_rcu_callbacks_kthread(void)
1231 local_irq_save(flags); 1253 local_irq_save(flags);
1232 __this_cpu_write(rcu_cpu_has_work, 1); 1254 __this_cpu_write(rcu_cpu_has_work, 1);
1233 if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && 1255 if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
1234 current != __this_cpu_read(rcu_cpu_kthread_task)) 1256 current != __this_cpu_read(rcu_cpu_kthread_task)) {
1235 wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); 1257 rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
1258 __this_cpu_read(rcu_cpu_kthread_status));
1259 }
1236 local_irq_restore(flags); 1260 local_irq_restore(flags);
1237} 1261}
1238 1262
@@ -1245,21 +1269,6 @@ static bool rcu_is_callbacks_kthread(void)
1245 return __get_cpu_var(rcu_cpu_kthread_task) == current; 1269 return __get_cpu_var(rcu_cpu_kthread_task) == current;
1246} 1270}
1247 1271
1248/*
1249 * Set the affinity of the boost kthread. The CPU-hotplug locks are
1250 * held, so no one should be messing with the existence of the boost
1251 * kthread.
1252 */
1253static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
1254 cpumask_var_t cm)
1255{
1256 struct task_struct *t;
1257
1258 t = rnp->boost_kthread_task;
1259 if (t != NULL)
1260 set_cpus_allowed_ptr(rnp->boost_kthread_task, cm);
1261}
1262
1263#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) 1272#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
1264 1273
1265/* 1274/*
@@ -1276,15 +1285,19 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1276 * Returns zero if all is well, a negated errno otherwise. 1285 * Returns zero if all is well, a negated errno otherwise.
1277 */ 1286 */
1278static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 1287static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1279 struct rcu_node *rnp, 1288 struct rcu_node *rnp)
1280 int rnp_index)
1281{ 1289{
1290 int rnp_index = rnp - &rsp->node[0];
1282 unsigned long flags; 1291 unsigned long flags;
1283 struct sched_param sp; 1292 struct sched_param sp;
1284 struct task_struct *t; 1293 struct task_struct *t;
1285 1294
1286 if (&rcu_preempt_state != rsp) 1295 if (&rcu_preempt_state != rsp)
1287 return 0; 1296 return 0;
1297
1298 if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0)
1299 return 0;
1300
1288 rsp->boost = 1; 1301 rsp->boost = 1;
1289 if (rnp->boost_kthread_task != NULL) 1302 if (rnp->boost_kthread_task != NULL)
1290 return 0; 1303 return 0;
@@ -1301,25 +1314,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1301 return 0; 1314 return 0;
1302} 1315}
1303 1316
1304#ifdef CONFIG_HOTPLUG_CPU
1305
1306/*
1307 * Stop RCU's per-CPU kthread when its CPU goes offline.
1308 */
1309static void rcu_stop_cpu_kthread(int cpu)
1310{
1311 struct task_struct *t;
1312
1313 /* Stop the CPU's kthread. */
1314 t = per_cpu(rcu_cpu_kthread_task, cpu);
1315 if (t != NULL) {
1316 per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
1317 kthread_stop(t);
1318 }
1319}
1320
1321#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1322
1323static void rcu_kthread_do_work(void) 1317static void rcu_kthread_do_work(void)
1324{ 1318{
1325 rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); 1319 rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
@@ -1327,112 +1321,22 @@ static void rcu_kthread_do_work(void)
1327 rcu_preempt_do_callbacks(); 1321 rcu_preempt_do_callbacks();
1328} 1322}
1329 1323
1330/* 1324static void rcu_cpu_kthread_setup(unsigned int cpu)
1331 * Wake up the specified per-rcu_node-structure kthread.
1332 * Because the per-rcu_node kthreads are immortal, we don't need
1333 * to do anything to keep them alive.
1334 */
1335static void invoke_rcu_node_kthread(struct rcu_node *rnp)
1336{
1337 struct task_struct *t;
1338
1339 t = rnp->node_kthread_task;
1340 if (t != NULL)
1341 wake_up_process(t);
1342}
1343
1344/*
1345 * Set the specified CPU's kthread to run RT or not, as specified by
1346 * the to_rt argument. The CPU-hotplug locks are held, so the task
1347 * is not going away.
1348 */
1349static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1350{ 1325{
1351 int policy;
1352 struct sched_param sp; 1326 struct sched_param sp;
1353 struct task_struct *t;
1354
1355 t = per_cpu(rcu_cpu_kthread_task, cpu);
1356 if (t == NULL)
1357 return;
1358 if (to_rt) {
1359 policy = SCHED_FIFO;
1360 sp.sched_priority = RCU_KTHREAD_PRIO;
1361 } else {
1362 policy = SCHED_NORMAL;
1363 sp.sched_priority = 0;
1364 }
1365 sched_setscheduler_nocheck(t, policy, &sp);
1366}
1367 1327
1368/* 1328 sp.sched_priority = RCU_KTHREAD_PRIO;
1369 * Timer handler to initiate the waking up of per-CPU kthreads that 1329 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1370 * have yielded the CPU due to excess numbers of RCU callbacks.
1371 * We wake up the per-rcu_node kthread, which in turn will wake up
1372 * the booster kthread.
1373 */
1374static void rcu_cpu_kthread_timer(unsigned long arg)
1375{
1376 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
1377 struct rcu_node *rnp = rdp->mynode;
1378
1379 atomic_or(rdp->grpmask, &rnp->wakemask);
1380 invoke_rcu_node_kthread(rnp);
1381} 1330}
1382 1331
1383/* 1332static void rcu_cpu_kthread_park(unsigned int cpu)
1384 * Drop to non-real-time priority and yield, but only after posting a
1385 * timer that will cause us to regain our real-time priority if we
1386 * remain preempted. Either way, we restore our real-time priority
1387 * before returning.
1388 */
1389static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1390{ 1333{
1391 struct sched_param sp; 1334 per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
1392 struct timer_list yield_timer;
1393 int prio = current->rt_priority;
1394
1395 setup_timer_on_stack(&yield_timer, f, arg);
1396 mod_timer(&yield_timer, jiffies + 2);
1397 sp.sched_priority = 0;
1398 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
1399 set_user_nice(current, 19);
1400 schedule();
1401 set_user_nice(current, 0);
1402 sp.sched_priority = prio;
1403 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1404 del_timer(&yield_timer);
1405} 1335}
1406 1336
1407/* 1337static int rcu_cpu_kthread_should_run(unsigned int cpu)
1408 * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
1409 * This can happen while the corresponding CPU is either coming online
1410 * or going offline. We cannot wait until the CPU is fully online
1411 * before starting the kthread, because the various notifier functions
1412 * can wait for RCU grace periods. So we park rcu_cpu_kthread() until
1413 * the corresponding CPU is online.
1414 *
1415 * Return 1 if the kthread needs to stop, 0 otherwise.
1416 *
1417 * Caller must disable bh. This function can momentarily enable it.
1418 */
1419static int rcu_cpu_kthread_should_stop(int cpu)
1420{ 1338{
1421 while (cpu_is_offline(cpu) || 1339 return __get_cpu_var(rcu_cpu_has_work);
1422 !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
1423 smp_processor_id() != cpu) {
1424 if (kthread_should_stop())
1425 return 1;
1426 per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
1427 per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
1428 local_bh_enable();
1429 schedule_timeout_uninterruptible(1);
1430 if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
1431 set_cpus_allowed_ptr(current, cpumask_of(cpu));
1432 local_bh_disable();
1433 }
1434 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1435 return 0;
1436} 1340}
1437 1341
1438/* 1342/*
@@ -1440,138 +1344,35 @@ static int rcu_cpu_kthread_should_stop(int cpu)
1440 * RCU softirq used in flavors and configurations of RCU that do not 1344 * RCU softirq used in flavors and configurations of RCU that do not
1441 * support RCU priority boosting. 1345 * support RCU priority boosting.
1442 */ 1346 */
1443static int rcu_cpu_kthread(void *arg) 1347static void rcu_cpu_kthread(unsigned int cpu)
1444{ 1348{
1445 int cpu = (int)(long)arg; 1349 unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status);
1446 unsigned long flags; 1350 char work, *workp = &__get_cpu_var(rcu_cpu_has_work);
1447 int spincnt = 0; 1351 int spincnt;
1448 unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
1449 char work;
1450 char *workp = &per_cpu(rcu_cpu_has_work, cpu);
1451 1352
1452 trace_rcu_utilization("Start CPU kthread@init"); 1353 for (spincnt = 0; spincnt < 10; spincnt++) {
1453 for (;;) {
1454 *statusp = RCU_KTHREAD_WAITING;
1455 trace_rcu_utilization("End CPU kthread@rcu_wait");
1456 rcu_wait(*workp != 0 || kthread_should_stop());
1457 trace_rcu_utilization("Start CPU kthread@rcu_wait"); 1354 trace_rcu_utilization("Start CPU kthread@rcu_wait");
1458 local_bh_disable(); 1355 local_bh_disable();
1459 if (rcu_cpu_kthread_should_stop(cpu)) {
1460 local_bh_enable();
1461 break;
1462 }
1463 *statusp = RCU_KTHREAD_RUNNING; 1356 *statusp = RCU_KTHREAD_RUNNING;
1464 per_cpu(rcu_cpu_kthread_loops, cpu)++; 1357 this_cpu_inc(rcu_cpu_kthread_loops);
1465 local_irq_save(flags); 1358 local_irq_disable();
1466 work = *workp; 1359 work = *workp;
1467 *workp = 0; 1360 *workp = 0;
1468 local_irq_restore(flags); 1361 local_irq_enable();
1469 if (work) 1362 if (work)
1470 rcu_kthread_do_work(); 1363 rcu_kthread_do_work();
1471 local_bh_enable(); 1364 local_bh_enable();
1472 if (*workp != 0) 1365 if (*workp == 0) {
1473 spincnt++; 1366 trace_rcu_utilization("End CPU kthread@rcu_wait");
1474 else 1367 *statusp = RCU_KTHREAD_WAITING;
1475 spincnt = 0; 1368 return;
1476 if (spincnt > 10) {
1477 *statusp = RCU_KTHREAD_YIELDING;
1478 trace_rcu_utilization("End CPU kthread@rcu_yield");
1479 rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
1480 trace_rcu_utilization("Start CPU kthread@rcu_yield");
1481 spincnt = 0;
1482 }
1483 }
1484 *statusp = RCU_KTHREAD_STOPPED;
1485 trace_rcu_utilization("End CPU kthread@term");
1486 return 0;
1487}
1488
1489/*
1490 * Spawn a per-CPU kthread, setting up affinity and priority.
1491 * Because the CPU hotplug lock is held, no other CPU will be attempting
1492 * to manipulate rcu_cpu_kthread_task. There might be another CPU
1493 * attempting to access it during boot, but the locking in kthread_bind()
1494 * will enforce sufficient ordering.
1495 *
1496 * Please note that we cannot simply refuse to wake up the per-CPU
1497 * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state,
1498 * which can result in softlockup complaints if the task ends up being
1499 * idle for more than a couple of minutes.
1500 *
1501 * However, please note also that we cannot bind the per-CPU kthread to its
1502 * CPU until that CPU is fully online. We also cannot wait until the
1503 * CPU is fully online before we create its per-CPU kthread, as this would
1504 * deadlock the system when CPU notifiers tried waiting for grace
1505 * periods. So we bind the per-CPU kthread to its CPU only if the CPU
1506 * is online. If its CPU is not yet fully online, then the code in
1507 * rcu_cpu_kthread() will wait until it is fully online, and then do
1508 * the binding.
1509 */
1510static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
1511{
1512 struct sched_param sp;
1513 struct task_struct *t;
1514
1515 if (!rcu_scheduler_fully_active ||
1516 per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
1517 return 0;
1518 t = kthread_create_on_node(rcu_cpu_kthread,
1519 (void *)(long)cpu,
1520 cpu_to_node(cpu),
1521 "rcuc/%d", cpu);
1522 if (IS_ERR(t))
1523 return PTR_ERR(t);
1524 if (cpu_online(cpu))
1525 kthread_bind(t, cpu);
1526 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1527 WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
1528 sp.sched_priority = RCU_KTHREAD_PRIO;
1529 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1530 per_cpu(rcu_cpu_kthread_task, cpu) = t;
1531 wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */
1532 return 0;
1533}
1534
1535/*
1536 * Per-rcu_node kthread, which is in charge of waking up the per-CPU
1537 * kthreads when needed. We ignore requests to wake up kthreads
1538 * for offline CPUs, which is OK because force_quiescent_state()
1539 * takes care of this case.
1540 */
1541static int rcu_node_kthread(void *arg)
1542{
1543 int cpu;
1544 unsigned long flags;
1545 unsigned long mask;
1546 struct rcu_node *rnp = (struct rcu_node *)arg;
1547 struct sched_param sp;
1548 struct task_struct *t;
1549
1550 for (;;) {
1551 rnp->node_kthread_status = RCU_KTHREAD_WAITING;
1552 rcu_wait(atomic_read(&rnp->wakemask) != 0);
1553 rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
1554 raw_spin_lock_irqsave(&rnp->lock, flags);
1555 mask = atomic_xchg(&rnp->wakemask, 0);
1556 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1557 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
1558 if ((mask & 0x1) == 0)
1559 continue;
1560 preempt_disable();
1561 t = per_cpu(rcu_cpu_kthread_task, cpu);
1562 if (!cpu_online(cpu) || t == NULL) {
1563 preempt_enable();
1564 continue;
1565 }
1566 per_cpu(rcu_cpu_has_work, cpu) = 1;
1567 sp.sched_priority = RCU_KTHREAD_PRIO;
1568 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1569 preempt_enable();
1570 } 1369 }
1571 } 1370 }
1572 /* NOTREACHED */ 1371 *statusp = RCU_KTHREAD_YIELDING;
1573 rnp->node_kthread_status = RCU_KTHREAD_STOPPED; 1372 trace_rcu_utilization("Start CPU kthread@rcu_yield");
1574 return 0; 1373 schedule_timeout_interruptible(2);
1374 trace_rcu_utilization("End CPU kthread@rcu_yield");
1375 *statusp = RCU_KTHREAD_WAITING;
1575} 1376}
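The reworked rcu_cpu_kthread() above is a good illustration of the shape an smpboot-managed worker takes: consume the per-CPU work flag for a bounded number of passes, then explicitly yield so a busy CPU is not monopolized. A minimal sketch of that pattern, with hypothetical my_* names standing in for the RCU-specific state and work:

#include <linux/interrupt.h>
#include <linux/irqflags.h>
#include <linux/percpu.h>
#include <linux/sched.h>

static DEFINE_PER_CPU(char, my_work);		/* set by whoever queues work */

static void my_process_work(void)		/* hypothetical work handler */
{
	/* ... drain this CPU's queue ... */
}

/* Hypothetical smpboot thread_fn: bounded drain, then a brief yield. */
static void my_cpu_kthread(unsigned int cpu)
{
	char work, *workp = &__get_cpu_var(my_work);
	int pass;

	for (pass = 0; pass < 10; pass++) {
		local_bh_disable();
		local_irq_disable();
		work = *workp;		/* read and clear with irqs off so an
					 * interrupt cannot interleave */
		*workp = 0;
		local_irq_enable();
		if (work)
			my_process_work();
		local_bh_enable();
		if (*workp == 0)	/* nothing new arrived: go idle */
			return;
	}
	/* Still busy after ten passes: give the CPU away for ~2 ticks. */
	schedule_timeout_interruptible(2);
}

Returning with work still pending is fine here, assuming the framework's thread_should_run() callback reports the pending flag and re-invokes the function.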
1576 1377
1577/* 1378/*
@@ -1583,17 +1384,17 @@ static int rcu_node_kthread(void *arg)
1583 * no outgoing CPU. If there are no CPUs left in the affinity set, 1384 * no outgoing CPU. If there are no CPUs left in the affinity set,
1584 * this function allows the kthread to execute on any CPU. 1385 * this function allows the kthread to execute on any CPU.
1585 */ 1386 */
1586static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) 1387static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1587{ 1388{
1389 struct task_struct *t = rnp->boost_kthread_task;
1390 unsigned long mask = rnp->qsmaskinit;
1588 cpumask_var_t cm; 1391 cpumask_var_t cm;
1589 int cpu; 1392 int cpu;
1590 unsigned long mask = rnp->qsmaskinit;
1591 1393
1592 if (rnp->node_kthread_task == NULL) 1394 if (!t)
1593 return; 1395 return;
1594 if (!alloc_cpumask_var(&cm, GFP_KERNEL)) 1396 if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
1595 return; 1397 return;
1596 cpumask_clear(cm);
1597 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) 1398 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
1598 if ((mask & 0x1) && cpu != outgoingcpu) 1399 if ((mask & 0x1) && cpu != outgoingcpu)
1599 cpumask_set_cpu(cpu, cm); 1400 cpumask_set_cpu(cpu, cm);
@@ -1603,62 +1404,36 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1603 cpumask_clear_cpu(cpu, cm); 1404 cpumask_clear_cpu(cpu, cm);
1604 WARN_ON_ONCE(cpumask_weight(cm) == 0); 1405 WARN_ON_ONCE(cpumask_weight(cm) == 0);
1605 } 1406 }
1606 set_cpus_allowed_ptr(rnp->node_kthread_task, cm); 1407 set_cpus_allowed_ptr(t, cm);
1607 rcu_boost_kthread_setaffinity(rnp, cm);
1608 free_cpumask_var(cm); 1408 free_cpumask_var(cm);
1609} 1409}
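rcu_boost_kthread_setaffinity() above follows a reusable pattern for re-pinning a kthread when a CPU leaves its group: expand a group bitmask into a temporary cpumask, drop the outgoing CPU, and hand the result to set_cpus_allowed_ptr(). A sketch of that pattern with hypothetical parameters (t, grplo, grphi and groupmask are stand-ins, not the rcu_node fields):

#include <linux/cpumask.h>
#include <linux/sched.h>
#include <linux/slab.h>

static void repin_group_kthread(struct task_struct *t, int grplo, int grphi,
				unsigned long groupmask, int outgoingcpu)
{
	cpumask_var_t cm;
	int cpu;

	if (!t)
		return;
	if (!zalloc_cpumask_var(&cm, GFP_KERNEL))	/* starts out empty */
		return;
	/* One bit per CPU in [grplo, grphi]; skip the CPU going away. */
	for (cpu = grplo; cpu <= grphi; cpu++, groupmask >>= 1)
		if ((groupmask & 0x1) && cpu != outgoingcpu)
			cpumask_set_cpu(cpu, cm);
	if (cpumask_weight(cm) == 0)
		cpumask_setall(cm);	/* nobody left: allow any CPU */
	set_cpus_allowed_ptr(t, cm);
	free_cpumask_var(cm);
}

Using zalloc_cpumask_var() instead of alloc_cpumask_var() plus an explicit cpumask_clear() is the small simplification the hunk above makes.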
1610 1410
1611/* 1411static struct smp_hotplug_thread rcu_cpu_thread_spec = {
1612 * Spawn a per-rcu_node kthread, setting priority and affinity. 1412 .store = &rcu_cpu_kthread_task,
1613 * Called during boot before online/offline can happen, or, if 1413 .thread_should_run = rcu_cpu_kthread_should_run,
1614 * during runtime, with the main CPU-hotplug locks held. So only 1414 .thread_fn = rcu_cpu_kthread,
1615 * one of these can be executing at a time. 1415 .thread_comm = "rcuc/%u",
1616 */ 1416 .setup = rcu_cpu_kthread_setup,
1617static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, 1417 .park = rcu_cpu_kthread_park,
1618 struct rcu_node *rnp) 1418};
1619{
1620 unsigned long flags;
1621 int rnp_index = rnp - &rsp->node[0];
1622 struct sched_param sp;
1623 struct task_struct *t;
1624
1625 if (!rcu_scheduler_fully_active ||
1626 rnp->qsmaskinit == 0)
1627 return 0;
1628 if (rnp->node_kthread_task == NULL) {
1629 t = kthread_create(rcu_node_kthread, (void *)rnp,
1630 "rcun/%d", rnp_index);
1631 if (IS_ERR(t))
1632 return PTR_ERR(t);
1633 raw_spin_lock_irqsave(&rnp->lock, flags);
1634 rnp->node_kthread_task = t;
1635 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1636 sp.sched_priority = 99;
1637 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1638 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1639 }
1640 return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
1641}
1642 1419
1643/* 1420/*
1644 * Spawn all kthreads -- called as soon as the scheduler is running. 1421 * Spawn all kthreads -- called as soon as the scheduler is running.
1645 */ 1422 */
1646static int __init rcu_spawn_kthreads(void) 1423static int __init rcu_spawn_kthreads(void)
1647{ 1424{
1648 int cpu;
1649 struct rcu_node *rnp; 1425 struct rcu_node *rnp;
1426 int cpu;
1650 1427
1651 rcu_scheduler_fully_active = 1; 1428 rcu_scheduler_fully_active = 1;
1652 for_each_possible_cpu(cpu) { 1429 for_each_possible_cpu(cpu)
1653 per_cpu(rcu_cpu_has_work, cpu) = 0; 1430 per_cpu(rcu_cpu_has_work, cpu) = 0;
1654 if (cpu_online(cpu)) 1431 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
1655 (void)rcu_spawn_one_cpu_kthread(cpu);
1656 }
1657 rnp = rcu_get_root(rcu_state); 1432 rnp = rcu_get_root(rcu_state);
1658 (void)rcu_spawn_one_node_kthread(rcu_state, rnp); 1433 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
1659 if (NUM_RCU_NODES > 1) { 1434 if (NUM_RCU_NODES > 1) {
1660 rcu_for_each_leaf_node(rcu_state, rnp) 1435 rcu_for_each_leaf_node(rcu_state, rnp)
1661 (void)rcu_spawn_one_node_kthread(rcu_state, rnp); 1436 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
1662 } 1437 }
1663 return 0; 1438 return 0;
1664} 1439}
@@ -1670,11 +1445,8 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
1670 struct rcu_node *rnp = rdp->mynode; 1445 struct rcu_node *rnp = rdp->mynode;
1671 1446
1672 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ 1447 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
1673 if (rcu_scheduler_fully_active) { 1448 if (rcu_scheduler_fully_active)
1674 (void)rcu_spawn_one_cpu_kthread(cpu); 1449 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
1675 if (rnp->node_kthread_task == NULL)
1676 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1677 }
1678} 1450}
1679 1451
1680#else /* #ifdef CONFIG_RCU_BOOST */ 1452#else /* #ifdef CONFIG_RCU_BOOST */
@@ -1698,19 +1470,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1698{ 1470{
1699} 1471}
1700 1472
1701#ifdef CONFIG_HOTPLUG_CPU 1473static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1702
1703static void rcu_stop_cpu_kthread(int cpu)
1704{
1705}
1706
1707#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1708
1709static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1710{
1711}
1712
1713static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1714{ 1474{
1715} 1475}
1716 1476
@@ -2075,16 +1835,16 @@ static void rcu_prepare_for_idle(int cpu)
2075#ifdef CONFIG_TREE_PREEMPT_RCU 1835#ifdef CONFIG_TREE_PREEMPT_RCU
2076 if (per_cpu(rcu_preempt_data, cpu).nxtlist) { 1836 if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
2077 rcu_preempt_qs(cpu); 1837 rcu_preempt_qs(cpu);
2078 force_quiescent_state(&rcu_preempt_state, 0); 1838 force_quiescent_state(&rcu_preempt_state);
2079 } 1839 }
2080#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 1840#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
2081 if (per_cpu(rcu_sched_data, cpu).nxtlist) { 1841 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
2082 rcu_sched_qs(cpu); 1842 rcu_sched_qs(cpu);
2083 force_quiescent_state(&rcu_sched_state, 0); 1843 force_quiescent_state(&rcu_sched_state);
2084 } 1844 }
2085 if (per_cpu(rcu_bh_data, cpu).nxtlist) { 1845 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
2086 rcu_bh_qs(cpu); 1846 rcu_bh_qs(cpu);
2087 force_quiescent_state(&rcu_bh_state, 0); 1847 force_quiescent_state(&rcu_bh_state);
2088 } 1848 }
2089 1849
2090 /* 1850 /*
@@ -2112,6 +1872,88 @@ static void rcu_idle_count_callbacks_posted(void)
2112 __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); 1872 __this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
2113} 1873}
2114 1874
1875/*
1876 * Data for flushing lazy RCU callbacks at OOM time.
1877 */
1878static atomic_t oom_callback_count;
1879static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq);
1880
1881/*
1882 * RCU OOM callback -- decrement the outstanding count and deliver the
1883 * wake-up if we are the last one.
1884 */
1885static void rcu_oom_callback(struct rcu_head *rhp)
1886{
1887 if (atomic_dec_and_test(&oom_callback_count))
1888 wake_up(&oom_callback_wq);
1889}
1890
1891/*
1892 * Post an rcu_oom_notify callback on the current CPU if it has at
1893 * least one lazy callback. This will unnecessarily post callbacks
1894 * to CPUs that already have a non-lazy callback at the end of their
1895 * callback list, but this is an infrequent operation, so accept some
1896 * extra overhead to keep things simple.
1897 */
1898static void rcu_oom_notify_cpu(void *unused)
1899{
1900 struct rcu_state *rsp;
1901 struct rcu_data *rdp;
1902
1903 for_each_rcu_flavor(rsp) {
1904 rdp = __this_cpu_ptr(rsp->rda);
1905 if (rdp->qlen_lazy != 0) {
1906 atomic_inc(&oom_callback_count);
1907 rsp->call(&rdp->oom_head, rcu_oom_callback);
1908 }
1909 }
1910}
1911
1912/*
1913 * If low on memory, ensure that each CPU has a non-lazy callback.
1914 * This will wake up CPUs that have only lazy callbacks, in turn
1915 * ensuring that they free up the corresponding memory in a timely manner.
1916 * Because an uncertain amount of memory will be freed in some uncertain
1917 * timeframe, we do not claim to have freed anything.
1918 */
1919static int rcu_oom_notify(struct notifier_block *self,
1920 unsigned long notused, void *nfreed)
1921{
1922 int cpu;
1923
1924 /* Wait for callbacks from earlier instance to complete. */
1925 wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
1926
1927 /*
1928 * Prevent premature wakeup: ensure that all increments happen
1929 * before there is a chance of the counter reaching zero.
1930 */
1931 atomic_set(&oom_callback_count, 1);
1932
1933 get_online_cpus();
1934 for_each_online_cpu(cpu) {
1935 smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
1936 cond_resched();
1937 }
1938 put_online_cpus();
1939
1940 /* Unconditionally decrement: no need to wake ourselves up. */
1941 atomic_dec(&oom_callback_count);
1942
1943 return NOTIFY_OK;
1944}
1945
1946static struct notifier_block rcu_oom_nb = {
1947 .notifier_call = rcu_oom_notify
1948};
1949
1950static int __init rcu_register_oom_notifier(void)
1951{
1952 register_oom_notifier(&rcu_oom_nb);
1953 return 0;
1954}
1955early_initcall(rcu_register_oom_notifier);
1956
2115#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1957#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
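The rcu_oom_notify() machinery added above hangs off the generic OOM notifier chain: register once, and the callback is invoked (and may block) each time the kernel is about to handle an out-of-memory event. Stripped of the RCU-specific counting, a minimal client of that interface looks roughly like this (the my_* names are illustrative):

#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/oom.h>
#include <linux/printk.h>

/* Sketch: a trivial OOM-notifier client.  The third argument points at a
 * running count of pages freed by earlier notifiers; a real client would
 * increment it if it managed to reclaim anything. */
static int my_oom_notify(struct notifier_block *nb,
			 unsigned long unused, void *freed)
{
	pr_info("low on memory, nothing to give back\n");
	return NOTIFY_OK;
}

static struct notifier_block my_oom_nb = {
	.notifier_call = my_oom_notify,
};

static int __init my_oom_init(void)
{
	register_oom_notifier(&my_oom_nb);
	return 0;
}
early_initcall(my_oom_init);

The RCU version above additionally keeps an atomic count plus a waitqueue so that a second OOM event waits for the callbacks posted by the first one to drain.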
2116 1958
2117#ifdef CONFIG_RCU_CPU_STALL_INFO 1959#ifdef CONFIG_RCU_CPU_STALL_INFO
@@ -2122,11 +1964,15 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2122{ 1964{
2123 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1965 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2124 struct timer_list *tltp = &rdtp->idle_gp_timer; 1966 struct timer_list *tltp = &rdtp->idle_gp_timer;
1967 char c;
2125 1968
2126 sprintf(cp, "drain=%d %c timer=%lu", 1969 c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.';
2127 rdtp->dyntick_drain, 1970 if (timer_pending(tltp))
2128 rdtp->dyntick_holdoff == jiffies ? 'H' : '.', 1971 sprintf(cp, "drain=%d %c timer=%lu",
2129 timer_pending(tltp) ? tltp->expires - jiffies : -1); 1972 rdtp->dyntick_drain, c, tltp->expires - jiffies);
1973 else
1974 sprintf(cp, "drain=%d %c timer not pending",
1975 rdtp->dyntick_drain, c);
2130} 1976}
2131 1977
2132#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 1978#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
@@ -2194,11 +2040,10 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp)
2194/* Increment ->ticks_this_gp for all flavors of RCU. */ 2040/* Increment ->ticks_this_gp for all flavors of RCU. */
2195static void increment_cpu_stall_ticks(void) 2041static void increment_cpu_stall_ticks(void)
2196{ 2042{
2197 __get_cpu_var(rcu_sched_data).ticks_this_gp++; 2043 struct rcu_state *rsp;
2198 __get_cpu_var(rcu_bh_data).ticks_this_gp++; 2044
2199#ifdef CONFIG_TREE_PREEMPT_RCU 2045 for_each_rcu_flavor(rsp)
2200 __get_cpu_var(rcu_preempt_data).ticks_this_gp++; 2046 __this_cpu_ptr(rsp->rda)->ticks_this_gp++;
2201#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
2202} 2047}
2203 2048
2204#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ 2049#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index abffb486e94e..693513bc50e6 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -51,8 +51,8 @@ static int show_rcubarrier(struct seq_file *m, void *unused)
51 struct rcu_state *rsp; 51 struct rcu_state *rsp;
52 52
53 for_each_rcu_flavor(rsp) 53 for_each_rcu_flavor(rsp)
54 seq_printf(m, "%s: %c bcc: %d nbd: %lu\n", 54 seq_printf(m, "%s: bcc: %d nbd: %lu\n",
55 rsp->name, rsp->rcu_barrier_in_progress ? 'B' : '.', 55 rsp->name,
56 atomic_read(&rsp->barrier_cpu_count), 56 atomic_read(&rsp->barrier_cpu_count),
57 rsp->n_barrier_done); 57 rsp->n_barrier_done);
58 return 0; 58 return 0;
@@ -86,12 +86,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
86{ 86{
87 if (!rdp->beenonline) 87 if (!rdp->beenonline)
88 return; 88 return;
89 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pgp=%lu qp=%d", 89 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d qp=%d",
90 rdp->cpu, 90 rdp->cpu,
91 cpu_is_offline(rdp->cpu) ? '!' : ' ', 91 cpu_is_offline(rdp->cpu) ? '!' : ' ',
92 rdp->completed, rdp->gpnum, 92 rdp->completed, rdp->gpnum,
93 rdp->passed_quiesce, rdp->passed_quiesce_gpnum, 93 rdp->passed_quiesce, rdp->qs_pending);
94 rdp->qs_pending);
95 seq_printf(m, " dt=%d/%llx/%d df=%lu", 94 seq_printf(m, " dt=%d/%llx/%d df=%lu",
96 atomic_read(&rdp->dynticks->dynticks), 95 atomic_read(&rdp->dynticks->dynticks),
97 rdp->dynticks->dynticks_nesting, 96 rdp->dynticks->dynticks_nesting,
@@ -108,11 +107,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
108 rdp->nxttail[RCU_WAIT_TAIL]], 107 rdp->nxttail[RCU_WAIT_TAIL]],
109 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); 108 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
110#ifdef CONFIG_RCU_BOOST 109#ifdef CONFIG_RCU_BOOST
111 seq_printf(m, " kt=%d/%c/%d ktl=%x", 110 seq_printf(m, " kt=%d/%c ktl=%x",
112 per_cpu(rcu_cpu_has_work, rdp->cpu), 111 per_cpu(rcu_cpu_has_work, rdp->cpu),
113 convert_kthread_status(per_cpu(rcu_cpu_kthread_status, 112 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
114 rdp->cpu)), 113 rdp->cpu)),
115 per_cpu(rcu_cpu_kthread_cpu, rdp->cpu),
116 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); 114 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
117#endif /* #ifdef CONFIG_RCU_BOOST */ 115#endif /* #ifdef CONFIG_RCU_BOOST */
118 seq_printf(m, " b=%ld", rdp->blimit); 116 seq_printf(m, " b=%ld", rdp->blimit);
@@ -150,12 +148,11 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
150{ 148{
151 if (!rdp->beenonline) 149 if (!rdp->beenonline)
152 return; 150 return;
153 seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d", 151 seq_printf(m, "%d,%s,%lu,%lu,%d,%d",
154 rdp->cpu, 152 rdp->cpu,
155 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", 153 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
156 rdp->completed, rdp->gpnum, 154 rdp->completed, rdp->gpnum,
157 rdp->passed_quiesce, rdp->passed_quiesce_gpnum, 155 rdp->passed_quiesce, rdp->qs_pending);
158 rdp->qs_pending);
159 seq_printf(m, ",%d,%llx,%d,%lu", 156 seq_printf(m, ",%d,%llx,%d,%lu",
160 atomic_read(&rdp->dynticks->dynticks), 157 atomic_read(&rdp->dynticks->dynticks),
161 rdp->dynticks->dynticks_nesting, 158 rdp->dynticks->dynticks_nesting,
@@ -186,7 +183,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
186 int cpu; 183 int cpu;
187 struct rcu_state *rsp; 184 struct rcu_state *rsp;
188 185
189 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); 186 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pq\",");
190 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 187 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
191 seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); 188 seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\"");
192#ifdef CONFIG_RCU_BOOST 189#ifdef CONFIG_RCU_BOOST
@@ -386,10 +383,9 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
386 rdp->n_rp_report_qs, 383 rdp->n_rp_report_qs,
387 rdp->n_rp_cb_ready, 384 rdp->n_rp_cb_ready,
388 rdp->n_rp_cpu_needs_gp); 385 rdp->n_rp_cpu_needs_gp);
389 seq_printf(m, "gpc=%ld gps=%ld nf=%ld nn=%ld\n", 386 seq_printf(m, "gpc=%ld gps=%ld nn=%ld\n",
390 rdp->n_rp_gp_completed, 387 rdp->n_rp_gp_completed,
391 rdp->n_rp_gp_started, 388 rdp->n_rp_gp_started,
392 rdp->n_rp_need_fqs,
393 rdp->n_rp_need_nothing); 389 rdp->n_rp_need_nothing);
394} 390}
395 391
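The rcutree_trace.c changes are all tweaks to format strings inside debugfs show routines. For context, the surrounding plumbing is the standard single_open() seq_file pattern, sketched here with hypothetical names:

#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/seq_file.h>

static int my_stats_show(struct seq_file *m, void *unused)
{
	seq_printf(m, "example=%d\n", 42);	/* one formatted field per line */
	return 0;
}

static int my_stats_open(struct inode *inode, struct file *file)
{
	return single_open(file, my_stats_show, NULL);
}

static const struct file_operations my_stats_fops = {
	.owner		= THIS_MODULE,
	.open		= my_stats_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init my_stats_init(void)
{
	debugfs_create_file("my_stats", 0444, NULL, NULL, &my_stats_fops);
	return 0;
}
late_initcall(my_stats_init);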
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 649c9f876cb1..1a48cdbc8631 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5604,7 +5604,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5604 migrate_tasks(cpu); 5604 migrate_tasks(cpu);
5605 BUG_ON(rq->nr_running != 1); /* the migration thread */ 5605 BUG_ON(rq->nr_running != 1); /* the migration thread */
5606 raw_spin_unlock_irqrestore(&rq->lock, flags); 5606 raw_spin_unlock_irqrestore(&rq->lock, flags);
5607 break;
5607 5608
5609 case CPU_DEAD:
5608 calc_load_migrate(rq); 5610 calc_load_migrate(rq);
5609 break; 5611 break;
5610#endif 5612#endif
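The two added lines in migration_call() move calc_load_migrate() out of the CPU_DYING path and into its own CPU_DEAD case. As a reminder of the convention, a hotplug notifier that keeps those two phases separate can be sketched like this (the my_* handlers are hypothetical):

#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/notifier.h>

static void my_quiesce_cpu(unsigned int cpu) { /* last work on the dying CPU */ }
static void my_fold_cpu_stats(unsigned int cpu) { /* post-mortem accounting */ }

/* Sketch: CPU_DYING runs on the dying CPU with interrupts off, while
 * CPU_DEAD runs later on a surviving CPU; keep the two cases distinct. */
static int __cpuinit my_cpu_callback(struct notifier_block *nfb,
				     unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DYING:
		my_quiesce_cpu(cpu);
		break;				/* no fall-through to CPU_DEAD */
	case CPU_DEAD:
		my_fold_cpu_stats(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata my_cpu_nfb = {
	.notifier_call = my_cpu_callback,
};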
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 98f60c5caa1b..d6c5fc054242 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -1,14 +1,22 @@
1/* 1/*
2 * Common SMP CPU bringup/teardown functions 2 * Common SMP CPU bringup/teardown functions
3 */ 3 */
4#include <linux/cpu.h>
4#include <linux/err.h> 5#include <linux/err.h>
5#include <linux/smp.h> 6#include <linux/smp.h>
6#include <linux/init.h> 7#include <linux/init.h>
8#include <linux/list.h>
9#include <linux/slab.h>
7#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/export.h>
8#include <linux/percpu.h> 12#include <linux/percpu.h>
13#include <linux/kthread.h>
14#include <linux/smpboot.h>
9 15
10#include "smpboot.h" 16#include "smpboot.h"
11 17
18#ifdef CONFIG_SMP
19
12#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD 20#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
13/* 21/*
14 * For the hotplug case we keep the task structs around and reuse 22 * For the hotplug case we keep the task structs around and reuse
@@ -65,3 +73,228 @@ void __init idle_threads_init(void)
65 } 73 }
66} 74}
67#endif 75#endif
76
77#endif /* #ifdef CONFIG_SMP */
78
79static LIST_HEAD(hotplug_threads);
80static DEFINE_MUTEX(smpboot_threads_lock);
81
82struct smpboot_thread_data {
83 unsigned int cpu;
84 unsigned int status;
85 struct smp_hotplug_thread *ht;
86};
87
88enum {
89 HP_THREAD_NONE = 0,
90 HP_THREAD_ACTIVE,
91 HP_THREAD_PARKED,
92};
93
94/**
95 * smpboot_thread_fn - percpu hotplug thread loop function
96 * @data: thread data pointer
97 *
98 * Checks for thread stop and park conditions. Calls the necessary
99 * setup, cleanup, park and unpark functions for the registered
100 * thread.
101 *
102 * Returns 1 when the thread should exit, 0 otherwise.
103 */
104static int smpboot_thread_fn(void *data)
105{
106 struct smpboot_thread_data *td = data;
107 struct smp_hotplug_thread *ht = td->ht;
108
109 while (1) {
110 set_current_state(TASK_INTERRUPTIBLE);
111 preempt_disable();
112 if (kthread_should_stop()) {
113 set_current_state(TASK_RUNNING);
114 preempt_enable();
115 if (ht->cleanup)
116 ht->cleanup(td->cpu, cpu_online(td->cpu));
117 kfree(td);
118 return 0;
119 }
120
121 if (kthread_should_park()) {
122 __set_current_state(TASK_RUNNING);
123 preempt_enable();
124 if (ht->park && td->status == HP_THREAD_ACTIVE) {
125 BUG_ON(td->cpu != smp_processor_id());
126 ht->park(td->cpu);
127 td->status = HP_THREAD_PARKED;
128 }
129 kthread_parkme();
130 /* We might have been woken for stop */
131 continue;
132 }
133
134 BUG_ON(td->cpu != smp_processor_id());
135
136 /* Check for state change setup */
137 switch (td->status) {
138 case HP_THREAD_NONE:
139 preempt_enable();
140 if (ht->setup)
141 ht->setup(td->cpu);
142 td->status = HP_THREAD_ACTIVE;
143 preempt_disable();
144 break;
145 case HP_THREAD_PARKED:
146 preempt_enable();
147 if (ht->unpark)
148 ht->unpark(td->cpu);
149 td->status = HP_THREAD_ACTIVE;
150 preempt_disable();
151 break;
152 }
153
154 if (!ht->thread_should_run(td->cpu)) {
155 preempt_enable();
156 schedule();
157 } else {
158 set_current_state(TASK_RUNNING);
159 preempt_enable();
160 ht->thread_fn(td->cpu);
161 }
162 }
163}
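smpboot_thread_fn() above is built on the new kthread parking primitives: kthread_park()/kthread_unpark() on the control side, kthread_should_park()/kthread_parkme() inside the thread. A stand-alone kthread loop that honors parking, independent of the smpboot wrapper, could be sketched as follows (my_pending()/my_work() are hypothetical):

#include <linux/kthread.h>
#include <linux/sched.h>

static int my_pending(void) { return 0; }	/* hypothetical work check */
static void my_work(void) { }			/* hypothetical work body  */

static int my_thread_fn(void *unused)
{
	while (!kthread_should_stop()) {
		if (kthread_should_park()) {
			kthread_parkme();	/* sleeps until unparked */
			continue;		/* re-check stop on wakeup */
		}
		if (my_pending()) {
			my_work();
			continue;
		}
		set_current_state(TASK_INTERRUPTIBLE);
		if (!kthread_should_stop() && !kthread_should_park() &&
		    !my_pending())
			schedule();
		__set_current_state(TASK_RUNNING);
	}
	return 0;
}

smpboot_thread_fn() is essentially this loop with the setup/park/unpark/cleanup callbacks and the per-CPU state tracking layered on top.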
164
165static int
166__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
167{
168 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
169 struct smpboot_thread_data *td;
170
171 if (tsk)
172 return 0;
173
174 td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
175 if (!td)
176 return -ENOMEM;
177 td->cpu = cpu;
178 td->ht = ht;
179
180 tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu,
181 ht->thread_comm);
182 if (IS_ERR(tsk)) {
183 kfree(td);
184 return PTR_ERR(tsk);
185 }
186
187 get_task_struct(tsk);
188 *per_cpu_ptr(ht->store, cpu) = tsk;
189 return 0;
190}
191
192int smpboot_create_threads(unsigned int cpu)
193{
194 struct smp_hotplug_thread *cur;
195 int ret = 0;
196
197 mutex_lock(&smpboot_threads_lock);
198 list_for_each_entry(cur, &hotplug_threads, list) {
199 ret = __smpboot_create_thread(cur, cpu);
200 if (ret)
201 break;
202 }
203 mutex_unlock(&smpboot_threads_lock);
204 return ret;
205}
206
207static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
208{
209 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
210
211 kthread_unpark(tsk);
212}
213
214void smpboot_unpark_threads(unsigned int cpu)
215{
216 struct smp_hotplug_thread *cur;
217
218 mutex_lock(&smpboot_threads_lock);
219 list_for_each_entry(cur, &hotplug_threads, list)
220 smpboot_unpark_thread(cur, cpu);
221 mutex_unlock(&smpboot_threads_lock);
222}
223
224static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
225{
226 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
227
228 if (tsk)
229 kthread_park(tsk);
230}
231
232void smpboot_park_threads(unsigned int cpu)
233{
234 struct smp_hotplug_thread *cur;
235
236 mutex_lock(&smpboot_threads_lock);
237 list_for_each_entry_reverse(cur, &hotplug_threads, list)
238 smpboot_park_thread(cur, cpu);
239 mutex_unlock(&smpboot_threads_lock);
240}
241
242static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
243{
244 unsigned int cpu;
245
246 /* We need to destroy also the parked threads of offline cpus */
247 for_each_possible_cpu(cpu) {
248 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
249
250 if (tsk) {
251 kthread_stop(tsk);
252 put_task_struct(tsk);
253 *per_cpu_ptr(ht->store, cpu) = NULL;
254 }
255 }
256}
257
258/**
259 * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug
260 * @plug_thread: Hotplug thread descriptor
261 *
262 * Creates and starts the threads on all online cpus.
263 */
264int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
265{
266 unsigned int cpu;
267 int ret = 0;
268
269 mutex_lock(&smpboot_threads_lock);
270 for_each_online_cpu(cpu) {
271 ret = __smpboot_create_thread(plug_thread, cpu);
272 if (ret) {
273 smpboot_destroy_threads(plug_thread);
274 goto out;
275 }
276 smpboot_unpark_thread(plug_thread, cpu);
277 }
278 list_add(&plug_thread->list, &hotplug_threads);
279out:
280 mutex_unlock(&smpboot_threads_lock);
281 return ret;
282}
283EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
284
285/**
286 * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
287 * @plug_thread: Hotplug thread descriptor
288 *
289 * Stops all threads on all possible cpus.
290 */
291void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
292{
293 get_online_cpus();
294 mutex_lock(&smpboot_threads_lock);
295 list_del(&plug_thread->list);
296 smpboot_destroy_threads(plug_thread);
297 mutex_unlock(&smpboot_threads_lock);
298 put_online_cpus();
299}
300EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
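Put together, smpboot.c gives subsystems a declarative replacement for the old hand-rolled CPU-notifier-plus-kthread boilerplate: describe the thread once in a struct smp_hotplug_thread and the core creates, binds, parks and unparks one instance per CPU. A minimal hypothetical client (all my_* names are illustrative, not part of the patch):

#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/smpboot.h>

static DEFINE_PER_CPU(struct task_struct *, my_task);	/* filled in by .store */
static DEFINE_PER_CPU(unsigned int, my_pending);

static int my_should_run(unsigned int cpu)
{
	return __this_cpu_read(my_pending);	/* run only when flagged */
}

static void my_thread_fn(unsigned int cpu)
{
	__this_cpu_write(my_pending, 0);
	/* ... process this CPU's work here ... */
}

static struct smp_hotplug_thread my_threads = {
	.store			= &my_task,
	.thread_should_run	= my_should_run,
	.thread_fn		= my_thread_fn,
	.thread_comm		= "my_worker/%u",
};

static int __init my_threads_init(void)
{
	/* Creates "my_worker/N" on every online CPU; later hotplug is
	 * handled by the smpboot core via park/unpark. */
	return smpboot_register_percpu_thread(&my_threads);
}
early_initcall(my_threads_init);

A subsystem that can be built out again would pair this with smpboot_unregister_percpu_thread(&my_threads), which stops and frees every per-CPU instance, parked ones included.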
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
index 6ef9433e1c70..72415a0eb955 100644
--- a/kernel/smpboot.h
+++ b/kernel/smpboot.h
@@ -13,4 +13,8 @@ static inline void idle_thread_set_boot_cpu(void) { }
13static inline void idle_threads_init(void) { } 13static inline void idle_threads_init(void) { }
14#endif 14#endif
15 15
16int smpboot_create_threads(unsigned int cpu);
17void smpboot_park_threads(unsigned int cpu);
18void smpboot_unpark_threads(unsigned int cpu);
19
16#endif 20#endif
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b73e681df09e..5c6a5bd8462f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -23,6 +23,7 @@
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/ftrace.h> 24#include <linux/ftrace.h>
25#include <linux/smp.h> 25#include <linux/smp.h>
26#include <linux/smpboot.h>
26#include <linux/tick.h> 27#include <linux/tick.h>
27 28
28#define CREATE_TRACE_POINTS 29#define CREATE_TRACE_POINTS
@@ -742,49 +743,22 @@ void __init softirq_init(void)
742 open_softirq(HI_SOFTIRQ, tasklet_hi_action); 743 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
743} 744}
744 745
745static int run_ksoftirqd(void * __bind_cpu) 746static int ksoftirqd_should_run(unsigned int cpu)
746{ 747{
747 set_current_state(TASK_INTERRUPTIBLE); 748 return local_softirq_pending();
748 749}
749 while (!kthread_should_stop()) {
750 preempt_disable();
751 if (!local_softirq_pending()) {
752 schedule_preempt_disabled();
753 }
754
755 __set_current_state(TASK_RUNNING);
756
757 while (local_softirq_pending()) {
758 /* Preempt disable stops cpu going offline.
759 If already offline, we'll be on wrong CPU:
760 don't process */
761 if (cpu_is_offline((long)__bind_cpu))
762 goto wait_to_die;
763 local_irq_disable();
764 if (local_softirq_pending())
765 __do_softirq();
766 local_irq_enable();
767 sched_preempt_enable_no_resched();
768 cond_resched();
769 preempt_disable();
770 rcu_note_context_switch((long)__bind_cpu);
771 }
772 preempt_enable();
773 set_current_state(TASK_INTERRUPTIBLE);
774 }
775 __set_current_state(TASK_RUNNING);
776 return 0;
777 750
778wait_to_die: 751static void run_ksoftirqd(unsigned int cpu)
779 preempt_enable(); 752{
780 /* Wait for kthread_stop */ 753 local_irq_disable();
781 set_current_state(TASK_INTERRUPTIBLE); 754 if (local_softirq_pending()) {
782 while (!kthread_should_stop()) { 755 __do_softirq();
783 schedule(); 756 rcu_note_context_switch(cpu);
784 set_current_state(TASK_INTERRUPTIBLE); 757 local_irq_enable();
758 cond_resched();
759 return;
785 } 760 }
786 __set_current_state(TASK_RUNNING); 761 local_irq_enable();
787 return 0;
788} 762}
789 763
790#ifdef CONFIG_HOTPLUG_CPU 764#ifdef CONFIG_HOTPLUG_CPU
@@ -850,50 +824,14 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
850 unsigned long action, 824 unsigned long action,
851 void *hcpu) 825 void *hcpu)
852{ 826{
853 int hotcpu = (unsigned long)hcpu;
854 struct task_struct *p;
855
856 switch (action) { 827 switch (action) {
857 case CPU_UP_PREPARE:
858 case CPU_UP_PREPARE_FROZEN:
859 p = kthread_create_on_node(run_ksoftirqd,
860 hcpu,
861 cpu_to_node(hotcpu),
862 "ksoftirqd/%d", hotcpu);
863 if (IS_ERR(p)) {
864 printk("ksoftirqd for %i failed\n", hotcpu);
865 return notifier_from_errno(PTR_ERR(p));
866 }
867 kthread_bind(p, hotcpu);
868 per_cpu(ksoftirqd, hotcpu) = p;
869 break;
870 case CPU_ONLINE:
871 case CPU_ONLINE_FROZEN:
872 wake_up_process(per_cpu(ksoftirqd, hotcpu));
873 break;
874#ifdef CONFIG_HOTPLUG_CPU 828#ifdef CONFIG_HOTPLUG_CPU
875 case CPU_UP_CANCELED:
876 case CPU_UP_CANCELED_FROZEN:
877 if (!per_cpu(ksoftirqd, hotcpu))
878 break;
879 /* Unbind so it can run. Fall thru. */
880 kthread_bind(per_cpu(ksoftirqd, hotcpu),
881 cpumask_any(cpu_online_mask));
882 case CPU_DEAD: 829 case CPU_DEAD:
883 case CPU_DEAD_FROZEN: { 830 case CPU_DEAD_FROZEN:
884 static const struct sched_param param = { 831 takeover_tasklets((unsigned long)hcpu);
885 .sched_priority = MAX_RT_PRIO-1
886 };
887
888 p = per_cpu(ksoftirqd, hotcpu);
889 per_cpu(ksoftirqd, hotcpu) = NULL;
890 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
891 kthread_stop(p);
892 takeover_tasklets(hotcpu);
893 break; 832 break;
894 }
895#endif /* CONFIG_HOTPLUG_CPU */ 833#endif /* CONFIG_HOTPLUG_CPU */
896 } 834 }
897 return NOTIFY_OK; 835 return NOTIFY_OK;
898} 836}
899 837
@@ -901,14 +839,19 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
901 .notifier_call = cpu_callback 839 .notifier_call = cpu_callback
902}; 840};
903 841
842static struct smp_hotplug_thread softirq_threads = {
843 .store = &ksoftirqd,
844 .thread_should_run = ksoftirqd_should_run,
845 .thread_fn = run_ksoftirqd,
846 .thread_comm = "ksoftirqd/%u",
847};
848
904static __init int spawn_ksoftirqd(void) 849static __init int spawn_ksoftirqd(void)
905{ 850{
906 void *cpu = (void *)(long)smp_processor_id();
907 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
908
909 BUG_ON(err != NOTIFY_OK);
910 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
911 register_cpu_notifier(&cpu_nfb); 851 register_cpu_notifier(&cpu_nfb);
852
853 BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
854
912 return 0; 855 return 0;
913} 856}
914early_initcall(spawn_ksoftirqd); 857early_initcall(spawn_ksoftirqd);
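Note that only the thread-side half of ksoftirqd changes here; the raise path keeps working because the smpboot core stores each CPU's task in the existing ksoftirqd per-CPU pointer via .store. The unchanged waker is roughly the following sketch:

#include <linux/percpu.h>
#include <linux/sched.h>

DECLARE_PER_CPU(struct task_struct *, ksoftirqd);	/* populated via .store */

/* Roughly the existing waker: called with interrupts off when a softirq
 * is raised outside of interrupt context. */
static void wakeup_softirqd_sketch(void)
{
	struct task_struct *tsk = __this_cpu_read(ksoftirqd);

	if (tsk && tsk->state != TASK_RUNNING)
		wake_up_process(tsk);
}

Once woken, the task drops into the smpboot loop, which calls run_ksoftirqd() for as long as ksoftirqd_should_run() keeps reporting pending softirqs.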
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3a9e5d5c1091..cf5f6b262673 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -436,7 +436,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
436 if (unlikely(local_softirq_pending() && cpu_online(cpu))) { 436 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
437 static int ratelimit; 437 static int ratelimit;
438 438
439 if (ratelimit < 10) { 439 if (ratelimit < 10 &&
440 (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
440 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", 441 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
441 (unsigned int) local_softirq_pending()); 442 (unsigned int) local_softirq_pending());
442 ratelimit++; 443 ratelimit++;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 4b1dfba70f7c..9d4c8d5a1f53 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -22,6 +22,7 @@
22#include <linux/notifier.h> 22#include <linux/notifier.h>
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/sysctl.h> 24#include <linux/sysctl.h>
25#include <linux/smpboot.h>
25 26
26#include <asm/irq_regs.h> 27#include <asm/irq_regs.h>
27#include <linux/kvm_para.h> 28#include <linux/kvm_para.h>
@@ -29,16 +30,18 @@
29 30
30int watchdog_enabled = 1; 31int watchdog_enabled = 1;
31int __read_mostly watchdog_thresh = 10; 32int __read_mostly watchdog_thresh = 10;
33static int __read_mostly watchdog_disabled;
32 34
33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 35static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
34static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); 36static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
35static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); 37static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
36static DEFINE_PER_CPU(bool, softlockup_touch_sync); 38static DEFINE_PER_CPU(bool, softlockup_touch_sync);
37static DEFINE_PER_CPU(bool, soft_watchdog_warn); 39static DEFINE_PER_CPU(bool, soft_watchdog_warn);
40static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
41static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
38#ifdef CONFIG_HARDLOCKUP_DETECTOR 42#ifdef CONFIG_HARDLOCKUP_DETECTOR
39static DEFINE_PER_CPU(bool, hard_watchdog_warn); 43static DEFINE_PER_CPU(bool, hard_watchdog_warn);
40static DEFINE_PER_CPU(bool, watchdog_nmi_touch); 44static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
41static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
42static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); 45static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 46static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
44#endif 47#endif
@@ -248,13 +251,15 @@ static void watchdog_overflow_callback(struct perf_event *event,
248 __this_cpu_write(hard_watchdog_warn, false); 251 __this_cpu_write(hard_watchdog_warn, false);
249 return; 252 return;
250} 253}
254#endif /* CONFIG_HARDLOCKUP_DETECTOR */
255
251static void watchdog_interrupt_count(void) 256static void watchdog_interrupt_count(void)
252{ 257{
253 __this_cpu_inc(hrtimer_interrupts); 258 __this_cpu_inc(hrtimer_interrupts);
254} 259}
255#else 260
256static inline void watchdog_interrupt_count(void) { return; } 261static int watchdog_nmi_enable(unsigned int cpu);
257#endif /* CONFIG_HARDLOCKUP_DETECTOR */ 262static void watchdog_nmi_disable(unsigned int cpu);
258 263
259/* watchdog kicker functions */ 264/* watchdog kicker functions */
260static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) 265static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
@@ -327,49 +332,68 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
327 return HRTIMER_RESTART; 332 return HRTIMER_RESTART;
328} 333}
329 334
335static void watchdog_set_prio(unsigned int policy, unsigned int prio)
336{
337 struct sched_param param = { .sched_priority = prio };
330 338
331/* 339 sched_setscheduler(current, policy, &param);
332 * The watchdog thread - touches the timestamp. 340}
333 */ 341
334static int watchdog(void *unused) 342static void watchdog_enable(unsigned int cpu)
335{ 343{
336 struct sched_param param = { .sched_priority = 0 };
337 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 344 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
338 345
339 /* initialize timestamp */ 346 if (!watchdog_enabled) {
340 __touch_watchdog(); 347 kthread_park(current);
348 return;
349 }
350
351 /* Enable the perf event */
352 watchdog_nmi_enable(cpu);
341 353
342 /* kick off the timer for the hardlockup detector */ 354 /* kick off the timer for the hardlockup detector */
355 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
356 hrtimer->function = watchdog_timer_fn;
357
343 /* done here because hrtimer_start can only pin to smp_processor_id() */ 358 /* done here because hrtimer_start can only pin to smp_processor_id() */
344 hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), 359 hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
345 HRTIMER_MODE_REL_PINNED); 360 HRTIMER_MODE_REL_PINNED);
346 361
347 set_current_state(TASK_INTERRUPTIBLE); 362 /* initialize timestamp */
348 /* 363 watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
349 * Run briefly (kicked by the hrtimer callback function) once every 364 __touch_watchdog();
350 * get_sample_period() seconds (4 seconds by default) to reset the 365}
351 * softlockup timestamp. If this gets delayed for more than
352 * 2*watchdog_thresh seconds then the debug-printout triggers in
353 * watchdog_timer_fn().
354 */
355 while (!kthread_should_stop()) {
356 __touch_watchdog();
357 schedule();
358 366
359 if (kthread_should_stop()) 367static void watchdog_disable(unsigned int cpu)
360 break; 368{
369 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
361 370
362 set_current_state(TASK_INTERRUPTIBLE); 371 watchdog_set_prio(SCHED_NORMAL, 0);
363 } 372 hrtimer_cancel(hrtimer);
364 /* 373 /* disable the perf event */
365 * Drop the policy/priority elevation during thread exit to avoid a 374 watchdog_nmi_disable(cpu);
366 * scheduling latency spike.
367 */
368 __set_current_state(TASK_RUNNING);
369 sched_setscheduler(current, SCHED_NORMAL, &param);
370 return 0;
371} 375}
372 376
377static int watchdog_should_run(unsigned int cpu)
378{
379 return __this_cpu_read(hrtimer_interrupts) !=
380 __this_cpu_read(soft_lockup_hrtimer_cnt);
381}
382
383/*
384 * The watchdog thread function - touches the timestamp.
385 *
386 * It only runs once every get_sample_period() seconds (4 seconds by
387 * default) to reset the softlockup timestamp. If this gets delayed
388 * for more than 2*watchdog_thresh seconds then the debug-printout
389 * triggers in watchdog_timer_fn().
390 */
391static void watchdog(unsigned int cpu)
392{
393 __this_cpu_write(soft_lockup_hrtimer_cnt,
394 __this_cpu_read(hrtimer_interrupts));
395 __touch_watchdog();
396}
373 397
374#ifdef CONFIG_HARDLOCKUP_DETECTOR 398#ifdef CONFIG_HARDLOCKUP_DETECTOR
375/* 399/*
@@ -379,7 +403,7 @@ static int watchdog(void *unused)
379 */ 403 */
380static unsigned long cpu0_err; 404static unsigned long cpu0_err;
381 405
382static int watchdog_nmi_enable(int cpu) 406static int watchdog_nmi_enable(unsigned int cpu)
383{ 407{
384 struct perf_event_attr *wd_attr; 408 struct perf_event_attr *wd_attr;
385 struct perf_event *event = per_cpu(watchdog_ev, cpu); 409 struct perf_event *event = per_cpu(watchdog_ev, cpu);
@@ -433,7 +457,7 @@ out:
433 return 0; 457 return 0;
434} 458}
435 459
436static void watchdog_nmi_disable(int cpu) 460static void watchdog_nmi_disable(unsigned int cpu)
437{ 461{
438 struct perf_event *event = per_cpu(watchdog_ev, cpu); 462 struct perf_event *event = per_cpu(watchdog_ev, cpu);
439 463
@@ -447,107 +471,35 @@ static void watchdog_nmi_disable(int cpu)
447 return; 471 return;
448} 472}
449#else 473#else
450static int watchdog_nmi_enable(int cpu) { return 0; } 474static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
451static void watchdog_nmi_disable(int cpu) { return; } 475static void watchdog_nmi_disable(unsigned int cpu) { return; }
452#endif /* CONFIG_HARDLOCKUP_DETECTOR */ 476#endif /* CONFIG_HARDLOCKUP_DETECTOR */
453 477
454/* prepare/enable/disable routines */ 478/* prepare/enable/disable routines */
455static void watchdog_prepare_cpu(int cpu)
456{
457 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
458
459 WARN_ON(per_cpu(softlockup_watchdog, cpu));
460 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
461 hrtimer->function = watchdog_timer_fn;
462}
463
464static int watchdog_enable(int cpu)
465{
466 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
467 int err = 0;
468
469 /* enable the perf event */
470 err = watchdog_nmi_enable(cpu);
471
472 /* Regardless of err above, fall through and start softlockup */
473
474 /* create the watchdog thread */
475 if (!p) {
476 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
477 p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
478 if (IS_ERR(p)) {
479 pr_err("softlockup watchdog for %i failed\n", cpu);
480 if (!err) {
481 /* if hardlockup hasn't already set this */
482 err = PTR_ERR(p);
483 /* and disable the perf event */
484 watchdog_nmi_disable(cpu);
485 }
486 goto out;
487 }
488 sched_setscheduler(p, SCHED_FIFO, &param);
489 kthread_bind(p, cpu);
490 per_cpu(watchdog_touch_ts, cpu) = 0;
491 per_cpu(softlockup_watchdog, cpu) = p;
492 wake_up_process(p);
493 }
494
495out:
496 return err;
497}
498
499static void watchdog_disable(int cpu)
500{
501 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
502 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
503
504 /*
505 * cancel the timer first to stop incrementing the stats
506 * and waking up the kthread
507 */
508 hrtimer_cancel(hrtimer);
509
510 /* disable the perf event */
511 watchdog_nmi_disable(cpu);
512
513 /* stop the watchdog thread */
514 if (p) {
515 per_cpu(softlockup_watchdog, cpu) = NULL;
516 kthread_stop(p);
517 }
518}
519
520/* sysctl functions */ 479/* sysctl functions */
521#ifdef CONFIG_SYSCTL 480#ifdef CONFIG_SYSCTL
522static void watchdog_enable_all_cpus(void) 481static void watchdog_enable_all_cpus(void)
523{ 482{
524 int cpu; 483 unsigned int cpu;
525
526 watchdog_enabled = 0;
527
528 for_each_online_cpu(cpu)
529 if (!watchdog_enable(cpu))
530 /* if any cpu succeeds, watchdog is considered
531 enabled for the system */
532 watchdog_enabled = 1;
533
534 if (!watchdog_enabled)
535 pr_err("failed to be enabled on some cpus\n");
536 484
485 if (watchdog_disabled) {
486 watchdog_disabled = 0;
487 for_each_online_cpu(cpu)
488 kthread_unpark(per_cpu(softlockup_watchdog, cpu));
489 }
537} 490}
538 491
539static void watchdog_disable_all_cpus(void) 492static void watchdog_disable_all_cpus(void)
540{ 493{
541 int cpu; 494 unsigned int cpu;
542
543 for_each_online_cpu(cpu)
544 watchdog_disable(cpu);
545 495
546 /* if all watchdogs are disabled, then they are disabled for the system */ 496 if (!watchdog_disabled) {
547 watchdog_enabled = 0; 497 watchdog_disabled = 1;
498 for_each_online_cpu(cpu)
499 kthread_park(per_cpu(softlockup_watchdog, cpu));
500 }
548} 501}
549 502
550
551/* 503/*
552 * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh 504 * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
553 */ 505 */
@@ -557,73 +509,36 @@ int proc_dowatchdog(struct ctl_table *table, int write,
557{ 509{
558 int ret; 510 int ret;
559 511
512 if (watchdog_disabled < 0)
513 return -ENODEV;
514
560 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 515 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
561 if (ret || !write) 516 if (ret || !write)
562 goto out; 517 return ret;
563 518
564 if (watchdog_enabled && watchdog_thresh) 519 if (watchdog_enabled && watchdog_thresh)
565 watchdog_enable_all_cpus(); 520 watchdog_enable_all_cpus();
566 else 521 else
567 watchdog_disable_all_cpus(); 522 watchdog_disable_all_cpus();
568 523
569out:
570 return ret; 524 return ret;
571} 525}
572#endif /* CONFIG_SYSCTL */ 526#endif /* CONFIG_SYSCTL */
573 527
574 528static struct smp_hotplug_thread watchdog_threads = {
575/* 529 .store = &softlockup_watchdog,
576 * Create/destroy watchdog threads as CPUs come and go: 530 .thread_should_run = watchdog_should_run,
577 */ 531 .thread_fn = watchdog,
578static int __cpuinit 532 .thread_comm = "watchdog/%u",
579cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 533 .setup = watchdog_enable,
580{ 534 .park = watchdog_disable,
581 int hotcpu = (unsigned long)hcpu; 535 .unpark = watchdog_enable,
582
583 switch (action) {
584 case CPU_UP_PREPARE:
585 case CPU_UP_PREPARE_FROZEN:
586 watchdog_prepare_cpu(hotcpu);
587 break;
588 case CPU_ONLINE:
589 case CPU_ONLINE_FROZEN:
590 if (watchdog_enabled)
591 watchdog_enable(hotcpu);
592 break;
593#ifdef CONFIG_HOTPLUG_CPU
594 case CPU_UP_CANCELED:
595 case CPU_UP_CANCELED_FROZEN:
596 watchdog_disable(hotcpu);
597 break;
598 case CPU_DEAD:
599 case CPU_DEAD_FROZEN:
600 watchdog_disable(hotcpu);
601 break;
602#endif /* CONFIG_HOTPLUG_CPU */
603 }
604
605 /*
606 * hardlockup and softlockup are not important enough
607 * to block cpu bring up. Just always succeed and
608 * rely on printk output to flag problems.
609 */
610 return NOTIFY_OK;
611}
612
613static struct notifier_block __cpuinitdata cpu_nfb = {
614 .notifier_call = cpu_callback
615}; 536};
616 537
617void __init lockup_detector_init(void) 538void __init lockup_detector_init(void)
618{ 539{
619 void *cpu = (void *)(long)smp_processor_id(); 540 if (smpboot_register_percpu_thread(&watchdog_threads)) {
620 int err; 541 pr_err("Failed to create watchdog threads, disabled\n");
621 542 watchdog_disabled = -ENODEV;
622 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 543 }
623 WARN_ON(notifier_to_errno(err));
624
625 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
626 register_cpu_notifier(&cpu_nfb);
627
628 return;
629} 544}
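The watchdog conversion also shows a handy idiom for pairing an smpboot thread with a periodic per-CPU timer: the interrupt side bumps a counter, thread_should_run() compares it with a thread-side snapshot, and thread_fn() updates the snapshot, so the thread body runs exactly once per timer period. In sketch form (the my_* names are hypothetical):

#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, my_irq_events);	/* bumped in irq context */
static DEFINE_PER_CPU(unsigned long, my_seen_events);	/* thread-side snapshot */

/* Called from the pinned, per-CPU timer or interrupt handler. */
static void my_event_tick(void)
{
	__this_cpu_inc(my_irq_events);
}

/* smpboot callbacks: run the thread body once per new event. */
static int my_should_run(unsigned int cpu)
{
	return __this_cpu_read(my_irq_events) !=
	       __this_cpu_read(my_seen_events);
}

static void my_thread_fn(unsigned int cpu)
{
	__this_cpu_write(my_seen_events, __this_cpu_read(my_irq_events));
	/* ... the actual once-per-tick work goes here ... */
}

With the threads owned by the smpboot core, watchdog_enable_all_cpus() and watchdog_disable_all_cpus() above reduce to unparking or parking those per-CPU threads rather than creating and destroying them.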
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 2403a63b5da5..dacbbe4d7a80 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -629,6 +629,20 @@ config PROVE_RCU_REPEATEDLY
629 629
630 Say N if you are unsure. 630 Say N if you are unsure.
631 631
632config PROVE_RCU_DELAY
633 bool "RCU debugging: preemptible RCU race provocation"
634 depends on DEBUG_KERNEL && PREEMPT_RCU
635 default n
636 help
637 There is a class of races that involve an unlikely preemption
638 of __rcu_read_unlock() just after ->rcu_read_lock_nesting has
639 been set to INT_MIN. This feature inserts a delay at that
640 point to increase the probability of these races.
641
642 Say Y to increase probability of preemption of __rcu_read_unlock().
643
644 Say N if you are unsure.
645
632config SPARSE_RCU_POINTER 646config SPARSE_RCU_POINTER
633 bool "RCU debugging: sparse-based checks for pointer usage" 647 bool "RCU debugging: sparse-based checks for pointer usage"
634 default n 648 default n
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 45eb6217bf38..0de83b4541e9 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1483,13 +1483,11 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1483{ 1483{
1484 struct kmemleak_object *prev_obj = v; 1484 struct kmemleak_object *prev_obj = v;
1485 struct kmemleak_object *next_obj = NULL; 1485 struct kmemleak_object *next_obj = NULL;
1486 struct list_head *n = &prev_obj->object_list; 1486 struct kmemleak_object *obj = prev_obj;
1487 1487
1488 ++(*pos); 1488 ++(*pos);
1489 1489
1490 list_for_each_continue_rcu(n, &object_list) { 1490 list_for_each_entry_continue_rcu(obj, &object_list, object_list) {
1491 struct kmemleak_object *obj =
1492 list_entry(n, struct kmemleak_object, object_list);
1493 if (get_object(obj)) { 1491 if (get_object(obj)) {
1494 next_obj = obj; 1492 next_obj = obj;
1495 break; 1493 break;