author     Linus Torvalds <torvalds@linux-foundation.org>   2012-10-01 13:16:42 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2012-10-01 13:16:42 -0400
commit     620e77533f29796df7aff861e79bd72e08554ebb (patch)
tree       844afce2333549bc5b8d7dc87a4875b9216a0023
parent     6977b4c7736e8809b7959c66875a16c0bbcf2152 (diff)
parent     fa34da708cbe1e2d9a2ee7fc68ea8fccbf095d12 (diff)
Merge branch 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull RCU changes from Ingo Molnar:

 0. 'idle RCU': Adds RCU APIs that allow non-idle tasks to enter RCU
    idle mode and provides x86 code to make use of them, allowing RCU
    to treat user-mode execution as an extended quiescent state when
    the new RCU_USER_QS kernel configuration parameter is specified.
    (Work is in progress to port this to a few other architectures,
    but is not part of this series.)

 1. A fix for a latent bug that has been in RCU ever since the
    addition of CPU stall warnings.  This bug results in
    false-positive stall warnings, but thus far only on embedded
    systems with severely cut-down userspace configurations.

 2. Further reductions in latency spikes for huge systems, along with
    additional boot-time adaptation to the actual hardware.  This is a
    large change, as it moves RCU grace-period initialization and
    cleanup, along with quiescent-state forcing, from softirq to a
    kthread.  However, it appears to be in quite good shape (famous
    last words).

 3. Updates to documentation and rcutorture, the latter category
    including keeping statistics on CPU-hotplug latencies and fixing
    some initialization-time races.

 4. CPU-hotplug fixes and improvements.

 5. Idle-loop fixes that were omitted on an earlier submission.

 6. Miscellaneous fixes and improvements.

In certain RCU configurations new kernel threads will show up (rcu_bh,
rcu_sched), showing RCU processing overhead.

* 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (90 commits)
  rcu: Apply micro-optimization and int/bool fixes to RCU's idle handling
  rcu: Userspace RCU extended QS selftest
  x86: Exit RCU extended QS on notify resume
  x86: Use the new schedule_user API on userspace preemption
  rcu: Exit RCU extended QS on user preemption
  rcu: Exit RCU extended QS on kernel preemption after irq/exception
  x86: Exception hooks for userspace RCU extended QS
  x86: Unspaghettize do_general_protection()
  x86: Syscall hooks for userspace RCU extended QS
  rcu: Switch task's syscall hooks on context switch
  rcu: Ignore userspace extended quiescent state by default
  rcu: Allow rcu_user_enter()/exit() to nest
  rcu: Settle config for userspace extended quiescent state
  rcu: Make RCU_FAST_NO_HZ handle adaptive ticks
  rcu: New rcu_user_enter_after_irq() and rcu_user_exit_after_irq() APIs
  rcu: New rcu_user_enter() and rcu_user_exit() APIs
  ia64: Add missing RCU idle APIs on idle loop
  xtensa: Add missing RCU idle APIs on idle loop
  score: Add missing RCU idle APIs on idle loop
  parisc: Add missing RCU idle APIs on idle loop
  ...
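
To make item 5 concrete: the idle-loop portions of this series apply one and the
same pattern across architectures (see the arch/*/kernel/process.c hunks below),
bracketing the wait-for-work phase with the RCU idle APIs. A minimal sketch of
that pattern, not taken verbatim from any one architecture, with cpu_relax()
standing in for the arch-specific idle body:

        #include <linux/sched.h>
        #include <linux/rcupdate.h>

        void cpu_idle(void)
        {
                /* endless idle loop with no priority at all */
                while (1) {
                        rcu_idle_enter();       /* this CPU is now idle as far as RCU is concerned */
                        while (!need_resched())
                                cpu_relax();    /* stand-in for the arch-specific idle body */
                        rcu_idle_exit();        /* about to run tasks again: leave RCU idle mode */
                        schedule_preempt_disabled();
                }
        }
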
-rw-r--r--  Documentation/RCU/checklist.txt         6
-rw-r--r--  Documentation/RCU/stallwarn.txt        16
-rw-r--r--  Documentation/RCU/trace.txt            43
-rw-r--r--  Documentation/RCU/whatisRCU.txt         9
-rw-r--r--  Documentation/kernel-parameters.txt    11
-rw-r--r--  arch/Kconfig                           10
-rw-r--r--  arch/alpha/kernel/process.c             6
-rw-r--r--  arch/alpha/kernel/smp.c                 1
-rw-r--r--  arch/cris/kernel/process.c              3
-rw-r--r--  arch/frv/kernel/process.c               3
-rw-r--r--  arch/h8300/kernel/process.c             3
-rw-r--r--  arch/ia64/kernel/process.c              3
-rw-r--r--  arch/m32r/kernel/process.c              3
-rw-r--r--  arch/m68k/kernel/process.c              3
-rw-r--r--  arch/mn10300/kernel/process.c           3
-rw-r--r--  arch/parisc/kernel/process.c            3
-rw-r--r--  arch/score/kernel/process.c             4
-rw-r--r--  arch/um/drivers/mconsole_kern.c         1
-rw-r--r--  arch/x86/Kconfig                        1
-rw-r--r--  arch/x86/include/asm/rcu.h             32
-rw-r--r--  arch/x86/include/asm/thread_info.h     10
-rw-r--r--  arch/x86/kernel/cpuid.c                 5
-rw-r--r--  arch/x86/kernel/entry_64.S              9
-rw-r--r--  arch/x86/kernel/msr.c                   5
-rw-r--r--  arch/x86/kernel/ptrace.c                5
-rw-r--r--  arch/x86/kernel/signal.c                4
-rw-r--r--  arch/x86/kernel/traps.c               109
-rw-r--r--  arch/x86/mm/fault.c                    13
-rw-r--r--  arch/xtensa/kernel/process.c            3
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_irq.c 250
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_irq.h   6
-rw-r--r--  include/linux/interrupt.h               2
-rw-r--r--  include/linux/kthread.h                11
-rw-r--r--  include/linux/rcupdate.h               21
-rw-r--r--  include/linux/sched.h                   8
-rw-r--r--  include/linux/smpboot.h                43
-rw-r--r--  include/linux/tracepoint.h             28
-rw-r--r--  init/Kconfig                           18
-rw-r--r--  kernel/Makefile                         3
-rw-r--r--  kernel/cpu.c                           10
-rw-r--r--  kernel/kthread.c                      185
-rw-r--r--  kernel/rcupdate.c                       4
-rw-r--r--  kernel/rcutiny.c                       33
-rw-r--r--  kernel/rcutiny_plugin.h                10
-rw-r--r--  kernel/rcutorture.c                   159
-rw-r--r--  kernel/rcutree.c                      916
-rw-r--r--  kernel/rcutree.h                       50
-rw-r--r--  kernel/rcutree_plugin.h               597
-rw-r--r--  kernel/rcutree_trace.c                 22
-rw-r--r--  kernel/sched/core.c                    19
-rw-r--r--  kernel/smpboot.c                      233
-rw-r--r--  kernel/smpboot.h                        4
-rw-r--r--  kernel/softirq.c                      111
-rw-r--r--  kernel/time/tick-sched.c                3
-rw-r--r--  kernel/watchdog.c                     263
-rw-r--r--  lib/Kconfig.debug                      14
-rw-r--r--  mm/kmemleak.c                           6
57 files changed, 1932 insertions, 1424 deletions
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index fc103d7a0474..cdb20d41a44a 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -310,6 +310,12 @@ over a rather long period of time, but improvements are always welcome!
310 code under the influence of preempt_disable(), you instead 310 code under the influence of preempt_disable(), you instead
311 need to use synchronize_irq() or synchronize_sched(). 311 need to use synchronize_irq() or synchronize_sched().
312 312
313 This same limitation also applies to synchronize_rcu_bh()
314 and synchronize_srcu(), as well as to the asynchronous and
315 expedited forms of the three primitives, namely call_rcu(),
316 call_rcu_bh(), call_srcu(), synchronize_rcu_expedited(),
317 synchronize_rcu_bh_expedited(), and synchronize_srcu_expedited().
318
31312. Any lock acquired by an RCU callback must be acquired elsewhere 31912. Any lock acquired by an RCU callback must be acquired elsewhere
314 with softirq disabled, e.g., via spin_lock_irqsave(), 320 with softirq disabled, e.g., via spin_lock_irqsave(),
315 spin_lock_bh(), etc. Failing to disable irq on a given 321 spin_lock_bh(), etc. Failing to disable irq on a given
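
As a hedged aside (not part of the patch): the rule being extended in the
checklist hunk above is that readers protected only by preempt_disable() must
be paired with the sched-flavor grace-period primitives; synchronize_rcu_bh(),
synchronize_srcu(), and their asynchronous and expedited forms will not wait
for such readers. A minimal sketch, where struct foo and global_foo are
hypothetical names introduced only for illustration:

        #include <linux/kernel.h>
        #include <linux/preempt.h>
        #include <linux/rcupdate.h>
        #include <linux/slab.h>

        struct foo { int data; };
        static struct foo *global_foo;          /* hypothetical shared pointer */

        static void reader(void)
        {
                struct foo *p;

                preempt_disable();              /* acts as an RCU-sched read-side section */
                p = rcu_dereference_sched(global_foo);
                if (p)
                        pr_info("%d\n", p->data);
                preempt_enable();
        }

        static void updater(struct foo *newp)
        {
                struct foo *old = global_foo;

                rcu_assign_pointer(global_foo, newp);
                synchronize_sched();            /* waits for all preempt_disable() readers */
                kfree(old);
        }
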
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index 523364e4e1f1..1927151b386b 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -99,7 +99,7 @@ In kernels with CONFIG_RCU_FAST_NO_HZ, even more information is
99printed: 99printed:
100 100
101 INFO: rcu_preempt detected stall on CPU 101 INFO: rcu_preempt detected stall on CPU
102 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 drain=0 . timer=-1 102 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 drain=0 . timer not pending
103 (t=65000 jiffies) 103 (t=65000 jiffies)
104 104
105The "(64628 ticks this GP)" indicates that this CPU has taken more 105The "(64628 ticks this GP)" indicates that this CPU has taken more
@@ -116,13 +116,13 @@ number between the two "/"s is the value of the nesting, which will
116be a small positive number if in the idle loop and a very large positive 116be a small positive number if in the idle loop and a very large positive
117number (as shown above) otherwise. 117number (as shown above) otherwise.
118 118
119For CONFIG_RCU_FAST_NO_HZ kernels, the "drain=0" indicates that the 119For CONFIG_RCU_FAST_NO_HZ kernels, the "drain=0" indicates that the CPU is
120CPU is not in the process of trying to force itself into dyntick-idle 120not in the process of trying to force itself into dyntick-idle state, the
121state, the "." indicates that the CPU has not given up forcing RCU 121"." indicates that the CPU has not given up forcing RCU into dyntick-idle
122into dyntick-idle mode (it would be "H" otherwise), and the "timer=-1" 122mode (it would be "H" otherwise), and the "timer not pending" indicates
123indicates that the CPU has not recented forced RCU into dyntick-idle 123that the CPU has not recently forced RCU into dyntick-idle mode (it
124mode (it would otherwise indicate the number of microseconds remaining 124would otherwise indicate the number of microseconds remaining in this
125in this forced state). 125forced state).
126 126
127 127
128Multiple Warnings From One Stall 128Multiple Warnings From One Stall
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index f6f15ce39903..672d19083252 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -333,23 +333,23 @@ o Each element of the form "1/1 0:127 ^0" represents one struct
333The output of "cat rcu/rcu_pending" looks as follows: 333The output of "cat rcu/rcu_pending" looks as follows:
334 334
335rcu_sched: 335rcu_sched:
336 0 np=255892 qsp=53936 rpq=85 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741 336 0 np=255892 qsp=53936 rpq=85 cbr=0 cng=14417 gpc=10033 gps=24320 nn=146741
337 1 np=261224 qsp=54638 rpq=33 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792 337 1 np=261224 qsp=54638 rpq=33 cbr=0 cng=25723 gpc=16310 gps=2849 nn=155792
338 2 np=237496 qsp=49664 rpq=23 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629 338 2 np=237496 qsp=49664 rpq=23 cbr=0 cng=2762 gpc=45478 gps=1762 nn=136629
339 3 np=236249 qsp=48766 rpq=98 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723 339 3 np=236249 qsp=48766 rpq=98 cbr=0 cng=286 gpc=48049 gps=1218 nn=137723
340 4 np=221310 qsp=46850 rpq=7 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110 340 4 np=221310 qsp=46850 rpq=7 cbr=0 cng=26 gpc=43161 gps=4634 nn=123110
341 5 np=237332 qsp=48449 rpq=9 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456 341 5 np=237332 qsp=48449 rpq=9 cbr=0 cng=54 gpc=47920 gps=3252 nn=137456
342 6 np=219995 qsp=46718 rpq=12 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834 342 6 np=219995 qsp=46718 rpq=12 cbr=0 cng=50 gpc=42098 gps=6093 nn=120834
343 7 np=249893 qsp=49390 rpq=42 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888 343 7 np=249893 qsp=49390 rpq=42 cbr=0 cng=72 gpc=38400 gps=17102 nn=144888
344rcu_bh: 344rcu_bh:
345 0 np=146741 qsp=1419 rpq=6 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314 345 0 np=146741 qsp=1419 rpq=6 cbr=0 cng=6 gpc=0 gps=0 nn=145314
346 1 np=155792 qsp=12597 rpq=3 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180 346 1 np=155792 qsp=12597 rpq=3 cbr=0 cng=0 gpc=4 gps=8 nn=143180
347 2 np=136629 qsp=18680 rpq=1 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936 347 2 np=136629 qsp=18680 rpq=1 cbr=0 cng=0 gpc=7 gps=6 nn=117936
348 3 np=137723 qsp=2843 rpq=0 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863 348 3 np=137723 qsp=2843 rpq=0 cbr=0 cng=0 gpc=10 gps=7 nn=134863
349 4 np=123110 qsp=12433 rpq=0 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671 349 4 np=123110 qsp=12433 rpq=0 cbr=0 cng=0 gpc=4 gps=2 nn=110671
350 5 np=137456 qsp=4210 rpq=1 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235 350 5 np=137456 qsp=4210 rpq=1 cbr=0 cng=0 gpc=6 gps=5 nn=133235
351 6 np=120834 qsp=9902 rpq=2 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921 351 6 np=120834 qsp=9902 rpq=2 cbr=0 cng=0 gpc=6 gps=3 nn=110921
352 7 np=144888 qsp=26336 rpq=0 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542 352 7 np=144888 qsp=26336 rpq=0 cbr=0 cng=0 gpc=8 gps=2 nn=118542
353 353
354As always, this is once again split into "rcu_sched" and "rcu_bh" 354As always, this is once again split into "rcu_sched" and "rcu_bh"
355portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional 355portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional
@@ -377,17 +377,6 @@ o "gpc" is the number of times that an old grace period had
377o "gps" is the number of times that a new grace period had started, 377o "gps" is the number of times that a new grace period had started,
378 but this CPU was not yet aware of it. 378 but this CPU was not yet aware of it.
379 379
380o "nf" is the number of times that this CPU suspected that the
381 current grace period had run for too long, and thus needed to
382 be forced.
383
384 Please note that "forcing" consists of sending resched IPIs
385 to holdout CPUs. If that CPU really still is in an old RCU
386 read-side critical section, then we really do have to wait for it.
387 The assumption behing "forcing" is that the CPU is not still in
388 an old RCU read-side critical section, but has not yet responded
389 for some other reason.
390
391o "nn" is the number of times that this CPU needed nothing. Alert 380o "nn" is the number of times that this CPU needed nothing. Alert
392 readers will note that the rcu "nn" number for a given CPU very 381 readers will note that the rcu "nn" number for a given CPU very
393 closely matches the rcu_bh "np" number for that same CPU. This 382 closely matches the rcu_bh "np" number for that same CPU. This
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 69ee188515e7..bf0f6de2aa00 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -873,7 +873,7 @@ d. Do you need to treat NMI handlers, hardirq handlers,
873 and code segments with preemption disabled (whether 873 and code segments with preemption disabled (whether
874 via preempt_disable(), local_irq_save(), local_bh_disable(), 874 via preempt_disable(), local_irq_save(), local_bh_disable(),
875 or some other mechanism) as if they were explicit RCU readers? 875 or some other mechanism) as if they were explicit RCU readers?
876 If so, you need RCU-sched. 876 If so, RCU-sched is the only choice that will work for you.
877 877
878e. Do you need RCU grace periods to complete even in the face 878e. Do you need RCU grace periods to complete even in the face
879 of softirq monopolization of one or more of the CPUs? For 879 of softirq monopolization of one or more of the CPUs? For
@@ -884,7 +884,12 @@ f. Is your workload too update-intensive for normal use of
884 RCU, but inappropriate for other synchronization mechanisms? 884 RCU, but inappropriate for other synchronization mechanisms?
885 If so, consider SLAB_DESTROY_BY_RCU. But please be careful! 885 If so, consider SLAB_DESTROY_BY_RCU. But please be careful!
886 886
887g. Otherwise, use RCU. 887g. Do you need read-side critical sections that are respected
888 even though they are in the middle of the idle loop, during
889 user-mode execution, or on an offlined CPU? If so, SRCU is the
890 only choice that will work for you.
891
892h. Otherwise, use RCU.
888 893
889Of course, this all assumes that you have determined that RCU is in fact 894Of course, this all assumes that you have determined that RCU is in fact
890the right tool for your job. 895the right tool for your job.
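
A hedged sketch of what the new item g is pointing at (not from the patch):
SRCU readers carry their own srcu_struct, so they remain legal where classic
RCU readers are not, for example from the idle loop or an offlined CPU. The
names ss and my_data, and the placement of the init_srcu_struct() call, are
illustrative assumptions:

        #include <linux/kernel.h>
        #include <linux/slab.h>
        #include <linux/srcu.h>

        static struct srcu_struct ss;           /* call init_srcu_struct(&ss) at init time */
        static int *my_data;                    /* hypothetical SRCU-protected pointer */

        static void srcu_reader(void)
        {
                int idx, *p;

                idx = srcu_read_lock(&ss);      /* legal even from idle or offlined CPUs */
                p = srcu_dereference(my_data, &ss);
                if (p)
                        pr_info("%d\n", *p);
                srcu_read_unlock(&ss, idx);
        }

        static void srcu_updater(int *newp)
        {
                int *old = my_data;

                rcu_assign_pointer(my_data, newp);
                synchronize_srcu(&ss);          /* wait for pre-existing SRCU readers */
                kfree(old);
        }
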
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index ad7e2e5088c1..55ada0471f93 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2385,6 +2385,17 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
2385 rcutree.rcu_cpu_stall_timeout= [KNL,BOOT] 2385 rcutree.rcu_cpu_stall_timeout= [KNL,BOOT]
2386 Set timeout for RCU CPU stall warning messages. 2386 Set timeout for RCU CPU stall warning messages.
2387 2387
2388 rcutree.jiffies_till_first_fqs= [KNL,BOOT]
2389 Set delay from grace-period initialization to
2390 first attempt to force quiescent states.
2391 Units are jiffies, minimum value is zero,
2392 and maximum value is HZ.
2393
2394 rcutree.jiffies_till_next_fqs= [KNL,BOOT]
2395 Set delay between subsequent attempts to force
2396 quiescent states. Units are jiffies, minimum
2397 value is one, and maximum value is HZ.
2398
2388 rcutorture.fqs_duration= [KNL,BOOT] 2399 rcutorture.fqs_duration= [KNL,BOOT]
2389 Set duration of force_quiescent_state bursts. 2400 Set duration of force_quiescent_state bursts.
2390 2401
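
For example (the values here are purely illustrative), the two new rcutree
knobs documented above can be passed on the kernel command line alongside the
existing stall-timeout parameter:

        rcutree.jiffies_till_first_fqs=3 rcutree.jiffies_till_next_fqs=3 rcutree.rcu_cpu_stall_timeout=60
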
diff --git a/arch/Kconfig b/arch/Kconfig
index 72f2fa189cc5..1401a7587973 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -281,4 +281,14 @@ config SECCOMP_FILTER
281 281
282 See Documentation/prctl/seccomp_filter.txt for details. 282 See Documentation/prctl/seccomp_filter.txt for details.
283 283
284config HAVE_RCU_USER_QS
285 bool
286 help
287 Provide kernel entry/exit hooks necessary for userspace
288 RCU extended quiescent state. Syscalls need to be wrapped inside
289 rcu_user_exit()-rcu_user_enter() through the slow path using
290 TIF_NOHZ flag. Exceptions handlers must be wrapped as well. Irqs
291 are already protected inside rcu_irq_enter/rcu_irq_exit() but
292 preemption or signal handling on irq exit still need to be protected.
293
284source "kernel/gcov/Kconfig" 294source "kernel/gcov/Kconfig"
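
A hedged sketch of the hook placement this help text describes, modeled on the
x86 changes later in this series (the real x86 hooks sit in
syscall_trace_enter()/syscall_trace_leave() and in the exception_enter()/
exception_exit() wrappers; arch_syscall_entry_work() and
arch_syscall_exit_work() below are hypothetical stand-ins for an
architecture's TIF_NOHZ slow-path entry/exit code):

        #include <linux/rcupdate.h>

        /* Slow path taken when entering the kernel from userspace. */
        static void arch_syscall_entry_work(void)
        {
                rcu_user_exit();        /* leave the userspace extended quiescent state */
                /* ... tracing, audit, seccomp, and other entry work ... */
        }

        /* Slow path taken just before returning to userspace. */
        static void arch_syscall_exit_work(void)
        {
                /* ... tracing, audit, and other exit work ... */
                rcu_user_enter();       /* re-enter the userspace extended quiescent state */
        }
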
diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index d6fde98b74b3..83638aa096d5 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -28,6 +28,7 @@
28#include <linux/tty.h> 28#include <linux/tty.h>
29#include <linux/console.h> 29#include <linux/console.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/rcupdate.h>
31 32
32#include <asm/reg.h> 33#include <asm/reg.h>
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
@@ -54,9 +55,12 @@ cpu_idle(void)
54 /* FIXME -- EV6 and LCA45 know how to power down 55 /* FIXME -- EV6 and LCA45 know how to power down
55 the CPU. */ 56 the CPU. */
56 57
58 rcu_idle_enter();
57 while (!need_resched()) 59 while (!need_resched())
58 cpu_relax(); 60 cpu_relax();
59 schedule(); 61
62 rcu_idle_exit();
63 schedule_preempt_disabled();
60 } 64 }
61} 65}
62 66
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index 35ddc02bfa4a..a41ad90a97a6 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -166,6 +166,7 @@ smp_callin(void)
166 DBGS(("smp_callin: commencing CPU %d current %p active_mm %p\n", 166 DBGS(("smp_callin: commencing CPU %d current %p active_mm %p\n",
167 cpuid, current, current->active_mm)); 167 cpuid, current, current->active_mm));
168 168
169 preempt_disable();
169 /* Do nothing. */ 170 /* Do nothing. */
170 cpu_idle(); 171 cpu_idle();
171} 172}
diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c
index 66fd01728790..7f65be6f7f17 100644
--- a/arch/cris/kernel/process.c
+++ b/arch/cris/kernel/process.c
@@ -25,6 +25,7 @@
25#include <linux/elfcore.h> 25#include <linux/elfcore.h>
26#include <linux/mqueue.h> 26#include <linux/mqueue.h>
27#include <linux/reboot.h> 27#include <linux/reboot.h>
28#include <linux/rcupdate.h>
28 29
29//#define DEBUG 30//#define DEBUG
30 31
@@ -74,6 +75,7 @@ void cpu_idle (void)
74{ 75{
75 /* endless idle loop with no priority at all */ 76 /* endless idle loop with no priority at all */
76 while (1) { 77 while (1) {
78 rcu_idle_enter();
77 while (!need_resched()) { 79 while (!need_resched()) {
78 void (*idle)(void); 80 void (*idle)(void);
79 /* 81 /*
@@ -86,6 +88,7 @@ void cpu_idle (void)
86 idle = default_idle; 88 idle = default_idle;
87 idle(); 89 idle();
88 } 90 }
91 rcu_idle_exit();
89 schedule_preempt_disabled(); 92 schedule_preempt_disabled();
90 } 93 }
91} 94}
diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c
index ff95f50efea5..2eb7fa5bf9d8 100644
--- a/arch/frv/kernel/process.c
+++ b/arch/frv/kernel/process.c
@@ -25,6 +25,7 @@
25#include <linux/reboot.h> 25#include <linux/reboot.h>
26#include <linux/interrupt.h> 26#include <linux/interrupt.h>
27#include <linux/pagemap.h> 27#include <linux/pagemap.h>
28#include <linux/rcupdate.h>
28 29
29#include <asm/asm-offsets.h> 30#include <asm/asm-offsets.h>
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
@@ -69,12 +70,14 @@ void cpu_idle(void)
69{ 70{
70 /* endless idle loop with no priority at all */ 71 /* endless idle loop with no priority at all */
71 while (1) { 72 while (1) {
73 rcu_idle_enter();
72 while (!need_resched()) { 74 while (!need_resched()) {
73 check_pgt_cache(); 75 check_pgt_cache();
74 76
75 if (!frv_dma_inprogress && idle) 77 if (!frv_dma_inprogress && idle)
76 idle(); 78 idle();
77 } 79 }
80 rcu_idle_exit();
78 81
79 schedule_preempt_disabled(); 82 schedule_preempt_disabled();
80 } 83 }
diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c
index 0e9c315be104..f153ed1a4c08 100644
--- a/arch/h8300/kernel/process.c
+++ b/arch/h8300/kernel/process.c
@@ -36,6 +36,7 @@
36#include <linux/reboot.h> 36#include <linux/reboot.h>
37#include <linux/fs.h> 37#include <linux/fs.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/rcupdate.h>
39 40
40#include <asm/uaccess.h> 41#include <asm/uaccess.h>
41#include <asm/traps.h> 42#include <asm/traps.h>
@@ -78,8 +79,10 @@ void (*idle)(void) = default_idle;
78void cpu_idle(void) 79void cpu_idle(void)
79{ 80{
80 while (1) { 81 while (1) {
82 rcu_idle_enter();
81 while (!need_resched()) 83 while (!need_resched())
82 idle(); 84 idle();
85 rcu_idle_exit();
83 schedule_preempt_disabled(); 86 schedule_preempt_disabled();
84 } 87 }
85} 88}
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index dd6fc1449741..3e316ec0b835 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -29,6 +29,7 @@
29#include <linux/kdebug.h> 29#include <linux/kdebug.h>
30#include <linux/utsname.h> 30#include <linux/utsname.h>
31#include <linux/tracehook.h> 31#include <linux/tracehook.h>
32#include <linux/rcupdate.h>
32 33
33#include <asm/cpu.h> 34#include <asm/cpu.h>
34#include <asm/delay.h> 35#include <asm/delay.h>
@@ -279,6 +280,7 @@ cpu_idle (void)
279 280
280 /* endless idle loop with no priority at all */ 281 /* endless idle loop with no priority at all */
281 while (1) { 282 while (1) {
283 rcu_idle_enter();
282 if (can_do_pal_halt) { 284 if (can_do_pal_halt) {
283 current_thread_info()->status &= ~TS_POLLING; 285 current_thread_info()->status &= ~TS_POLLING;
284 /* 286 /*
@@ -309,6 +311,7 @@ cpu_idle (void)
309 normal_xtp(); 311 normal_xtp();
310#endif 312#endif
311 } 313 }
314 rcu_idle_exit();
312 schedule_preempt_disabled(); 315 schedule_preempt_disabled();
313 check_pgt_cache(); 316 check_pgt_cache();
314 if (cpu_is_offline(cpu)) 317 if (cpu_is_offline(cpu))
diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c
index 3a4a32b27208..384e63f3a4c4 100644
--- a/arch/m32r/kernel/process.c
+++ b/arch/m32r/kernel/process.c
@@ -26,6 +26,7 @@
26#include <linux/ptrace.h> 26#include <linux/ptrace.h>
27#include <linux/unistd.h> 27#include <linux/unistd.h>
28#include <linux/hardirq.h> 28#include <linux/hardirq.h>
29#include <linux/rcupdate.h>
29 30
30#include <asm/io.h> 31#include <asm/io.h>
31#include <asm/uaccess.h> 32#include <asm/uaccess.h>
@@ -82,6 +83,7 @@ void cpu_idle (void)
82{ 83{
83 /* endless idle loop with no priority at all */ 84 /* endless idle loop with no priority at all */
84 while (1) { 85 while (1) {
86 rcu_idle_enter();
85 while (!need_resched()) { 87 while (!need_resched()) {
86 void (*idle)(void) = pm_idle; 88 void (*idle)(void) = pm_idle;
87 89
@@ -90,6 +92,7 @@ void cpu_idle (void)
90 92
91 idle(); 93 idle();
92 } 94 }
95 rcu_idle_exit();
93 schedule_preempt_disabled(); 96 schedule_preempt_disabled();
94 } 97 }
95} 98}
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index c488e3cfab53..ac2892e49c7c 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -25,6 +25,7 @@
25#include <linux/reboot.h> 25#include <linux/reboot.h>
26#include <linux/init_task.h> 26#include <linux/init_task.h>
27#include <linux/mqueue.h> 27#include <linux/mqueue.h>
28#include <linux/rcupdate.h>
28 29
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
30#include <asm/traps.h> 31#include <asm/traps.h>
@@ -75,8 +76,10 @@ void cpu_idle(void)
75{ 76{
76 /* endless idle loop with no priority at all */ 77 /* endless idle loop with no priority at all */
77 while (1) { 78 while (1) {
79 rcu_idle_enter();
78 while (!need_resched()) 80 while (!need_resched())
79 idle(); 81 idle();
82 rcu_idle_exit();
80 schedule_preempt_disabled(); 83 schedule_preempt_disabled();
81 } 84 }
82} 85}
diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c
index 7dab0cd36466..e9cceba193b6 100644
--- a/arch/mn10300/kernel/process.c
+++ b/arch/mn10300/kernel/process.c
@@ -25,6 +25,7 @@
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/rcupdate.h>
28#include <asm/uaccess.h> 29#include <asm/uaccess.h>
29#include <asm/pgtable.h> 30#include <asm/pgtable.h>
30#include <asm/io.h> 31#include <asm/io.h>
@@ -107,6 +108,7 @@ void cpu_idle(void)
107{ 108{
108 /* endless idle loop with no priority at all */ 109 /* endless idle loop with no priority at all */
109 for (;;) { 110 for (;;) {
111 rcu_idle_enter();
110 while (!need_resched()) { 112 while (!need_resched()) {
111 void (*idle)(void); 113 void (*idle)(void);
112 114
@@ -121,6 +123,7 @@ void cpu_idle(void)
121 } 123 }
122 idle(); 124 idle();
123 } 125 }
126 rcu_idle_exit();
124 127
125 schedule_preempt_disabled(); 128 schedule_preempt_disabled();
126 } 129 }
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index 2c05a9292a81..8c6b6b6561f0 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -48,6 +48,7 @@
48#include <linux/unistd.h> 48#include <linux/unistd.h>
49#include <linux/kallsyms.h> 49#include <linux/kallsyms.h>
50#include <linux/uaccess.h> 50#include <linux/uaccess.h>
51#include <linux/rcupdate.h>
51 52
52#include <asm/io.h> 53#include <asm/io.h>
53#include <asm/asm-offsets.h> 54#include <asm/asm-offsets.h>
@@ -69,8 +70,10 @@ void cpu_idle(void)
69 70
70 /* endless idle loop with no priority at all */ 71 /* endless idle loop with no priority at all */
71 while (1) { 72 while (1) {
73 rcu_idle_enter();
72 while (!need_resched()) 74 while (!need_resched())
73 barrier(); 75 barrier();
76 rcu_idle_exit();
74 schedule_preempt_disabled(); 77 schedule_preempt_disabled();
75 check_pgt_cache(); 78 check_pgt_cache();
76 } 79 }
diff --git a/arch/score/kernel/process.c b/arch/score/kernel/process.c
index 2707023c7563..637970cfd3f4 100644
--- a/arch/score/kernel/process.c
+++ b/arch/score/kernel/process.c
@@ -27,6 +27,7 @@
27#include <linux/reboot.h> 27#include <linux/reboot.h>
28#include <linux/elfcore.h> 28#include <linux/elfcore.h>
29#include <linux/pm.h> 29#include <linux/pm.h>
30#include <linux/rcupdate.h>
30 31
31void (*pm_power_off)(void); 32void (*pm_power_off)(void);
32EXPORT_SYMBOL(pm_power_off); 33EXPORT_SYMBOL(pm_power_off);
@@ -50,9 +51,10 @@ void __noreturn cpu_idle(void)
50{ 51{
51 /* endless idle loop with no priority at all */ 52 /* endless idle loop with no priority at all */
52 while (1) { 53 while (1) {
54 rcu_idle_enter();
53 while (!need_resched()) 55 while (!need_resched())
54 barrier(); 56 barrier();
55 57 rcu_idle_exit();
56 schedule_preempt_disabled(); 58 schedule_preempt_disabled();
57 } 59 }
58} 60}
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 664a60e8dfb4..c17de0db6736 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -705,6 +705,7 @@ static void stack_proc(void *arg)
705 struct task_struct *from = current, *to = arg; 705 struct task_struct *from = current, *to = arg;
706 706
707 to->thread.saved_task = from; 707 to->thread.saved_task = from;
708 rcu_switch(from, to);
708 switch_to(from, to, from); 709 switch_to(from, to, from);
709} 710}
710 711
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 50a1d1f9b6d3..20c49b8450b8 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -97,6 +97,7 @@ config X86
97 select KTIME_SCALAR if X86_32 97 select KTIME_SCALAR if X86_32
98 select GENERIC_STRNCPY_FROM_USER 98 select GENERIC_STRNCPY_FROM_USER
99 select GENERIC_STRNLEN_USER 99 select GENERIC_STRNLEN_USER
100 select HAVE_RCU_USER_QS if X86_64
100 101
101config INSTRUCTION_DECODER 102config INSTRUCTION_DECODER
102 def_bool (KPROBES || PERF_EVENTS || UPROBES) 103 def_bool (KPROBES || PERF_EVENTS || UPROBES)
diff --git a/arch/x86/include/asm/rcu.h b/arch/x86/include/asm/rcu.h
new file mode 100644
index 000000000000..d1ac07a23979
--- /dev/null
+++ b/arch/x86/include/asm/rcu.h
@@ -0,0 +1,32 @@
1#ifndef _ASM_X86_RCU_H
2#define _ASM_X86_RCU_H
3
4#ifndef __ASSEMBLY__
5
6#include <linux/rcupdate.h>
7#include <asm/ptrace.h>
8
9static inline void exception_enter(struct pt_regs *regs)
10{
11 rcu_user_exit();
12}
13
14static inline void exception_exit(struct pt_regs *regs)
15{
16#ifdef CONFIG_RCU_USER_QS
17 if (user_mode(regs))
18 rcu_user_enter();
19#endif
20}
21
22#else /* __ASSEMBLY__ */
23
24#ifdef CONFIG_RCU_USER_QS
25# define SCHEDULE_USER call schedule_user
26#else
27# define SCHEDULE_USER call schedule
28#endif
29
30#endif /* !__ASSEMBLY__ */
31
32#endif
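
A brief sketch (not from the patch) of how a handler uses the wrappers defined
in this new header; the hypothetical do_example_trap() mirrors the pattern the
series applies to the real handlers in arch/x86/kernel/traps.c below:

        #include <asm/rcu.h>
        #include <asm/traps.h>

        dotraplinkage void do_example_trap(struct pt_regs *regs, long error_code)
        {
                exception_enter(regs);  /* leave user extended QS if we trapped from userspace */
                /* ... handle the trap ... */
                exception_exit(regs);   /* re-enter user extended QS when returning to userspace */
        }
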
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 89f794f007ec..c535d847e3b5 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -89,6 +89,7 @@ struct thread_info {
89#define TIF_NOTSC 16 /* TSC is not accessible in userland */ 89#define TIF_NOTSC 16 /* TSC is not accessible in userland */
90#define TIF_IA32 17 /* IA32 compatibility process */ 90#define TIF_IA32 17 /* IA32 compatibility process */
91#define TIF_FORK 18 /* ret_from_fork */ 91#define TIF_FORK 18 /* ret_from_fork */
92#define TIF_NOHZ 19 /* in adaptive nohz mode */
92#define TIF_MEMDIE 20 /* is terminating due to OOM killer */ 93#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
93#define TIF_DEBUG 21 /* uses debug registers */ 94#define TIF_DEBUG 21 /* uses debug registers */
94#define TIF_IO_BITMAP 22 /* uses I/O bitmap */ 95#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
@@ -114,6 +115,7 @@ struct thread_info {
114#define _TIF_NOTSC (1 << TIF_NOTSC) 115#define _TIF_NOTSC (1 << TIF_NOTSC)
115#define _TIF_IA32 (1 << TIF_IA32) 116#define _TIF_IA32 (1 << TIF_IA32)
116#define _TIF_FORK (1 << TIF_FORK) 117#define _TIF_FORK (1 << TIF_FORK)
118#define _TIF_NOHZ (1 << TIF_NOHZ)
117#define _TIF_DEBUG (1 << TIF_DEBUG) 119#define _TIF_DEBUG (1 << TIF_DEBUG)
118#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) 120#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
119#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) 121#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
@@ -126,12 +128,13 @@ struct thread_info {
126/* work to do in syscall_trace_enter() */ 128/* work to do in syscall_trace_enter() */
127#define _TIF_WORK_SYSCALL_ENTRY \ 129#define _TIF_WORK_SYSCALL_ENTRY \
128 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \ 130 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \
129 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT) 131 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \
132 _TIF_NOHZ)
130 133
131/* work to do in syscall_trace_leave() */ 134/* work to do in syscall_trace_leave() */
132#define _TIF_WORK_SYSCALL_EXIT \ 135#define _TIF_WORK_SYSCALL_EXIT \
133 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \ 136 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \
134 _TIF_SYSCALL_TRACEPOINT) 137 _TIF_SYSCALL_TRACEPOINT | _TIF_NOHZ)
135 138
136/* work to do on interrupt/exception return */ 139/* work to do on interrupt/exception return */
137#define _TIF_WORK_MASK \ 140#define _TIF_WORK_MASK \
@@ -141,7 +144,8 @@ struct thread_info {
141 144
142/* work to do on any return to user space */ 145/* work to do on any return to user space */
143#define _TIF_ALLWORK_MASK \ 146#define _TIF_ALLWORK_MASK \
144 ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT) 147 ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT | \
148 _TIF_NOHZ)
145 149
146/* Only used for 64 bit */ 150/* Only used for 64 bit */
147#define _TIF_DO_NOTIFY_MASK \ 151#define _TIF_DO_NOTIFY_MASK \
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 39472dd2323f..60c78917190c 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -199,12 +199,14 @@ static int __init cpuid_init(void)
199 goto out_chrdev; 199 goto out_chrdev;
200 } 200 }
201 cpuid_class->devnode = cpuid_devnode; 201 cpuid_class->devnode = cpuid_devnode;
202 get_online_cpus();
202 for_each_online_cpu(i) { 203 for_each_online_cpu(i) {
203 err = cpuid_device_create(i); 204 err = cpuid_device_create(i);
204 if (err != 0) 205 if (err != 0)
205 goto out_class; 206 goto out_class;
206 } 207 }
207 register_hotcpu_notifier(&cpuid_class_cpu_notifier); 208 register_hotcpu_notifier(&cpuid_class_cpu_notifier);
209 put_online_cpus();
208 210
209 err = 0; 211 err = 0;
210 goto out; 212 goto out;
@@ -214,6 +216,7 @@ out_class:
214 for_each_online_cpu(i) { 216 for_each_online_cpu(i) {
215 cpuid_device_destroy(i); 217 cpuid_device_destroy(i);
216 } 218 }
219 put_online_cpus();
217 class_destroy(cpuid_class); 220 class_destroy(cpuid_class);
218out_chrdev: 221out_chrdev:
219 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); 222 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
@@ -225,11 +228,13 @@ static void __exit cpuid_exit(void)
225{ 228{
226 int cpu = 0; 229 int cpu = 0;
227 230
231 get_online_cpus();
228 for_each_online_cpu(cpu) 232 for_each_online_cpu(cpu)
229 cpuid_device_destroy(cpu); 233 cpuid_device_destroy(cpu);
230 class_destroy(cpuid_class); 234 class_destroy(cpuid_class);
231 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); 235 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
232 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); 236 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
237 put_online_cpus();
233} 238}
234 239
235module_init(cpuid_init); 240module_init(cpuid_init);
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 69babd8c834f..1a8f3cbb6ee3 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -56,6 +56,7 @@
56#include <asm/ftrace.h> 56#include <asm/ftrace.h>
57#include <asm/percpu.h> 57#include <asm/percpu.h>
58#include <asm/asm.h> 58#include <asm/asm.h>
59#include <asm/rcu.h>
59#include <linux/err.h> 60#include <linux/err.h>
60 61
61/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 62/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
@@ -565,7 +566,7 @@ sysret_careful:
565 TRACE_IRQS_ON 566 TRACE_IRQS_ON
566 ENABLE_INTERRUPTS(CLBR_NONE) 567 ENABLE_INTERRUPTS(CLBR_NONE)
567 pushq_cfi %rdi 568 pushq_cfi %rdi
568 call schedule 569 SCHEDULE_USER
569 popq_cfi %rdi 570 popq_cfi %rdi
570 jmp sysret_check 571 jmp sysret_check
571 572
@@ -678,7 +679,7 @@ int_careful:
678 TRACE_IRQS_ON 679 TRACE_IRQS_ON
679 ENABLE_INTERRUPTS(CLBR_NONE) 680 ENABLE_INTERRUPTS(CLBR_NONE)
680 pushq_cfi %rdi 681 pushq_cfi %rdi
681 call schedule 682 SCHEDULE_USER
682 popq_cfi %rdi 683 popq_cfi %rdi
683 DISABLE_INTERRUPTS(CLBR_NONE) 684 DISABLE_INTERRUPTS(CLBR_NONE)
684 TRACE_IRQS_OFF 685 TRACE_IRQS_OFF
@@ -974,7 +975,7 @@ retint_careful:
974 TRACE_IRQS_ON 975 TRACE_IRQS_ON
975 ENABLE_INTERRUPTS(CLBR_NONE) 976 ENABLE_INTERRUPTS(CLBR_NONE)
976 pushq_cfi %rdi 977 pushq_cfi %rdi
977 call schedule 978 SCHEDULE_USER
978 popq_cfi %rdi 979 popq_cfi %rdi
979 GET_THREAD_INFO(%rcx) 980 GET_THREAD_INFO(%rcx)
980 DISABLE_INTERRUPTS(CLBR_NONE) 981 DISABLE_INTERRUPTS(CLBR_NONE)
@@ -1449,7 +1450,7 @@ paranoid_userspace:
1449paranoid_schedule: 1450paranoid_schedule:
1450 TRACE_IRQS_ON 1451 TRACE_IRQS_ON
1451 ENABLE_INTERRUPTS(CLBR_ANY) 1452 ENABLE_INTERRUPTS(CLBR_ANY)
1452 call schedule 1453 SCHEDULE_USER
1453 DISABLE_INTERRUPTS(CLBR_ANY) 1454 DISABLE_INTERRUPTS(CLBR_ANY)
1454 TRACE_IRQS_OFF 1455 TRACE_IRQS_OFF
1455 jmp paranoid_userspace 1456 jmp paranoid_userspace
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index eb113693f043..a7c5661f8496 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -257,12 +257,14 @@ static int __init msr_init(void)
257 goto out_chrdev; 257 goto out_chrdev;
258 } 258 }
259 msr_class->devnode = msr_devnode; 259 msr_class->devnode = msr_devnode;
260 get_online_cpus();
260 for_each_online_cpu(i) { 261 for_each_online_cpu(i) {
261 err = msr_device_create(i); 262 err = msr_device_create(i);
262 if (err != 0) 263 if (err != 0)
263 goto out_class; 264 goto out_class;
264 } 265 }
265 register_hotcpu_notifier(&msr_class_cpu_notifier); 266 register_hotcpu_notifier(&msr_class_cpu_notifier);
267 put_online_cpus();
266 268
267 err = 0; 269 err = 0;
268 goto out; 270 goto out;
@@ -271,6 +273,7 @@ out_class:
271 i = 0; 273 i = 0;
272 for_each_online_cpu(i) 274 for_each_online_cpu(i)
273 msr_device_destroy(i); 275 msr_device_destroy(i);
276 put_online_cpus();
274 class_destroy(msr_class); 277 class_destroy(msr_class);
275out_chrdev: 278out_chrdev:
276 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); 279 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
@@ -281,11 +284,13 @@ out:
281static void __exit msr_exit(void) 284static void __exit msr_exit(void)
282{ 285{
283 int cpu = 0; 286 int cpu = 0;
287 get_online_cpus();
284 for_each_online_cpu(cpu) 288 for_each_online_cpu(cpu)
285 msr_device_destroy(cpu); 289 msr_device_destroy(cpu);
286 class_destroy(msr_class); 290 class_destroy(msr_class);
287 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); 291 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
288 unregister_hotcpu_notifier(&msr_class_cpu_notifier); 292 unregister_hotcpu_notifier(&msr_class_cpu_notifier);
293 put_online_cpus();
289} 294}
290 295
291module_init(msr_init); 296module_init(msr_init);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index c4c6a5c2bf0f..9f94f8ec26e4 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -21,6 +21,7 @@
21#include <linux/signal.h> 21#include <linux/signal.h>
22#include <linux/perf_event.h> 22#include <linux/perf_event.h>
23#include <linux/hw_breakpoint.h> 23#include <linux/hw_breakpoint.h>
24#include <linux/rcupdate.h>
24 25
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26#include <asm/pgtable.h> 27#include <asm/pgtable.h>
@@ -1463,6 +1464,8 @@ long syscall_trace_enter(struct pt_regs *regs)
1463{ 1464{
1464 long ret = 0; 1465 long ret = 0;
1465 1466
1467 rcu_user_exit();
1468
1466 /* 1469 /*
1467 * If we stepped into a sysenter/syscall insn, it trapped in 1470 * If we stepped into a sysenter/syscall insn, it trapped in
1468 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. 1471 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
@@ -1526,4 +1529,6 @@ void syscall_trace_leave(struct pt_regs *regs)
1526 !test_thread_flag(TIF_SYSCALL_EMU); 1529 !test_thread_flag(TIF_SYSCALL_EMU);
1527 if (step || test_thread_flag(TIF_SYSCALL_TRACE)) 1530 if (step || test_thread_flag(TIF_SYSCALL_TRACE))
1528 tracehook_report_syscall_exit(regs, step); 1531 tracehook_report_syscall_exit(regs, step);
1532
1533 rcu_user_enter();
1529} 1534}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index b280908a376e..bca0ab903e57 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -779,6 +779,8 @@ static void do_signal(struct pt_regs *regs)
779void 779void
780do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 780do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
781{ 781{
782 rcu_user_exit();
783
782#ifdef CONFIG_X86_MCE 784#ifdef CONFIG_X86_MCE
783 /* notify userspace of pending MCEs */ 785 /* notify userspace of pending MCEs */
784 if (thread_info_flags & _TIF_MCE_NOTIFY) 786 if (thread_info_flags & _TIF_MCE_NOTIFY)
@@ -804,6 +806,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
804#ifdef CONFIG_X86_32 806#ifdef CONFIG_X86_32
805 clear_thread_flag(TIF_IRET); 807 clear_thread_flag(TIF_IRET);
806#endif /* CONFIG_X86_32 */ 808#endif /* CONFIG_X86_32 */
809
810 rcu_user_enter();
807} 811}
808 812
809void signal_fault(struct pt_regs *regs, void __user *frame, char *where) 813void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b481341c9369..378967578f22 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -55,6 +55,7 @@
55#include <asm/i387.h> 55#include <asm/i387.h>
56#include <asm/fpu-internal.h> 56#include <asm/fpu-internal.h>
57#include <asm/mce.h> 57#include <asm/mce.h>
58#include <asm/rcu.h>
58 59
59#include <asm/mach_traps.h> 60#include <asm/mach_traps.h>
60 61
@@ -180,11 +181,15 @@ vm86_trap:
180#define DO_ERROR(trapnr, signr, str, name) \ 181#define DO_ERROR(trapnr, signr, str, name) \
181dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ 182dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
182{ \ 183{ \
183 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 184 exception_enter(regs); \
184 == NOTIFY_STOP) \ 185 if (notify_die(DIE_TRAP, str, regs, error_code, \
186 trapnr, signr) == NOTIFY_STOP) { \
187 exception_exit(regs); \
185 return; \ 188 return; \
189 } \
186 conditional_sti(regs); \ 190 conditional_sti(regs); \
187 do_trap(trapnr, signr, str, regs, error_code, NULL); \ 191 do_trap(trapnr, signr, str, regs, error_code, NULL); \
192 exception_exit(regs); \
188} 193}
189 194
190#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ 195#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
@@ -195,11 +200,15 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
195 info.si_errno = 0; \ 200 info.si_errno = 0; \
196 info.si_code = sicode; \ 201 info.si_code = sicode; \
197 info.si_addr = (void __user *)siaddr; \ 202 info.si_addr = (void __user *)siaddr; \
198 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 203 exception_enter(regs); \
199 == NOTIFY_STOP) \ 204 if (notify_die(DIE_TRAP, str, regs, error_code, \
205 trapnr, signr) == NOTIFY_STOP) { \
206 exception_exit(regs); \
200 return; \ 207 return; \
208 } \
201 conditional_sti(regs); \ 209 conditional_sti(regs); \
202 do_trap(trapnr, signr, str, regs, error_code, &info); \ 210 do_trap(trapnr, signr, str, regs, error_code, &info); \
211 exception_exit(regs); \
203} 212}
204 213
205DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, 214DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV,
@@ -222,12 +231,14 @@ DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check,
222/* Runs on IST stack */ 231/* Runs on IST stack */
223dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) 232dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
224{ 233{
234 exception_enter(regs);
225 if (notify_die(DIE_TRAP, "stack segment", regs, error_code, 235 if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
226 X86_TRAP_SS, SIGBUS) == NOTIFY_STOP) 236 X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) {
227 return; 237 preempt_conditional_sti(regs);
228 preempt_conditional_sti(regs); 238 do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL);
229 do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); 239 preempt_conditional_cli(regs);
230 preempt_conditional_cli(regs); 240 }
241 exception_exit(regs);
231} 242}
232 243
233dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) 244dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
@@ -235,6 +246,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
235 static const char str[] = "double fault"; 246 static const char str[] = "double fault";
236 struct task_struct *tsk = current; 247 struct task_struct *tsk = current;
237 248
249 exception_enter(regs);
238 /* Return not checked because double check cannot be ignored */ 250 /* Return not checked because double check cannot be ignored */
239 notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); 251 notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
240 252
@@ -255,16 +267,29 @@ do_general_protection(struct pt_regs *regs, long error_code)
255{ 267{
256 struct task_struct *tsk; 268 struct task_struct *tsk;
257 269
270 exception_enter(regs);
258 conditional_sti(regs); 271 conditional_sti(regs);
259 272
260#ifdef CONFIG_X86_32 273#ifdef CONFIG_X86_32
261 if (regs->flags & X86_VM_MASK) 274 if (regs->flags & X86_VM_MASK) {
262 goto gp_in_vm86; 275 local_irq_enable();
276 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
277 goto exit;
278 }
263#endif 279#endif
264 280
265 tsk = current; 281 tsk = current;
266 if (!user_mode(regs)) 282 if (!user_mode(regs)) {
267 goto gp_in_kernel; 283 if (fixup_exception(regs))
284 goto exit;
285
286 tsk->thread.error_code = error_code;
287 tsk->thread.trap_nr = X86_TRAP_GP;
288 if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
289 X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
290 die("general protection fault", regs, error_code);
291 goto exit;
292 }
268 293
269 tsk->thread.error_code = error_code; 294 tsk->thread.error_code = error_code;
270 tsk->thread.trap_nr = X86_TRAP_GP; 295 tsk->thread.trap_nr = X86_TRAP_GP;
@@ -279,25 +304,8 @@ do_general_protection(struct pt_regs *regs, long error_code)
279 } 304 }
280 305
281 force_sig(SIGSEGV, tsk); 306 force_sig(SIGSEGV, tsk);
282 return; 307exit:
283 308 exception_exit(regs);
284#ifdef CONFIG_X86_32
285gp_in_vm86:
286 local_irq_enable();
287 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
288 return;
289#endif
290
291gp_in_kernel:
292 if (fixup_exception(regs))
293 return;
294
295 tsk->thread.error_code = error_code;
296 tsk->thread.trap_nr = X86_TRAP_GP;
297 if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
298 X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP)
299 return;
300 die("general protection fault", regs, error_code);
301} 309}
302 310
303/* May run on IST stack. */ 311/* May run on IST stack. */
@@ -312,15 +320,16 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co
312 ftrace_int3_handler(regs)) 320 ftrace_int3_handler(regs))
313 return; 321 return;
314#endif 322#endif
323 exception_enter(regs);
315#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP 324#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
316 if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, 325 if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
317 SIGTRAP) == NOTIFY_STOP) 326 SIGTRAP) == NOTIFY_STOP)
318 return; 327 goto exit;
319#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ 328#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
320 329
321 if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, 330 if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
322 SIGTRAP) == NOTIFY_STOP) 331 SIGTRAP) == NOTIFY_STOP)
323 return; 332 goto exit;
324 333
325 /* 334 /*
326 * Let others (NMI) know that the debug stack is in use 335 * Let others (NMI) know that the debug stack is in use
@@ -331,6 +340,8 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co
331 do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); 340 do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
332 preempt_conditional_cli(regs); 341 preempt_conditional_cli(regs);
333 debug_stack_usage_dec(); 342 debug_stack_usage_dec();
343exit:
344 exception_exit(regs);
334} 345}
335 346
336#ifdef CONFIG_X86_64 347#ifdef CONFIG_X86_64
@@ -391,6 +402,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
391 unsigned long dr6; 402 unsigned long dr6;
392 int si_code; 403 int si_code;
393 404
405 exception_enter(regs);
406
394 get_debugreg(dr6, 6); 407 get_debugreg(dr6, 6);
395 408
396 /* Filter out all the reserved bits which are preset to 1 */ 409 /* Filter out all the reserved bits which are preset to 1 */
@@ -406,7 +419,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
406 419
407 /* Catch kmemcheck conditions first of all! */ 420 /* Catch kmemcheck conditions first of all! */
408 if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) 421 if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
409 return; 422 goto exit;
410 423
411 /* DR6 may or may not be cleared by the CPU */ 424 /* DR6 may or may not be cleared by the CPU */
412 set_debugreg(0, 6); 425 set_debugreg(0, 6);
@@ -421,7 +434,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
421 434
422 if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code, 435 if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
423 SIGTRAP) == NOTIFY_STOP) 436 SIGTRAP) == NOTIFY_STOP)
424 return; 437 goto exit;
425 438
426 /* 439 /*
427 * Let others (NMI) know that the debug stack is in use 440 * Let others (NMI) know that the debug stack is in use
@@ -437,7 +450,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
437 X86_TRAP_DB); 450 X86_TRAP_DB);
438 preempt_conditional_cli(regs); 451 preempt_conditional_cli(regs);
439 debug_stack_usage_dec(); 452 debug_stack_usage_dec();
440 return; 453 goto exit;
441 } 454 }
442 455
443 /* 456 /*
@@ -458,7 +471,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
458 preempt_conditional_cli(regs); 471 preempt_conditional_cli(regs);
459 debug_stack_usage_dec(); 472 debug_stack_usage_dec();
460 473
461 return; 474exit:
475 exception_exit(regs);
462} 476}
463 477
464/* 478/*
@@ -555,14 +569,17 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
555#ifdef CONFIG_X86_32 569#ifdef CONFIG_X86_32
556 ignore_fpu_irq = 1; 570 ignore_fpu_irq = 1;
557#endif 571#endif
558 572 exception_enter(regs);
559 math_error(regs, error_code, X86_TRAP_MF); 573 math_error(regs, error_code, X86_TRAP_MF);
574 exception_exit(regs);
560} 575}
561 576
562dotraplinkage void 577dotraplinkage void
563do_simd_coprocessor_error(struct pt_regs *regs, long error_code) 578do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
564{ 579{
580 exception_enter(regs);
565 math_error(regs, error_code, X86_TRAP_XF); 581 math_error(regs, error_code, X86_TRAP_XF);
582 exception_exit(regs);
566} 583}
567 584
568dotraplinkage void 585dotraplinkage void
@@ -629,6 +646,7 @@ EXPORT_SYMBOL_GPL(math_state_restore);
629dotraplinkage void __kprobes 646dotraplinkage void __kprobes
630do_device_not_available(struct pt_regs *regs, long error_code) 647do_device_not_available(struct pt_regs *regs, long error_code)
631{ 648{
649 exception_enter(regs);
632#ifdef CONFIG_MATH_EMULATION 650#ifdef CONFIG_MATH_EMULATION
633 if (read_cr0() & X86_CR0_EM) { 651 if (read_cr0() & X86_CR0_EM) {
634 struct math_emu_info info = { }; 652 struct math_emu_info info = { };
@@ -637,6 +655,7 @@ do_device_not_available(struct pt_regs *regs, long error_code)
637 655
638 info.regs = regs; 656 info.regs = regs;
639 math_emulate(&info); 657 math_emulate(&info);
658 exception_exit(regs);
640 return; 659 return;
641 } 660 }
642#endif 661#endif
@@ -644,12 +663,15 @@ do_device_not_available(struct pt_regs *regs, long error_code)
644#ifdef CONFIG_X86_32 663#ifdef CONFIG_X86_32
645 conditional_sti(regs); 664 conditional_sti(regs);
646#endif 665#endif
666 exception_exit(regs);
647} 667}
648 668
649#ifdef CONFIG_X86_32 669#ifdef CONFIG_X86_32
650dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) 670dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
651{ 671{
652 siginfo_t info; 672 siginfo_t info;
673
674 exception_enter(regs);
653 local_irq_enable(); 675 local_irq_enable();
654 676
655 info.si_signo = SIGILL; 677 info.si_signo = SIGILL;
@@ -657,10 +679,11 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
657 info.si_code = ILL_BADSTK; 679 info.si_code = ILL_BADSTK;
658 info.si_addr = NULL; 680 info.si_addr = NULL;
659 if (notify_die(DIE_TRAP, "iret exception", regs, error_code, 681 if (notify_die(DIE_TRAP, "iret exception", regs, error_code,
660 X86_TRAP_IRET, SIGILL) == NOTIFY_STOP) 682 X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) {
661 return; 683 do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
662 do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, 684 &info);
663 &info); 685 }
686 exception_exit(regs);
664} 687}
665#endif 688#endif
666 689
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 76dcd9d8e0bc..7dde46d68a25 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -18,6 +18,7 @@
18#include <asm/pgalloc.h> /* pgd_*(), ... */ 18#include <asm/pgalloc.h> /* pgd_*(), ... */
19#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ 19#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
20#include <asm/fixmap.h> /* VSYSCALL_START */ 20#include <asm/fixmap.h> /* VSYSCALL_START */
21#include <asm/rcu.h> /* exception_enter(), ... */
21 22
22/* 23/*
23 * Page fault error code bits: 24 * Page fault error code bits:
@@ -1000,8 +1001,8 @@ static int fault_in_kernel_space(unsigned long address)
1000 * and the problem, and then passes it off to one of the appropriate 1001 * and the problem, and then passes it off to one of the appropriate
1001 * routines. 1002 * routines.
1002 */ 1003 */
1003dotraplinkage void __kprobes 1004static void __kprobes
1004do_page_fault(struct pt_regs *regs, unsigned long error_code) 1005__do_page_fault(struct pt_regs *regs, unsigned long error_code)
1005{ 1006{
1006 struct vm_area_struct *vma; 1007 struct vm_area_struct *vma;
1007 struct task_struct *tsk; 1008 struct task_struct *tsk;
@@ -1209,3 +1210,11 @@ good_area:
1209 1210
1210 up_read(&mm->mmap_sem); 1211 up_read(&mm->mmap_sem);
1211} 1212}
1213
1214dotraplinkage void __kprobes
1215do_page_fault(struct pt_regs *regs, unsigned long error_code)
1216{
1217 exception_enter(regs);
1218 __do_page_fault(regs, error_code);
1219 exception_exit(regs);
1220}
diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c
index 2c8d6a3d250a..bc44311aa18c 100644
--- a/arch/xtensa/kernel/process.c
+++ b/arch/xtensa/kernel/process.c
@@ -31,6 +31,7 @@
31#include <linux/mqueue.h> 31#include <linux/mqueue.h>
32#include <linux/fs.h> 32#include <linux/fs.h>
33#include <linux/slab.h> 33#include <linux/slab.h>
34#include <linux/rcupdate.h>
34 35
35#include <asm/pgtable.h> 36#include <asm/pgtable.h>
36#include <asm/uaccess.h> 37#include <asm/uaccess.h>
@@ -110,8 +111,10 @@ void cpu_idle(void)
110 111
111 /* endless idle loop with no priority at all */ 112 /* endless idle loop with no priority at all */
112 while (1) { 113 while (1) {
114 rcu_idle_enter();
113 while (!need_resched()) 115 while (!need_resched())
114 platform_idle(); 116 platform_idle();
117 rcu_idle_exit();
115 schedule_preempt_disabled(); 118 schedule_preempt_disabled();
116 } 119 }
117} 120}
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c
index 53589000fd07..8615d7cf7e01 100644
--- a/drivers/infiniband/hw/ehca/ehca_irq.c
+++ b/drivers/infiniband/hw/ehca/ehca_irq.c
@@ -42,6 +42,7 @@
42 */ 42 */
43 43
44#include <linux/slab.h> 44#include <linux/slab.h>
45#include <linux/smpboot.h>
45 46
46#include "ehca_classes.h" 47#include "ehca_classes.h"
47#include "ehca_irq.h" 48#include "ehca_irq.h"
@@ -652,7 +653,7 @@ void ehca_tasklet_eq(unsigned long data)
652 ehca_process_eq((struct ehca_shca*)data, 1); 653 ehca_process_eq((struct ehca_shca*)data, 1);
653} 654}
654 655
655static inline int find_next_online_cpu(struct ehca_comp_pool *pool) 656static int find_next_online_cpu(struct ehca_comp_pool *pool)
656{ 657{
657 int cpu; 658 int cpu;
658 unsigned long flags; 659 unsigned long flags;
@@ -662,17 +663,20 @@ static inline int find_next_online_cpu(struct ehca_comp_pool *pool)
662 ehca_dmp(cpu_online_mask, cpumask_size(), ""); 663 ehca_dmp(cpu_online_mask, cpumask_size(), "");
663 664
664 spin_lock_irqsave(&pool->last_cpu_lock, flags); 665 spin_lock_irqsave(&pool->last_cpu_lock, flags);
665 cpu = cpumask_next(pool->last_cpu, cpu_online_mask); 666 do {
666 if (cpu >= nr_cpu_ids) 667 cpu = cpumask_next(pool->last_cpu, cpu_online_mask);
667 cpu = cpumask_first(cpu_online_mask); 668 if (cpu >= nr_cpu_ids)
668 pool->last_cpu = cpu; 669 cpu = cpumask_first(cpu_online_mask);
670 pool->last_cpu = cpu;
671 } while (!per_cpu_ptr(pool->cpu_comp_tasks, cpu)->active);
669 spin_unlock_irqrestore(&pool->last_cpu_lock, flags); 672 spin_unlock_irqrestore(&pool->last_cpu_lock, flags);
670 673
671 return cpu; 674 return cpu;
672} 675}
673 676
674static void __queue_comp_task(struct ehca_cq *__cq, 677static void __queue_comp_task(struct ehca_cq *__cq,
675 struct ehca_cpu_comp_task *cct) 678 struct ehca_cpu_comp_task *cct,
679 struct task_struct *thread)
676{ 680{
677 unsigned long flags; 681 unsigned long flags;
678 682
@@ -683,7 +687,7 @@ static void __queue_comp_task(struct ehca_cq *__cq,
683 __cq->nr_callbacks++; 687 __cq->nr_callbacks++;
684 list_add_tail(&__cq->entry, &cct->cq_list); 688 list_add_tail(&__cq->entry, &cct->cq_list);
685 cct->cq_jobs++; 689 cct->cq_jobs++;
686 wake_up(&cct->wait_queue); 690 wake_up_process(thread);
687 } else 691 } else
688 __cq->nr_callbacks++; 692 __cq->nr_callbacks++;
689 693
@@ -695,6 +699,7 @@ static void queue_comp_task(struct ehca_cq *__cq)
695{ 699{
696 int cpu_id; 700 int cpu_id;
697 struct ehca_cpu_comp_task *cct; 701 struct ehca_cpu_comp_task *cct;
702 struct task_struct *thread;
698 int cq_jobs; 703 int cq_jobs;
699 unsigned long flags; 704 unsigned long flags;
700 705
@@ -702,7 +707,8 @@ static void queue_comp_task(struct ehca_cq *__cq)
702 BUG_ON(!cpu_online(cpu_id)); 707 BUG_ON(!cpu_online(cpu_id));
703 708
704 cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id); 709 cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
705 BUG_ON(!cct); 710 thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
711 BUG_ON(!cct || !thread);
706 712
707 spin_lock_irqsave(&cct->task_lock, flags); 713 spin_lock_irqsave(&cct->task_lock, flags);
708 cq_jobs = cct->cq_jobs; 714 cq_jobs = cct->cq_jobs;
@@ -710,28 +716,25 @@ static void queue_comp_task(struct ehca_cq *__cq)
710 if (cq_jobs > 0) { 716 if (cq_jobs > 0) {
711 cpu_id = find_next_online_cpu(pool); 717 cpu_id = find_next_online_cpu(pool);
712 cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id); 718 cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
713 BUG_ON(!cct); 719 thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
720 BUG_ON(!cct || !thread);
714 } 721 }
715 722 __queue_comp_task(__cq, cct, thread);
716 __queue_comp_task(__cq, cct);
717} 723}
718 724
719static void run_comp_task(struct ehca_cpu_comp_task *cct) 725static void run_comp_task(struct ehca_cpu_comp_task *cct)
720{ 726{
721 struct ehca_cq *cq; 727 struct ehca_cq *cq;
722 unsigned long flags;
723
724 spin_lock_irqsave(&cct->task_lock, flags);
725 728
726 while (!list_empty(&cct->cq_list)) { 729 while (!list_empty(&cct->cq_list)) {
727 cq = list_entry(cct->cq_list.next, struct ehca_cq, entry); 730 cq = list_entry(cct->cq_list.next, struct ehca_cq, entry);
728 spin_unlock_irqrestore(&cct->task_lock, flags); 731 spin_unlock_irq(&cct->task_lock);
729 732
730 comp_event_callback(cq); 733 comp_event_callback(cq);
731 if (atomic_dec_and_test(&cq->nr_events)) 734 if (atomic_dec_and_test(&cq->nr_events))
732 wake_up(&cq->wait_completion); 735 wake_up(&cq->wait_completion);
733 736
734 spin_lock_irqsave(&cct->task_lock, flags); 737 spin_lock_irq(&cct->task_lock);
735 spin_lock(&cq->task_lock); 738 spin_lock(&cq->task_lock);
736 cq->nr_callbacks--; 739 cq->nr_callbacks--;
737 if (!cq->nr_callbacks) { 740 if (!cq->nr_callbacks) {
@@ -740,159 +743,76 @@ static void run_comp_task(struct ehca_cpu_comp_task *cct)
740 } 743 }
741 spin_unlock(&cq->task_lock); 744 spin_unlock(&cq->task_lock);
742 } 745 }
743
744 spin_unlock_irqrestore(&cct->task_lock, flags);
745} 746}
746 747
747static int comp_task(void *__cct) 748static void comp_task_park(unsigned int cpu)
748{ 749{
749 struct ehca_cpu_comp_task *cct = __cct; 750 struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
750 int cql_empty; 751 struct ehca_cpu_comp_task *target;
751 DECLARE_WAITQUEUE(wait, current); 752 struct task_struct *thread;
752 753 struct ehca_cq *cq, *tmp;
753 set_current_state(TASK_INTERRUPTIBLE); 754 LIST_HEAD(list);
754 while (!kthread_should_stop()) {
755 add_wait_queue(&cct->wait_queue, &wait);
756
757 spin_lock_irq(&cct->task_lock);
758 cql_empty = list_empty(&cct->cq_list);
759 spin_unlock_irq(&cct->task_lock);
760 if (cql_empty)
761 schedule();
762 else
763 __set_current_state(TASK_RUNNING);
764
765 remove_wait_queue(&cct->wait_queue, &wait);
766 755
767 spin_lock_irq(&cct->task_lock); 756 spin_lock_irq(&cct->task_lock);
768 cql_empty = list_empty(&cct->cq_list); 757 cct->cq_jobs = 0;
769 spin_unlock_irq(&cct->task_lock); 758 cct->active = 0;
770 if (!cql_empty) 759 list_splice_init(&cct->cq_list, &list);
771 run_comp_task(__cct); 760 spin_unlock_irq(&cct->task_lock);
772 761
773 set_current_state(TASK_INTERRUPTIBLE); 762 cpu = find_next_online_cpu(pool);
763 target = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
764 thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu);
765 spin_lock_irq(&target->task_lock);
766 list_for_each_entry_safe(cq, tmp, &list, entry) {
767 list_del(&cq->entry);
768 __queue_comp_task(cq, target, thread);
774 } 769 }
775 __set_current_state(TASK_RUNNING); 770 spin_unlock_irq(&target->task_lock);
776
777 return 0;
778}
779
780static struct task_struct *create_comp_task(struct ehca_comp_pool *pool,
781 int cpu)
782{
783 struct ehca_cpu_comp_task *cct;
784
785 cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
786 spin_lock_init(&cct->task_lock);
787 INIT_LIST_HEAD(&cct->cq_list);
788 init_waitqueue_head(&cct->wait_queue);
789 cct->task = kthread_create_on_node(comp_task, cct, cpu_to_node(cpu),
790 "ehca_comp/%d", cpu);
791
792 return cct->task;
793} 771}
794 772
795static void destroy_comp_task(struct ehca_comp_pool *pool, 773static void comp_task_stop(unsigned int cpu, bool online)
796 int cpu)
797{ 774{
798 struct ehca_cpu_comp_task *cct; 775 struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
799 struct task_struct *task;
800 unsigned long flags_cct;
801
802 cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
803
804 spin_lock_irqsave(&cct->task_lock, flags_cct);
805 776
806 task = cct->task; 777 spin_lock_irq(&cct->task_lock);
807 cct->task = NULL;
808 cct->cq_jobs = 0; 778 cct->cq_jobs = 0;
809 779 cct->active = 0;
810 spin_unlock_irqrestore(&cct->task_lock, flags_cct); 780 WARN_ON(!list_empty(&cct->cq_list));
811 781 spin_unlock_irq(&cct->task_lock);
812 if (task)
813 kthread_stop(task);
814} 782}
815 783
816static void __cpuinit take_over_work(struct ehca_comp_pool *pool, int cpu) 784static int comp_task_should_run(unsigned int cpu)
817{ 785{
818 struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu); 786 struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
819 LIST_HEAD(list);
820 struct ehca_cq *cq;
821 unsigned long flags_cct;
822
823 spin_lock_irqsave(&cct->task_lock, flags_cct);
824
825 list_splice_init(&cct->cq_list, &list);
826
827 while (!list_empty(&list)) {
828 cq = list_entry(cct->cq_list.next, struct ehca_cq, entry);
829
830 list_del(&cq->entry);
831 __queue_comp_task(cq, this_cpu_ptr(pool->cpu_comp_tasks));
832 }
833
834 spin_unlock_irqrestore(&cct->task_lock, flags_cct);
835 787
788 return cct->cq_jobs;
836} 789}
837 790
838static int __cpuinit comp_pool_callback(struct notifier_block *nfb, 791static void comp_task(unsigned int cpu)
839 unsigned long action,
840 void *hcpu)
841{ 792{
842 unsigned int cpu = (unsigned long)hcpu; 793 struct ehca_cpu_comp_task *cct = this_cpu_ptr(pool->cpu_comp_tasks);
843 struct ehca_cpu_comp_task *cct; 794 int cql_empty;
844 795
845 switch (action) { 796 spin_lock_irq(&cct->task_lock);
846 case CPU_UP_PREPARE: 797 cql_empty = list_empty(&cct->cq_list);
847 case CPU_UP_PREPARE_FROZEN: 798 if (!cql_empty) {
848 ehca_gen_dbg("CPU: %x (CPU_PREPARE)", cpu); 799 __set_current_state(TASK_RUNNING);
849 if (!create_comp_task(pool, cpu)) { 800 run_comp_task(cct);
850 ehca_gen_err("Can't create comp_task for cpu: %x", cpu);
851 return notifier_from_errno(-ENOMEM);
852 }
853 break;
854 case CPU_UP_CANCELED:
855 case CPU_UP_CANCELED_FROZEN:
856 ehca_gen_dbg("CPU: %x (CPU_CANCELED)", cpu);
857 cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
858 kthread_bind(cct->task, cpumask_any(cpu_online_mask));
859 destroy_comp_task(pool, cpu);
860 break;
861 case CPU_ONLINE:
862 case CPU_ONLINE_FROZEN:
863 ehca_gen_dbg("CPU: %x (CPU_ONLINE)", cpu);
864 cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
865 kthread_bind(cct->task, cpu);
866 wake_up_process(cct->task);
867 break;
868 case CPU_DOWN_PREPARE:
869 case CPU_DOWN_PREPARE_FROZEN:
870 ehca_gen_dbg("CPU: %x (CPU_DOWN_PREPARE)", cpu);
871 break;
872 case CPU_DOWN_FAILED:
873 case CPU_DOWN_FAILED_FROZEN:
874 ehca_gen_dbg("CPU: %x (CPU_DOWN_FAILED)", cpu);
875 break;
876 case CPU_DEAD:
877 case CPU_DEAD_FROZEN:
878 ehca_gen_dbg("CPU: %x (CPU_DEAD)", cpu);
879 destroy_comp_task(pool, cpu);
880 take_over_work(pool, cpu);
881 break;
882 } 801 }
883 802 spin_unlock_irq(&cct->task_lock);
884 return NOTIFY_OK;
885} 803}
886 804
887static struct notifier_block comp_pool_callback_nb __cpuinitdata = { 805static struct smp_hotplug_thread comp_pool_threads = {
888 .notifier_call = comp_pool_callback, 806 .thread_should_run = comp_task_should_run,
889 .priority = 0, 807 .thread_fn = comp_task,
808 .thread_comm = "ehca_comp/%u",
809 .cleanup = comp_task_stop,
810 .park = comp_task_park,
890}; 811};
891 812
892int ehca_create_comp_pool(void) 813int ehca_create_comp_pool(void)
893{ 814{
894 int cpu; 815 int cpu, ret = -ENOMEM;
895 struct task_struct *task;
896 816
897 if (!ehca_scaling_code) 817 if (!ehca_scaling_code)
898 return 0; 818 return 0;
@@ -905,38 +825,46 @@ int ehca_create_comp_pool(void)
905 pool->last_cpu = cpumask_any(cpu_online_mask); 825 pool->last_cpu = cpumask_any(cpu_online_mask);
906 826
907 pool->cpu_comp_tasks = alloc_percpu(struct ehca_cpu_comp_task); 827 pool->cpu_comp_tasks = alloc_percpu(struct ehca_cpu_comp_task);
908 if (pool->cpu_comp_tasks == NULL) { 828 if (!pool->cpu_comp_tasks)
909 kfree(pool); 829 goto out_pool;
910 return -EINVAL;
911 }
912 830
913 for_each_online_cpu(cpu) { 831 pool->cpu_comp_threads = alloc_percpu(struct task_struct *);
914 task = create_comp_task(pool, cpu); 832 if (!pool->cpu_comp_threads)
915 if (task) { 833 goto out_tasks;
916 kthread_bind(task, cpu); 834
917 wake_up_process(task); 835 for_each_present_cpu(cpu) {
918 } 836 struct ehca_cpu_comp_task *cct;
837
838 cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
839 spin_lock_init(&cct->task_lock);
840 INIT_LIST_HEAD(&cct->cq_list);
919 } 841 }
920 842
921 register_hotcpu_notifier(&comp_pool_callback_nb); 843 comp_pool_threads.store = pool->cpu_comp_threads;
844 ret = smpboot_register_percpu_thread(&comp_pool_threads);
845 if (ret)
846 goto out_threads;
922 847
923 printk(KERN_INFO "eHCA scaling code enabled\n"); 848 pr_info("eHCA scaling code enabled\n");
849 return ret;
924 850
925 return 0; 851out_threads:
852 free_percpu(pool->cpu_comp_threads);
853out_tasks:
854 free_percpu(pool->cpu_comp_tasks);
855out_pool:
856 kfree(pool);
857 return ret;
926} 858}
927 859
928void ehca_destroy_comp_pool(void) 860void ehca_destroy_comp_pool(void)
929{ 861{
930 int i;
931
932 if (!ehca_scaling_code) 862 if (!ehca_scaling_code)
933 return; 863 return;
934 864
935 unregister_hotcpu_notifier(&comp_pool_callback_nb); 865 smpboot_unregister_percpu_thread(&comp_pool_threads);
936
937 for_each_online_cpu(i)
938 destroy_comp_task(pool, i);
939 866
867 free_percpu(pool->cpu_comp_threads);
940 free_percpu(pool->cpu_comp_tasks); 868 free_percpu(pool->cpu_comp_tasks);
941 kfree(pool); 869 kfree(pool);
942} 870}
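The ehca conversion above replaces the driver's private per-CPU comp kthreads and CPU-hotplug notifier with the new smpboot per-cpu thread facility: the park handler splices a parked CPU's pending completion queues onto another active CPU, and find_next_online_cpu() now loops until it lands on a CPU whose comp task is still marked active, so parked CPUs are skipped. The selection logic in isolation, with hypothetical my_pool/my_worker types (a restatement for clarity, not driver code):

#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>

struct my_worker {
	int active;			/* cleared while the CPU's thread is parked */
};

struct my_pool {
	struct my_worker __percpu *workers;
	int last_cpu;
	spinlock_t last_cpu_lock;
};

/* Round-robin over online CPUs, skipping workers that are inactive
 * (e.g. parked for hotplug); last_cpu_lock serializes the cursor. */
static int pick_next_active_cpu(struct my_pool *pool)
{
	unsigned long flags;
	int cpu;

	spin_lock_irqsave(&pool->last_cpu_lock, flags);
	do {
		cpu = cpumask_next(pool->last_cpu, cpu_online_mask);
		if (cpu >= nr_cpu_ids)
			cpu = cpumask_first(cpu_online_mask);
		pool->last_cpu = cpu;
	} while (!per_cpu_ptr(pool->workers, cpu)->active);
	spin_unlock_irqrestore(&pool->last_cpu_lock, flags);

	return cpu;
}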
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.h b/drivers/infiniband/hw/ehca/ehca_irq.h
index 3346cb06cea6..5370199f08c7 100644
--- a/drivers/infiniband/hw/ehca/ehca_irq.h
+++ b/drivers/infiniband/hw/ehca/ehca_irq.h
@@ -58,15 +58,15 @@ void ehca_tasklet_eq(unsigned long data);
58void ehca_process_eq(struct ehca_shca *shca, int is_irq); 58void ehca_process_eq(struct ehca_shca *shca, int is_irq);
59 59
60struct ehca_cpu_comp_task { 60struct ehca_cpu_comp_task {
61 wait_queue_head_t wait_queue;
62 struct list_head cq_list; 61 struct list_head cq_list;
63 struct task_struct *task;
64 spinlock_t task_lock; 62 spinlock_t task_lock;
65 int cq_jobs; 63 int cq_jobs;
64 int active;
66}; 65};
67 66
68struct ehca_comp_pool { 67struct ehca_comp_pool {
69 struct ehca_cpu_comp_task *cpu_comp_tasks; 68 struct ehca_cpu_comp_task __percpu *cpu_comp_tasks;
69 struct task_struct * __percpu *cpu_comp_threads;
70 int last_cpu; 70 int last_cpu;
71 spinlock_t last_cpu_lock; 71 spinlock_t last_cpu_lock;
72}; 72};
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index c5f856a040b9..5e4e6170f43a 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -430,6 +430,8 @@ enum
430 NR_SOFTIRQS 430 NR_SOFTIRQS
431}; 431};
432 432
433#define SOFTIRQ_STOP_IDLE_MASK (~(1 << RCU_SOFTIRQ))
434
433/* map softirq index to softirq name. update 'softirq_to_name' in 435/* map softirq index to softirq name. update 'softirq_to_name' in
434 * kernel/softirq.c when adding a new softirq. 436 * kernel/softirq.c when adding a new softirq.
435 */ 437 */
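SOFTIRQ_STOP_IDLE_MASK names every softirq except RCU_SOFTIRQ, i.e. the set whose pending work should keep a CPU from staying idle; RCU's softirq is excluded presumably because its grace-period work is now driven by RCU's own kthreads elsewhere in this series. The hunk only adds the mask, not its user, so the check below is merely an illustration of the kind of test such a mask enables (assumed usage, not a quote from this series):

#include <linux/interrupt.h>
#include <linux/types.h>

/* Illustrative only: report "real" softirq work pending when anything
 * other than RCU_SOFTIRQ is raised on this CPU. */
static bool cpu_has_nonrcu_softirq_pending(void)
{
	return local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK;
}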
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 22ccf9dee177..8d816646f766 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -14,6 +14,11 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
14 kthread_create_on_node(threadfn, data, -1, namefmt, ##arg) 14 kthread_create_on_node(threadfn, data, -1, namefmt, ##arg)
15 15
16 16
17struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
18 void *data,
19 unsigned int cpu,
20 const char *namefmt);
21
17/** 22/**
18 * kthread_run - create and wake a thread. 23 * kthread_run - create and wake a thread.
19 * @threadfn: the function to run until signal_pending(current). 24 * @threadfn: the function to run until signal_pending(current).
@@ -34,9 +39,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
34 39
35void kthread_bind(struct task_struct *k, unsigned int cpu); 40void kthread_bind(struct task_struct *k, unsigned int cpu);
36int kthread_stop(struct task_struct *k); 41int kthread_stop(struct task_struct *k);
37int kthread_should_stop(void); 42bool kthread_should_stop(void);
43bool kthread_should_park(void);
38bool kthread_freezable_should_stop(bool *was_frozen); 44bool kthread_freezable_should_stop(bool *was_frozen);
39void *kthread_data(struct task_struct *k); 45void *kthread_data(struct task_struct *k);
46int kthread_park(struct task_struct *k);
47void kthread_unpark(struct task_struct *k);
48void kthread_parkme(void);
40 49
41int kthreadd(void *unused); 50int kthreadd(void *unused);
42extern struct task_struct *kthreadd_task; 51extern struct task_struct *kthreadd_task;
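Besides tightening kthread_should_stop() to bool, the header gains the parking API: a per-cpu kthread can now be taken off its CPU across hotplug and resumed later instead of being destroyed and recreated. A thread function written against it checks kthread_should_park() next to kthread_should_stop() and calls kthread_parkme() to wait out the parked period. A hedged skeleton (my_thread_fn and its work are hypothetical):

#include <linux/kthread.h>
#include <linux/sched.h>

static int my_thread_fn(void *data)
{
	while (!kthread_should_stop()) {
		if (kthread_should_park()) {
			kthread_parkme();	/* sleeps here until kthread_unpark() */
			continue;
		}
		/* ... one unit of per-cpu work ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}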
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 115ead2b5155..7c968e4f929e 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -191,6 +191,21 @@ extern void rcu_idle_enter(void);
191extern void rcu_idle_exit(void); 191extern void rcu_idle_exit(void);
192extern void rcu_irq_enter(void); 192extern void rcu_irq_enter(void);
193extern void rcu_irq_exit(void); 193extern void rcu_irq_exit(void);
194
195#ifdef CONFIG_RCU_USER_QS
196extern void rcu_user_enter(void);
197extern void rcu_user_exit(void);
198extern void rcu_user_enter_after_irq(void);
199extern void rcu_user_exit_after_irq(void);
200extern void rcu_user_hooks_switch(struct task_struct *prev,
201 struct task_struct *next);
202#else
203static inline void rcu_user_enter(void) { }
204static inline void rcu_user_exit(void) { }
205static inline void rcu_user_enter_after_irq(void) { }
206static inline void rcu_user_exit_after_irq(void) { }
207#endif /* CONFIG_RCU_USER_QS */
208
194extern void exit_rcu(void); 209extern void exit_rcu(void);
195 210
196/** 211/**
@@ -210,14 +225,12 @@ extern void exit_rcu(void);
210 * to nest RCU_NONIDLE() wrappers, but the nesting level is currently 225 * to nest RCU_NONIDLE() wrappers, but the nesting level is currently
211 * quite limited. If deeper nesting is required, it will be necessary 226 * quite limited. If deeper nesting is required, it will be necessary
212 * to adjust DYNTICK_TASK_NESTING_VALUE accordingly. 227 * to adjust DYNTICK_TASK_NESTING_VALUE accordingly.
213 *
214 * This macro may be used from process-level code only.
215 */ 228 */
216#define RCU_NONIDLE(a) \ 229#define RCU_NONIDLE(a) \
217 do { \ 230 do { \
218 rcu_idle_exit(); \ 231 rcu_irq_enter(); \
219 do { a; } while (0); \ 232 do { a; } while (0); \
220 rcu_idle_enter(); \ 233 rcu_irq_exit(); \
221 } while (0) 234 } while (0)
222 235
223/* 236/*
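RCU_NONIDLE() now brackets its argument with rcu_irq_enter()/rcu_irq_exit() instead of rcu_idle_exit()/rcu_idle_enter(), which is why the "process-level code only" restriction disappears from the comment. Its job is unchanged: let code running in RCU-idle context execute a statement that may contain RCU read-side critical sections, tracepoints being the typical payload. A hedged usage sketch (do_idle_bookkeeping() is a hypothetical stub):

#include <linux/rcupdate.h>

static void do_idle_bookkeeping(void)
{
	/* would touch RCU-protected data under rcu_read_lock() */
}

static void my_idle_hook(void)
{
	/* Momentarily non-idle as far as RCU is concerned, so the wrapped
	 * statement may legally use rcu_read_lock()/rcu_dereference(). */
	RCU_NONIDLE(do_idle_bookkeeping());
}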
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 23bddac4bad8..335720a1fc33 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1885,6 +1885,14 @@ static inline void rcu_copy_process(struct task_struct *p)
1885 1885
1886#endif 1886#endif
1887 1887
1888static inline void rcu_switch(struct task_struct *prev,
1889 struct task_struct *next)
1890{
1891#ifdef CONFIG_RCU_USER_QS
1892 rcu_user_hooks_switch(prev, next);
1893#endif
1894}
1895
1888static inline void tsk_restore_flags(struct task_struct *task, 1896static inline void tsk_restore_flags(struct task_struct *task,
1889 unsigned long orig_flags, unsigned long flags) 1897 unsigned long orig_flags, unsigned long flags)
1890{ 1898{
diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
new file mode 100644
index 000000000000..e0106d8581d3
--- /dev/null
+++ b/include/linux/smpboot.h
@@ -0,0 +1,43 @@
1#ifndef _LINUX_SMPBOOT_H
2#define _LINUX_SMPBOOT_H
3
4#include <linux/types.h>
5
6struct task_struct;
7/* Cookie handed to the thread_fn*/
8struct smpboot_thread_data;
9
10/**
11 * struct smp_hotplug_thread - CPU hotplug related thread descriptor
12 * @store: Pointer to per cpu storage for the task pointers
13 * @list: List head for core management
14 * @thread_should_run: Check whether the thread should run or not. Called with
15 * preemption disabled.
16 * @thread_fn: The associated thread function
17 * @setup: Optional setup function, called when the thread gets
18 * operational the first time
19 * @cleanup: Optional cleanup function, called when the thread
20 * should stop (module exit)
21 * @park: Optional park function, called when the thread is
22 * parked (cpu offline)
23 * @unpark: Optional unpark function, called when the thread is
24 * unparked (cpu online)
25 * @thread_comm: The base name of the thread
26 */
27struct smp_hotplug_thread {
28 struct task_struct __percpu **store;
29 struct list_head list;
30 int (*thread_should_run)(unsigned int cpu);
31 void (*thread_fn)(unsigned int cpu);
32 void (*setup)(unsigned int cpu);
33 void (*cleanup)(unsigned int cpu, bool online);
34 void (*park)(unsigned int cpu);
35 void (*unpark)(unsigned int cpu);
36 const char *thread_comm;
37};
38
39int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread);
40void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
41int smpboot_thread_schedule(void);
42
43#endif
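This new interface generalizes the pattern that the RCU kthreads and the ehca driver above need: one kthread per CPU, parked while its CPU is offline, and driven through thread_should_run()/thread_fn() with preemption disabled around the check. A minimal registration sketch (all my_* names are hypothetical):

#include <linux/smpboot.h>
#include <linux/percpu.h>
#include <linux/init.h>

static DEFINE_PER_CPU(struct task_struct *, my_percpu_thread);
static DEFINE_PER_CPU(unsigned int, my_pending);

static int my_should_run(unsigned int cpu)
{
	return __this_cpu_read(my_pending);	/* called with preemption disabled */
}

static void my_thread_fn(unsigned int cpu)
{
	__this_cpu_write(my_pending, 0);
	/* ... process this CPU's pending work ... */
}

static struct smp_hotplug_thread my_threads = {
	.store			= &my_percpu_thread,
	.thread_should_run	= my_should_run,
	.thread_fn		= my_thread_fn,
	.thread_comm		= "my_worker/%u",
};

static int __init my_threads_init(void)
{
	return smpboot_register_percpu_thread(&my_threads);
}
early_initcall(my_threads_init);

The optional setup/park/unpark/cleanup callbacks hook the same CPU-hotplug transitions the old notifier-based drivers handled by hand, as the ehca conversion above illustrates.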
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 802de56c41e8..2f322c38bd4d 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -136,6 +136,22 @@ static inline void tracepoint_synchronize_unregister(void)
136 postrcu; \ 136 postrcu; \
137 } while (0) 137 } while (0)
138 138
139#ifndef MODULE
140#define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args) \
141 static inline void trace_##name##_rcuidle(proto) \
142 { \
143 if (static_key_false(&__tracepoint_##name.key)) \
144 __DO_TRACE(&__tracepoint_##name, \
145 TP_PROTO(data_proto), \
146 TP_ARGS(data_args), \
147 TP_CONDITION(cond), \
148 rcu_idle_exit(), \
149 rcu_idle_enter()); \
150 }
151#else
152#define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args)
153#endif
154
139/* 155/*
140 * Make sure the alignment of the structure in the __tracepoints section will 156 * Make sure the alignment of the structure in the __tracepoints section will
141 * not add unwanted padding between the beginning of the section and the 157 * not add unwanted padding between the beginning of the section and the
@@ -151,16 +167,8 @@ static inline void tracepoint_synchronize_unregister(void)
151 TP_ARGS(data_args), \ 167 TP_ARGS(data_args), \
152 TP_CONDITION(cond),,); \ 168 TP_CONDITION(cond),,); \
153 } \ 169 } \
154 static inline void trace_##name##_rcuidle(proto) \ 170 __DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args), \
155 { \ 171 PARAMS(cond), PARAMS(data_proto), PARAMS(data_args)) \
156 if (static_key_false(&__tracepoint_##name.key)) \
157 __DO_TRACE(&__tracepoint_##name, \
158 TP_PROTO(data_proto), \
159 TP_ARGS(data_args), \
160 TP_CONDITION(cond), \
161 rcu_idle_exit(), \
162 rcu_idle_enter()); \
163 } \
164 static inline int \ 172 static inline int \
165 register_trace_##name(void (*probe)(data_proto), void *data) \ 173 register_trace_##name(void (*probe)(data_proto), void *data) \
166 { \ 174 { \
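The trace_<name>_rcuidle() variant, which wraps __DO_TRACE() in rcu_idle_exit()/rcu_idle_enter() so a tracepoint may fire from RCU-idle context, is now generated only for built-in code; module builds simply don't get it. A hedged caller sketch, assuming a hypothetical tracepoint my_idle_event(int cpu) declared elsewhere with TRACE_EVENT():

#include <linux/tracepoint.h>

/* Illustrative: from code running while RCU considers the CPU idle,
 * the _rcuidle form keeps the probe's RCU usage legal. */
static void report_idle_event(int cpu)
{
	trace_my_idle_event_rcuidle(cpu);
}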
diff --git a/init/Kconfig b/init/Kconfig
index af6c7f8ba019..c26b8a1d2b57 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -441,6 +441,24 @@ config PREEMPT_RCU
441 This option enables preemptible-RCU code that is common between 441 This option enables preemptible-RCU code that is common between
442 the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations. 442 the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations.
443 443
444config RCU_USER_QS
445 bool "Consider userspace as in RCU extended quiescent state"
446 depends on HAVE_RCU_USER_QS && SMP
447 help
448 This option sets hooks on kernel / userspace boundaries and
449 puts RCU in extended quiescent state when the CPU runs in
450 userspace. It means that when a CPU runs in userspace, it is
 451 excluded from the global RCU state machine and thus doesn't need
452 to keep the timer tick on for RCU.
453
454config RCU_USER_QS_FORCE
455 bool "Force userspace extended QS by default"
456 depends on RCU_USER_QS
457 help
458 Set the hooks in user/kernel boundaries by default in order to
459 test this feature that treats userspace as an extended quiescent
460 state until we have a real user like a full adaptive nohz option.
461
444config RCU_FANOUT 462config RCU_FANOUT
445 int "Tree-based hierarchical RCU fanout value" 463 int "Tree-based hierarchical RCU fanout value"
446 range 2 64 if 64BIT 464 range 2 64 if 64BIT
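Both new options are invisible unless the architecture advertises HAVE_RCU_USER_QS (x86 does so elsewhere in this series), and RCU_USER_QS_FORCE exists purely to exercise the hooks before a real adaptive-nohz user arrives. A configuration trying the feature out would look roughly like this fragment (illustrative; availability depends on the architecture):

# CONFIG_SMP and an arch selecting HAVE_RCU_USER_QS are prerequisites
CONFIG_SMP=y
CONFIG_RCU_USER_QS=y
# CONFIG_RCU_USER_QS_FORCE is not set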
diff --git a/kernel/Makefile b/kernel/Makefile
index c0cc67ad764c..e5602d32acb3 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o cred.o \ 12 notifier.o ksysfs.o cred.o \
13 async.o range.o groups.o lglock.o 13 async.o range.o groups.o lglock.o smpboot.o
14 14
15ifdef CONFIG_FUNCTION_TRACER 15ifdef CONFIG_FUNCTION_TRACER
16# Do not trace debug files and internal ftrace files 16# Do not trace debug files and internal ftrace files
@@ -46,7 +46,6 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
46obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o 46obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
47obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 47obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
48obj-$(CONFIG_SMP) += smp.o 48obj-$(CONFIG_SMP) += smp.o
49obj-$(CONFIG_SMP) += smpboot.o
50ifneq ($(CONFIG_SMP),y) 49ifneq ($(CONFIG_SMP),y)
51obj-y += up.o 50obj-y += up.o
52endif 51endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 14d32588cccd..e615dfbcf794 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -280,12 +280,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
280 __func__, cpu); 280 __func__, cpu);
281 goto out_release; 281 goto out_release;
282 } 282 }
283 smpboot_park_threads(cpu);
283 284
284 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 285 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
285 if (err) { 286 if (err) {
286 /* CPU didn't die: tell everyone. Can't complain. */ 287 /* CPU didn't die: tell everyone. Can't complain. */
288 smpboot_unpark_threads(cpu);
287 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); 289 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
288
289 goto out_release; 290 goto out_release;
290 } 291 }
291 BUG_ON(cpu_online(cpu)); 292 BUG_ON(cpu_online(cpu));
@@ -354,6 +355,10 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
354 goto out; 355 goto out;
355 } 356 }
356 357
358 ret = smpboot_create_threads(cpu);
359 if (ret)
360 goto out;
361
357 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); 362 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
358 if (ret) { 363 if (ret) {
359 nr_calls--; 364 nr_calls--;
@@ -368,6 +373,9 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
368 goto out_notify; 373 goto out_notify;
369 BUG_ON(!cpu_online(cpu)); 374 BUG_ON(!cpu_online(cpu));
370 375
376 /* Wake the per cpu threads */
377 smpboot_unpark_threads(cpu);
378
371 /* Now call notifier in preparation. */ 379 /* Now call notifier in preparation. */
372 cpu_notify(CPU_ONLINE | mod, hcpu); 380 cpu_notify(CPU_ONLINE | mod, hcpu);
373 381
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b579af57ea10..146a6fa96825 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -37,11 +37,20 @@ struct kthread_create_info
37}; 37};
38 38
39struct kthread { 39struct kthread {
40 int should_stop; 40 unsigned long flags;
41 unsigned int cpu;
41 void *data; 42 void *data;
43 struct completion parked;
42 struct completion exited; 44 struct completion exited;
43}; 45};
44 46
47enum KTHREAD_BITS {
48 KTHREAD_IS_PER_CPU = 0,
49 KTHREAD_SHOULD_STOP,
50 KTHREAD_SHOULD_PARK,
51 KTHREAD_IS_PARKED,
52};
53
45#define to_kthread(tsk) \ 54#define to_kthread(tsk) \
46 container_of((tsk)->vfork_done, struct kthread, exited) 55 container_of((tsk)->vfork_done, struct kthread, exited)
47 56
@@ -52,13 +61,29 @@ struct kthread {
52 * and this will return true. You should then return, and your return 61 * and this will return true. You should then return, and your return
53 * value will be passed through to kthread_stop(). 62 * value will be passed through to kthread_stop().
54 */ 63 */
55int kthread_should_stop(void) 64bool kthread_should_stop(void)
56{ 65{
57 return to_kthread(current)->should_stop; 66 return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags);
58} 67}
59EXPORT_SYMBOL(kthread_should_stop); 68EXPORT_SYMBOL(kthread_should_stop);
60 69
61/** 70/**
71 * kthread_should_park - should this kthread park now?
72 *
73 * When someone calls kthread_park() on your kthread, it will be woken
74 * and this will return true. You should then do the necessary
75 * cleanup and call kthread_parkme()
76 *
77 * Similar to kthread_should_stop(), but this keeps the thread alive
78 * and in a park position. kthread_unpark() "restarts" the thread and
79 * calls the thread function again.
80 */
81bool kthread_should_park(void)
82{
83 return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags);
84}
85
86/**
62 * kthread_freezable_should_stop - should this freezable kthread return now? 87 * kthread_freezable_should_stop - should this freezable kthread return now?
63 * @was_frozen: optional out parameter, indicates whether %current was frozen 88 * @was_frozen: optional out parameter, indicates whether %current was frozen
64 * 89 *
@@ -96,6 +121,24 @@ void *kthread_data(struct task_struct *task)
96 return to_kthread(task)->data; 121 return to_kthread(task)->data;
97} 122}
98 123
124static void __kthread_parkme(struct kthread *self)
125{
126 __set_current_state(TASK_INTERRUPTIBLE);
127 while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
128 if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
129 complete(&self->parked);
130 schedule();
131 __set_current_state(TASK_INTERRUPTIBLE);
132 }
133 clear_bit(KTHREAD_IS_PARKED, &self->flags);
134 __set_current_state(TASK_RUNNING);
135}
136
137void kthread_parkme(void)
138{
139 __kthread_parkme(to_kthread(current));
140}
141
99static int kthread(void *_create) 142static int kthread(void *_create)
100{ 143{
101 /* Copy data: it's on kthread's stack */ 144 /* Copy data: it's on kthread's stack */
@@ -105,9 +148,10 @@ static int kthread(void *_create)
105 struct kthread self; 148 struct kthread self;
106 int ret; 149 int ret;
107 150
108 self.should_stop = 0; 151 self.flags = 0;
109 self.data = data; 152 self.data = data;
110 init_completion(&self.exited); 153 init_completion(&self.exited);
154 init_completion(&self.parked);
111 current->vfork_done = &self.exited; 155 current->vfork_done = &self.exited;
112 156
113 /* OK, tell user we're spawned, wait for stop or wakeup */ 157 /* OK, tell user we're spawned, wait for stop or wakeup */
@@ -117,9 +161,11 @@ static int kthread(void *_create)
117 schedule(); 161 schedule();
118 162
119 ret = -EINTR; 163 ret = -EINTR;
120 if (!self.should_stop)
121 ret = threadfn(data);
122 164
165 if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) {
166 __kthread_parkme(&self);
167 ret = threadfn(data);
168 }
123 /* we can't just return, we must preserve "self" on stack */ 169 /* we can't just return, we must preserve "self" on stack */
124 do_exit(ret); 170 do_exit(ret);
125} 171}
@@ -172,8 +218,7 @@ static void create_kthread(struct kthread_create_info *create)
172 * Returns a task_struct or ERR_PTR(-ENOMEM). 218 * Returns a task_struct or ERR_PTR(-ENOMEM).
173 */ 219 */
174struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), 220struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
175 void *data, 221 void *data, int node,
176 int node,
177 const char namefmt[], 222 const char namefmt[],
178 ...) 223 ...)
179{ 224{
@@ -210,6 +255,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
210} 255}
211EXPORT_SYMBOL(kthread_create_on_node); 256EXPORT_SYMBOL(kthread_create_on_node);
212 257
258static void __kthread_bind(struct task_struct *p, unsigned int cpu)
259{
260 /* It's safe because the task is inactive. */
261 do_set_cpus_allowed(p, cpumask_of(cpu));
262 p->flags |= PF_THREAD_BOUND;
263}
264
213/** 265/**
214 * kthread_bind - bind a just-created kthread to a cpu. 266 * kthread_bind - bind a just-created kthread to a cpu.
215 * @p: thread created by kthread_create(). 267 * @p: thread created by kthread_create().
@@ -226,14 +278,112 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
226 WARN_ON(1); 278 WARN_ON(1);
227 return; 279 return;
228 } 280 }
229 281 __kthread_bind(p, cpu);
230 /* It's safe because the task is inactive. */
231 do_set_cpus_allowed(p, cpumask_of(cpu));
232 p->flags |= PF_THREAD_BOUND;
233} 282}
234EXPORT_SYMBOL(kthread_bind); 283EXPORT_SYMBOL(kthread_bind);
235 284
236/** 285/**
286 * kthread_create_on_cpu - Create a cpu bound kthread
287 * @threadfn: the function to run until signal_pending(current).
288 * @data: data ptr for @threadfn.
289 * @cpu: The cpu on which the thread should be bound,
290 * @namefmt: printf-style name for the thread. Format is restricted
291 * to "name.*%u". Code fills in cpu number.
292 *
293 * Description: This helper function creates and names a kernel thread
294 * The thread will be woken and put into park mode.
295 */
296struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
297 void *data, unsigned int cpu,
298 const char *namefmt)
299{
300 struct task_struct *p;
301
302 p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
303 cpu);
304 if (IS_ERR(p))
305 return p;
306 set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
307 to_kthread(p)->cpu = cpu;
308 /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */
309 kthread_park(p);
310 return p;
311}
312
313static struct kthread *task_get_live_kthread(struct task_struct *k)
314{
315 struct kthread *kthread;
316
317 get_task_struct(k);
318 kthread = to_kthread(k);
319 /* It might have exited */
320 barrier();
321 if (k->vfork_done != NULL)
322 return kthread;
323 return NULL;
324}
325
326/**
327 * kthread_unpark - unpark a thread created by kthread_create().
328 * @k: thread created by kthread_create().
329 *
330 * Sets kthread_should_park() for @k to return false, wakes it, and
 331 * waits for it to return. If the thread is marked percpu then it's
332 * bound to the cpu again.
333 */
334void kthread_unpark(struct task_struct *k)
335{
336 struct kthread *kthread = task_get_live_kthread(k);
337
338 if (kthread) {
339 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
340 /*
341 * We clear the IS_PARKED bit here as we don't wait
342 * until the task has left the park code. So if we'd
343 * park before that happens we'd see the IS_PARKED bit
344 * which might be about to be cleared.
345 */
346 if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
347 if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
348 __kthread_bind(k, kthread->cpu);
349 wake_up_process(k);
350 }
351 }
352 put_task_struct(k);
353}
354
355/**
356 * kthread_park - park a thread created by kthread_create().
357 * @k: thread created by kthread_create().
358 *
359 * Sets kthread_should_park() for @k to return true, wakes it, and
360 * waits for it to return. This can also be called after kthread_create()
361 * instead of calling wake_up_process(): the thread will park without
362 * calling threadfn().
363 *
364 * Returns 0 if the thread is parked, -ENOSYS if the thread exited.
365 * If called by the kthread itself just the park bit is set.
366 */
367int kthread_park(struct task_struct *k)
368{
369 struct kthread *kthread = task_get_live_kthread(k);
370 int ret = -ENOSYS;
371
372 if (kthread) {
373 if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
374 set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
375 if (k != current) {
376 wake_up_process(k);
377 wait_for_completion(&kthread->parked);
378 }
379 }
380 ret = 0;
381 }
382 put_task_struct(k);
383 return ret;
384}
385
386/**
237 * kthread_stop - stop a thread created by kthread_create(). 387 * kthread_stop - stop a thread created by kthread_create().
238 * @k: thread created by kthread_create(). 388 * @k: thread created by kthread_create().
239 * 389 *
@@ -250,16 +400,13 @@ EXPORT_SYMBOL(kthread_bind);
250 */ 400 */
251int kthread_stop(struct task_struct *k) 401int kthread_stop(struct task_struct *k)
252{ 402{
253 struct kthread *kthread; 403 struct kthread *kthread = task_get_live_kthread(k);
254 int ret; 404 int ret;
255 405
256 trace_sched_kthread_stop(k); 406 trace_sched_kthread_stop(k);
257 get_task_struct(k); 407 if (kthread) {
258 408 set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
259 kthread = to_kthread(k); 409 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
260 barrier(); /* it might have exited */
261 if (k->vfork_done != NULL) {
262 kthread->should_stop = 1;
263 wake_up_process(k); 410 wake_up_process(k);
264 wait_for_completion(&kthread->exited); 411 wait_for_completion(&kthread->exited);
265 } 412 }
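kthread_create_on_cpu() is the controller-side counterpart to the parking primitives above: the thread is created for the target CPU's node, immediately parked, and bound to the CPU when it is first unparked; across CPU offline it can be parked again instead of stopped. A hedged lifecycle sketch (my_thread_fn is the park-aware loop sketched after the kthread.h hunk; handling is simplified to a single CPU):

#include <linux/kthread.h>
#include <linux/err.h>

static int my_thread_fn(void *data);	/* the park-aware loop sketched earlier */
static struct task_struct *my_task;

static int my_start_on(unsigned int cpu)
{
	my_task = kthread_create_on_cpu(my_thread_fn, NULL, cpu, "my_worker/%u");
	if (IS_ERR(my_task))
		return PTR_ERR(my_task);
	kthread_unpark(my_task);	/* created parked; now bind it and let it run on 'cpu' */
	return 0;
}

static void my_cpu_going_down(void)
{
	kthread_park(my_task);		/* returns once the thread sits in kthread_parkme() */
}

static void my_shutdown(void)
{
	kthread_stop(my_task);		/* clears any park request, then stops the thread */
}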
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 4e6a61b15e86..29ca1c6da594 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -45,6 +45,7 @@
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/export.h> 46#include <linux/export.h>
47#include <linux/hardirq.h> 47#include <linux/hardirq.h>
48#include <linux/delay.h>
48 49
49#define CREATE_TRACE_POINTS 50#define CREATE_TRACE_POINTS
50#include <trace/events/rcu.h> 51#include <trace/events/rcu.h>
@@ -81,6 +82,9 @@ void __rcu_read_unlock(void)
81 } else { 82 } else {
82 barrier(); /* critical section before exit code. */ 83 barrier(); /* critical section before exit code. */
83 t->rcu_read_lock_nesting = INT_MIN; 84 t->rcu_read_lock_nesting = INT_MIN;
85#ifdef CONFIG_PROVE_RCU_DELAY
86 udelay(10); /* Make preemption more probable. */
87#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
84 barrier(); /* assign before ->rcu_read_unlock_special load */ 88 barrier(); /* assign before ->rcu_read_unlock_special load */
85 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 89 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
86 rcu_read_unlock_special(t); 90 rcu_read_unlock_special(t);
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 547b1fe5b052..e4c6a598d6f7 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -56,25 +56,28 @@ static void __call_rcu(struct rcu_head *head,
56static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 56static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
57 57
58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ 58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
59static void rcu_idle_enter_common(long long oldval) 59static void rcu_idle_enter_common(long long newval)
60{ 60{
61 if (rcu_dynticks_nesting) { 61 if (newval) {
62 RCU_TRACE(trace_rcu_dyntick("--=", 62 RCU_TRACE(trace_rcu_dyntick("--=",
63 oldval, rcu_dynticks_nesting)); 63 rcu_dynticks_nesting, newval));
64 rcu_dynticks_nesting = newval;
64 return; 65 return;
65 } 66 }
66 RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting)); 67 RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval));
67 if (!is_idle_task(current)) { 68 if (!is_idle_task(current)) {
68 struct task_struct *idle = idle_task(smp_processor_id()); 69 struct task_struct *idle = idle_task(smp_processor_id());
69 70
70 RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", 71 RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task",
71 oldval, rcu_dynticks_nesting)); 72 rcu_dynticks_nesting, newval));
72 ftrace_dump(DUMP_ALL); 73 ftrace_dump(DUMP_ALL);
73 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 74 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
74 current->pid, current->comm, 75 current->pid, current->comm,
75 idle->pid, idle->comm); /* must be idle task! */ 76 idle->pid, idle->comm); /* must be idle task! */
76 } 77 }
77 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ 78 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
79 barrier();
80 rcu_dynticks_nesting = newval;
78} 81}
79 82
80/* 83/*
@@ -84,17 +87,16 @@ static void rcu_idle_enter_common(long long oldval)
84void rcu_idle_enter(void) 87void rcu_idle_enter(void)
85{ 88{
86 unsigned long flags; 89 unsigned long flags;
87 long long oldval; 90 long long newval;
88 91
89 local_irq_save(flags); 92 local_irq_save(flags);
90 oldval = rcu_dynticks_nesting;
91 WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); 93 WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
92 if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 94 if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) ==
93 DYNTICK_TASK_NEST_VALUE) 95 DYNTICK_TASK_NEST_VALUE)
94 rcu_dynticks_nesting = 0; 96 newval = 0;
95 else 97 else
96 rcu_dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; 98 newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE;
97 rcu_idle_enter_common(oldval); 99 rcu_idle_enter_common(newval);
98 local_irq_restore(flags); 100 local_irq_restore(flags);
99} 101}
100EXPORT_SYMBOL_GPL(rcu_idle_enter); 102EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -105,15 +107,15 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
105void rcu_irq_exit(void) 107void rcu_irq_exit(void)
106{ 108{
107 unsigned long flags; 109 unsigned long flags;
108 long long oldval; 110 long long newval;
109 111
110 local_irq_save(flags); 112 local_irq_save(flags);
111 oldval = rcu_dynticks_nesting; 113 newval = rcu_dynticks_nesting - 1;
112 rcu_dynticks_nesting--; 114 WARN_ON_ONCE(newval < 0);
113 WARN_ON_ONCE(rcu_dynticks_nesting < 0); 115 rcu_idle_enter_common(newval);
114 rcu_idle_enter_common(oldval);
115 local_irq_restore(flags); 116 local_irq_restore(flags);
116} 117}
118EXPORT_SYMBOL_GPL(rcu_irq_exit);
117 119
118/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */ 120/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */
119static void rcu_idle_exit_common(long long oldval) 121static void rcu_idle_exit_common(long long oldval)
@@ -171,6 +173,7 @@ void rcu_irq_enter(void)
171 rcu_idle_exit_common(oldval); 173 rcu_idle_exit_common(oldval);
172 local_irq_restore(flags); 174 local_irq_restore(flags);
173} 175}
176EXPORT_SYMBOL_GPL(rcu_irq_enter);
174 177
175#ifdef CONFIG_DEBUG_LOCK_ALLOC 178#ifdef CONFIG_DEBUG_LOCK_ALLOC
176 179
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 918fd1e8509c..3d0190282204 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -278,7 +278,7 @@ static int rcu_boost(void)
278 rcu_preempt_ctrlblk.exp_tasks == NULL) 278 rcu_preempt_ctrlblk.exp_tasks == NULL)
279 return 0; /* Nothing to boost. */ 279 return 0; /* Nothing to boost. */
280 280
281 raw_local_irq_save(flags); 281 local_irq_save(flags);
282 282
283 /* 283 /*
284 * Recheck with irqs disabled: all tasks in need of boosting 284 * Recheck with irqs disabled: all tasks in need of boosting
@@ -287,7 +287,7 @@ static int rcu_boost(void)
287 */ 287 */
288 if (rcu_preempt_ctrlblk.boost_tasks == NULL && 288 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
289 rcu_preempt_ctrlblk.exp_tasks == NULL) { 289 rcu_preempt_ctrlblk.exp_tasks == NULL) {
290 raw_local_irq_restore(flags); 290 local_irq_restore(flags);
291 return 0; 291 return 0;
292 } 292 }
293 293
@@ -317,7 +317,7 @@ static int rcu_boost(void)
317 t = container_of(tb, struct task_struct, rcu_node_entry); 317 t = container_of(tb, struct task_struct, rcu_node_entry);
318 rt_mutex_init_proxy_locked(&mtx, t); 318 rt_mutex_init_proxy_locked(&mtx, t);
319 t->rcu_boost_mutex = &mtx; 319 t->rcu_boost_mutex = &mtx;
320 raw_local_irq_restore(flags); 320 local_irq_restore(flags);
321 rt_mutex_lock(&mtx); 321 rt_mutex_lock(&mtx);
322 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 322 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
323 323
@@ -991,9 +991,9 @@ static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
991{ 991{
992 unsigned long flags; 992 unsigned long flags;
993 993
994 raw_local_irq_save(flags); 994 local_irq_save(flags);
995 rcp->qlen -= n; 995 rcp->qlen -= n;
996 raw_local_irq_restore(flags); 996 local_irq_restore(flags);
997} 997}
998 998
999/* 999/*
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 25b15033c61f..aaa7b9f3532a 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -53,10 +53,11 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@fre
53 53
54static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ 54static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
55static int nfakewriters = 4; /* # fake writer threads */ 55static int nfakewriters = 4; /* # fake writer threads */
56static int stat_interval; /* Interval between stats, in seconds. */ 56static int stat_interval = 60; /* Interval between stats, in seconds. */
57 /* Defaults to "only at end of test". */ 57 /* Zero means "only at end of test". */
58static bool verbose; /* Print more debug info. */ 58static bool verbose; /* Print more debug info. */
59static bool test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ 59static bool test_no_idle_hz = true;
60 /* Test RCU support for tickless idle CPUs. */
60static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
61static int stutter = 5; /* Start/stop testing interval (in sec) */ 62static int stutter = 5; /* Start/stop testing interval (in sec) */
62static int irqreader = 1; /* RCU readers from irq (timers). */ 63static int irqreader = 1; /* RCU readers from irq (timers). */
@@ -119,11 +120,11 @@ MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
119 120
120#define TORTURE_FLAG "-torture:" 121#define TORTURE_FLAG "-torture:"
121#define PRINTK_STRING(s) \ 122#define PRINTK_STRING(s) \
122 do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) 123 do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0)
123#define VERBOSE_PRINTK_STRING(s) \ 124#define VERBOSE_PRINTK_STRING(s) \
124 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) 125 do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0)
125#define VERBOSE_PRINTK_ERRSTRING(s) \ 126#define VERBOSE_PRINTK_ERRSTRING(s) \
126 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) 127 do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
127 128
128static char printk_buf[4096]; 129static char printk_buf[4096];
129 130
@@ -176,8 +177,14 @@ static long n_rcu_torture_boosts;
176static long n_rcu_torture_timers; 177static long n_rcu_torture_timers;
177static long n_offline_attempts; 178static long n_offline_attempts;
178static long n_offline_successes; 179static long n_offline_successes;
180static unsigned long sum_offline;
181static int min_offline = -1;
182static int max_offline;
179static long n_online_attempts; 183static long n_online_attempts;
180static long n_online_successes; 184static long n_online_successes;
185static unsigned long sum_online;
186static int min_online = -1;
187static int max_online;
181static long n_barrier_attempts; 188static long n_barrier_attempts;
182static long n_barrier_successes; 189static long n_barrier_successes;
183static struct list_head rcu_torture_removed; 190static struct list_head rcu_torture_removed;
@@ -235,7 +242,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1,
235 if (fullstop == FULLSTOP_DONTSTOP) 242 if (fullstop == FULLSTOP_DONTSTOP)
236 fullstop = FULLSTOP_SHUTDOWN; 243 fullstop = FULLSTOP_SHUTDOWN;
237 else 244 else
238 printk(KERN_WARNING /* but going down anyway, so... */ 245 pr_warn(/* but going down anyway, so... */
239 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); 246 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
240 mutex_unlock(&fullstop_mutex); 247 mutex_unlock(&fullstop_mutex);
241 return NOTIFY_DONE; 248 return NOTIFY_DONE;
@@ -248,7 +255,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1,
248static void rcutorture_shutdown_absorb(char *title) 255static void rcutorture_shutdown_absorb(char *title)
249{ 256{
250 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { 257 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
251 printk(KERN_NOTICE 258 pr_notice(
252 "rcutorture thread %s parking due to system shutdown\n", 259 "rcutorture thread %s parking due to system shutdown\n",
253 title); 260 title);
254 schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); 261 schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
@@ -1214,11 +1221,13 @@ rcu_torture_printk(char *page)
1214 n_rcu_torture_boost_failure, 1221 n_rcu_torture_boost_failure,
1215 n_rcu_torture_boosts, 1222 n_rcu_torture_boosts,
1216 n_rcu_torture_timers); 1223 n_rcu_torture_timers);
1217 cnt += sprintf(&page[cnt], "onoff: %ld/%ld:%ld/%ld ", 1224 cnt += sprintf(&page[cnt],
1218 n_online_successes, 1225 "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
1219 n_online_attempts, 1226 n_online_successes, n_online_attempts,
1220 n_offline_successes, 1227 n_offline_successes, n_offline_attempts,
1221 n_offline_attempts); 1228 min_online, max_online,
1229 min_offline, max_offline,
1230 sum_online, sum_offline, HZ);
1222 cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", 1231 cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld",
1223 n_barrier_successes, 1232 n_barrier_successes,
1224 n_barrier_attempts, 1233 n_barrier_attempts,
@@ -1267,7 +1276,7 @@ rcu_torture_stats_print(void)
1267 int cnt; 1276 int cnt;
1268 1277
1269 cnt = rcu_torture_printk(printk_buf); 1278 cnt = rcu_torture_printk(printk_buf);
1270 printk(KERN_ALERT "%s", printk_buf); 1279 pr_alert("%s", printk_buf);
1271} 1280}
1272 1281
1273/* 1282/*
@@ -1380,20 +1389,20 @@ rcu_torture_stutter(void *arg)
1380static inline void 1389static inline void
1381rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) 1390rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1382{ 1391{
1383 printk(KERN_ALERT "%s" TORTURE_FLAG 1392 pr_alert("%s" TORTURE_FLAG
1384 "--- %s: nreaders=%d nfakewriters=%d " 1393 "--- %s: nreaders=%d nfakewriters=%d "
1385 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1394 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1386 "shuffle_interval=%d stutter=%d irqreader=%d " 1395 "shuffle_interval=%d stutter=%d irqreader=%d "
1387 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " 1396 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1388 "test_boost=%d/%d test_boost_interval=%d " 1397 "test_boost=%d/%d test_boost_interval=%d "
1389 "test_boost_duration=%d shutdown_secs=%d " 1398 "test_boost_duration=%d shutdown_secs=%d "
1390 "onoff_interval=%d onoff_holdoff=%d\n", 1399 "onoff_interval=%d onoff_holdoff=%d\n",
1391 torture_type, tag, nrealreaders, nfakewriters, 1400 torture_type, tag, nrealreaders, nfakewriters,
1392 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1401 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1393 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, 1402 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1394 test_boost, cur_ops->can_boost, 1403 test_boost, cur_ops->can_boost,
1395 test_boost_interval, test_boost_duration, shutdown_secs, 1404 test_boost_interval, test_boost_duration, shutdown_secs,
1396 onoff_interval, onoff_holdoff); 1405 onoff_interval, onoff_holdoff);
1397} 1406}
1398 1407
1399static struct notifier_block rcutorture_shutdown_nb = { 1408static struct notifier_block rcutorture_shutdown_nb = {
@@ -1460,9 +1469,9 @@ rcu_torture_shutdown(void *arg)
1460 !kthread_should_stop()) { 1469 !kthread_should_stop()) {
1461 delta = shutdown_time - jiffies_snap; 1470 delta = shutdown_time - jiffies_snap;
1462 if (verbose) 1471 if (verbose)
1463 printk(KERN_ALERT "%s" TORTURE_FLAG 1472 pr_alert("%s" TORTURE_FLAG
1464 "rcu_torture_shutdown task: %lu jiffies remaining\n", 1473 "rcu_torture_shutdown task: %lu jiffies remaining\n",
1465 torture_type, delta); 1474 torture_type, delta);
1466 schedule_timeout_interruptible(delta); 1475 schedule_timeout_interruptible(delta);
1467 jiffies_snap = ACCESS_ONCE(jiffies); 1476 jiffies_snap = ACCESS_ONCE(jiffies);
1468 } 1477 }
@@ -1490,8 +1499,10 @@ static int __cpuinit
1490rcu_torture_onoff(void *arg) 1499rcu_torture_onoff(void *arg)
1491{ 1500{
1492 int cpu; 1501 int cpu;
1502 unsigned long delta;
1493 int maxcpu = -1; 1503 int maxcpu = -1;
1494 DEFINE_RCU_RANDOM(rand); 1504 DEFINE_RCU_RANDOM(rand);
1505 unsigned long starttime;
1495 1506
1496 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); 1507 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
1497 for_each_online_cpu(cpu) 1508 for_each_online_cpu(cpu)
@@ -1506,29 +1517,51 @@ rcu_torture_onoff(void *arg)
1506 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); 1517 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);
1507 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { 1518 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
1508 if (verbose) 1519 if (verbose)
1509 printk(KERN_ALERT "%s" TORTURE_FLAG 1520 pr_alert("%s" TORTURE_FLAG
1510 "rcu_torture_onoff task: offlining %d\n", 1521 "rcu_torture_onoff task: offlining %d\n",
1511 torture_type, cpu); 1522 torture_type, cpu);
1523 starttime = jiffies;
1512 n_offline_attempts++; 1524 n_offline_attempts++;
1513 if (cpu_down(cpu) == 0) { 1525 if (cpu_down(cpu) == 0) {
1514 if (verbose) 1526 if (verbose)
1515 printk(KERN_ALERT "%s" TORTURE_FLAG 1527 pr_alert("%s" TORTURE_FLAG
1516 "rcu_torture_onoff task: offlined %d\n", 1528 "rcu_torture_onoff task: offlined %d\n",
1517 torture_type, cpu); 1529 torture_type, cpu);
1518 n_offline_successes++; 1530 n_offline_successes++;
1531 delta = jiffies - starttime;
1532 sum_offline += delta;
1533 if (min_offline < 0) {
1534 min_offline = delta;
1535 max_offline = delta;
1536 }
1537 if (min_offline > delta)
1538 min_offline = delta;
1539 if (max_offline < delta)
1540 max_offline = delta;
1519 } 1541 }
1520 } else if (cpu_is_hotpluggable(cpu)) { 1542 } else if (cpu_is_hotpluggable(cpu)) {
1521 if (verbose) 1543 if (verbose)
1522 printk(KERN_ALERT "%s" TORTURE_FLAG 1544 pr_alert("%s" TORTURE_FLAG
1523 "rcu_torture_onoff task: onlining %d\n", 1545 "rcu_torture_onoff task: onlining %d\n",
1524 torture_type, cpu); 1546 torture_type, cpu);
1547 starttime = jiffies;
1525 n_online_attempts++; 1548 n_online_attempts++;
1526 if (cpu_up(cpu) == 0) { 1549 if (cpu_up(cpu) == 0) {
1527 if (verbose) 1550 if (verbose)
1528 printk(KERN_ALERT "%s" TORTURE_FLAG 1551 pr_alert("%s" TORTURE_FLAG
1529 "rcu_torture_onoff task: onlined %d\n", 1552 "rcu_torture_onoff task: onlined %d\n",
1530 torture_type, cpu); 1553 torture_type, cpu);
1531 n_online_successes++; 1554 n_online_successes++;
1555 delta = jiffies - starttime;
1556 sum_online += delta;
1557 if (min_online < 0) {
1558 min_online = delta;
1559 max_online = delta;
1560 }
1561 if (min_online > delta)
1562 min_online = delta;
1563 if (max_online < delta)
1564 max_online = delta;
1532 } 1565 }
1533 } 1566 }
1534 schedule_timeout_interruptible(onoff_interval * HZ); 1567 schedule_timeout_interruptible(onoff_interval * HZ);
@@ -1593,14 +1626,14 @@ static int __cpuinit rcu_torture_stall(void *args)
1593 if (!kthread_should_stop()) { 1626 if (!kthread_should_stop()) {
1594 stop_at = get_seconds() + stall_cpu; 1627 stop_at = get_seconds() + stall_cpu;
1595 /* RCU CPU stall is expected behavior in following code. */ 1628 /* RCU CPU stall is expected behavior in following code. */
1596 printk(KERN_ALERT "rcu_torture_stall start.\n"); 1629 pr_alert("rcu_torture_stall start.\n");
1597 rcu_read_lock(); 1630 rcu_read_lock();
1598 preempt_disable(); 1631 preempt_disable();
1599 while (ULONG_CMP_LT(get_seconds(), stop_at)) 1632 while (ULONG_CMP_LT(get_seconds(), stop_at))
1600 continue; /* Induce RCU CPU stall warning. */ 1633 continue; /* Induce RCU CPU stall warning. */
1601 preempt_enable(); 1634 preempt_enable();
1602 rcu_read_unlock(); 1635 rcu_read_unlock();
1603 printk(KERN_ALERT "rcu_torture_stall end.\n"); 1636 pr_alert("rcu_torture_stall end.\n");
1604 } 1637 }
1605 rcutorture_shutdown_absorb("rcu_torture_stall"); 1638 rcutorture_shutdown_absorb("rcu_torture_stall");
1606 while (!kthread_should_stop()) 1639 while (!kthread_should_stop())
@@ -1716,12 +1749,12 @@ static int rcu_torture_barrier_init(void)
1716 if (n_barrier_cbs == 0) 1749 if (n_barrier_cbs == 0)
1717 return 0; 1750 return 0;
1718 if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { 1751 if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) {
1719 printk(KERN_ALERT "%s" TORTURE_FLAG 1752 pr_alert("%s" TORTURE_FLAG
1720 " Call or barrier ops missing for %s,\n", 1753 " Call or barrier ops missing for %s,\n",
1721 torture_type, cur_ops->name); 1754 torture_type, cur_ops->name);
1722 printk(KERN_ALERT "%s" TORTURE_FLAG 1755 pr_alert("%s" TORTURE_FLAG
1723 " RCU barrier testing omitted from run.\n", 1756 " RCU barrier testing omitted from run.\n",
1724 torture_type); 1757 torture_type);
1725 return 0; 1758 return 0;
1726 } 1759 }
1727 atomic_set(&barrier_cbs_count, 0); 1760 atomic_set(&barrier_cbs_count, 0);
@@ -1814,7 +1847,7 @@ rcu_torture_cleanup(void)
1814 mutex_lock(&fullstop_mutex); 1847 mutex_lock(&fullstop_mutex);
1815 rcutorture_record_test_transition(); 1848 rcutorture_record_test_transition();
1816 if (fullstop == FULLSTOP_SHUTDOWN) { 1849 if (fullstop == FULLSTOP_SHUTDOWN) {
1817 printk(KERN_WARNING /* but going down anyway, so... */ 1850 pr_warn(/* but going down anyway, so... */
1818 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); 1851 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
1819 mutex_unlock(&fullstop_mutex); 1852 mutex_unlock(&fullstop_mutex);
1820 schedule_timeout_uninterruptible(10); 1853 schedule_timeout_uninterruptible(10);
@@ -1938,17 +1971,17 @@ rcu_torture_init(void)
1938 break; 1971 break;
1939 } 1972 }
1940 if (i == ARRAY_SIZE(torture_ops)) { 1973 if (i == ARRAY_SIZE(torture_ops)) {
1941 printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n", 1974 pr_alert("rcu-torture: invalid torture type: \"%s\"\n",
1942 torture_type); 1975 torture_type);
1943 printk(KERN_ALERT "rcu-torture types:"); 1976 pr_alert("rcu-torture types:");
1944 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) 1977 for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
1945 printk(KERN_ALERT " %s", torture_ops[i]->name); 1978 pr_alert(" %s", torture_ops[i]->name);
1946 printk(KERN_ALERT "\n"); 1979 pr_alert("\n");
1947 mutex_unlock(&fullstop_mutex); 1980 mutex_unlock(&fullstop_mutex);
1948 return -EINVAL; 1981 return -EINVAL;
1949 } 1982 }
1950 if (cur_ops->fqs == NULL && fqs_duration != 0) { 1983 if (cur_ops->fqs == NULL && fqs_duration != 0) {
1951 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); 1984 pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n");
1952 fqs_duration = 0; 1985 fqs_duration = 0;
1953 } 1986 }
1954 if (cur_ops->init) 1987 if (cur_ops->init)
@@ -1996,14 +2029,15 @@ rcu_torture_init(void)
1996 /* Start up the kthreads. */ 2029 /* Start up the kthreads. */
1997 2030
1998 VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); 2031 VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task");
1999 writer_task = kthread_run(rcu_torture_writer, NULL, 2032 writer_task = kthread_create(rcu_torture_writer, NULL,
2000 "rcu_torture_writer"); 2033 "rcu_torture_writer");
2001 if (IS_ERR(writer_task)) { 2034 if (IS_ERR(writer_task)) {
2002 firsterr = PTR_ERR(writer_task); 2035 firsterr = PTR_ERR(writer_task);
2003 VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); 2036 VERBOSE_PRINTK_ERRSTRING("Failed to create writer");
2004 writer_task = NULL; 2037 writer_task = NULL;
2005 goto unwind; 2038 goto unwind;
2006 } 2039 }
2040 wake_up_process(writer_task);
2007 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), 2041 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
2008 GFP_KERNEL); 2042 GFP_KERNEL);
2009 if (fakewriter_tasks == NULL) { 2043 if (fakewriter_tasks == NULL) {
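The writer and shutdown kthreads above switch from kthread_run() to kthread_create() followed by a separate wake_up_process(), so the new thread is not set running until the caller explicitly releases it (kthread_run() is simply those two calls fused together). A small module-style sketch of the split pattern; the demo_* names are hypothetical and not taken from rcutorture:

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/err.h>

static struct task_struct *demo_task;

static int demo_thread(void *unused)
{
        while (!kthread_should_stop())
                msleep(100);            /* stand-in for the real work loop */
        return 0;
}

static int __init demo_init(void)
{
        /* Create the thread in a stopped state... */
        demo_task = kthread_create(demo_thread, NULL, "demo_thread");
        if (IS_ERR(demo_task))
                return PTR_ERR(demo_task);

        /* ...finish any setup the thread depends on, then let it run. */
        wake_up_process(demo_task);
        return 0;
}

static void __exit demo_exit(void)
{
        kthread_stop(demo_task);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

Deferring the wakeup gives the creator a window to complete initialization before the thread's first iteration can observe it.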
@@ -2118,14 +2152,15 @@ rcu_torture_init(void)
2118 } 2152 }
2119 if (shutdown_secs > 0) { 2153 if (shutdown_secs > 0) {
2120 shutdown_time = jiffies + shutdown_secs * HZ; 2154 shutdown_time = jiffies + shutdown_secs * HZ;
2121 shutdown_task = kthread_run(rcu_torture_shutdown, NULL, 2155 shutdown_task = kthread_create(rcu_torture_shutdown, NULL,
2122 "rcu_torture_shutdown"); 2156 "rcu_torture_shutdown");
2123 if (IS_ERR(shutdown_task)) { 2157 if (IS_ERR(shutdown_task)) {
2124 firsterr = PTR_ERR(shutdown_task); 2158 firsterr = PTR_ERR(shutdown_task);
2125 VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); 2159 VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown");
2126 shutdown_task = NULL; 2160 shutdown_task = NULL;
2127 goto unwind; 2161 goto unwind;
2128 } 2162 }
2163 wake_up_process(shutdown_task);
2129 } 2164 }
2130 i = rcu_torture_onoff_init(); 2165 i = rcu_torture_onoff_init();
2131 if (i != 0) { 2166 if (i != 0) {
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index f280e542e3e9..4fb2376ddf06 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -52,6 +52,7 @@
52#include <linux/prefetch.h> 52#include <linux/prefetch.h>
53#include <linux/delay.h> 53#include <linux/delay.h>
54#include <linux/stop_machine.h> 54#include <linux/stop_machine.h>
55#include <linux/random.h>
55 56
56#include "rcutree.h" 57#include "rcutree.h"
57#include <trace/events/rcu.h> 58#include <trace/events/rcu.h>
@@ -61,6 +62,7 @@
61/* Data structures. */ 62/* Data structures. */
62 63
63static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 64static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
65static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
64 66
65#define RCU_STATE_INITIALIZER(sname, cr) { \ 67#define RCU_STATE_INITIALIZER(sname, cr) { \
66 .level = { &sname##_state.node[0] }, \ 68 .level = { &sname##_state.node[0] }, \
@@ -72,7 +74,6 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
72 .orphan_nxttail = &sname##_state.orphan_nxtlist, \ 74 .orphan_nxttail = &sname##_state.orphan_nxtlist, \
73 .orphan_donetail = &sname##_state.orphan_donelist, \ 75 .orphan_donetail = &sname##_state.orphan_donelist, \
74 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
75 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \
76 .name = #sname, \ 77 .name = #sname, \
77} 78}
78 79
@@ -88,7 +89,7 @@ LIST_HEAD(rcu_struct_flavors);
88 89
89/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ 90/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
90static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; 91static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
91module_param(rcu_fanout_leaf, int, 0); 92module_param(rcu_fanout_leaf, int, 0444);
92int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; 93int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
93static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ 94static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */
94 NUM_RCU_LVL_0, 95 NUM_RCU_LVL_0,
@@ -133,13 +134,12 @@ static int rcu_scheduler_fully_active __read_mostly;
133 */ 134 */
134static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); 135static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
135DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); 136DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
136DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
137DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); 137DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
138DEFINE_PER_CPU(char, rcu_cpu_has_work); 138DEFINE_PER_CPU(char, rcu_cpu_has_work);
139 139
140#endif /* #ifdef CONFIG_RCU_BOOST */ 140#endif /* #ifdef CONFIG_RCU_BOOST */
141 141
142static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); 142static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
143static void invoke_rcu_core(void); 143static void invoke_rcu_core(void);
144static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 144static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
145 145
@@ -175,8 +175,6 @@ void rcu_sched_qs(int cpu)
175{ 175{
176 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); 176 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
177 177
178 rdp->passed_quiesce_gpnum = rdp->gpnum;
179 barrier();
180 if (rdp->passed_quiesce == 0) 178 if (rdp->passed_quiesce == 0)
181 trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); 179 trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs");
182 rdp->passed_quiesce = 1; 180 rdp->passed_quiesce = 1;
@@ -186,8 +184,6 @@ void rcu_bh_qs(int cpu)
186{ 184{
187 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 185 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
188 186
189 rdp->passed_quiesce_gpnum = rdp->gpnum;
190 barrier();
191 if (rdp->passed_quiesce == 0) 187 if (rdp->passed_quiesce == 0)
192 trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); 188 trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs");
193 rdp->passed_quiesce = 1; 189 rdp->passed_quiesce = 1;
@@ -210,15 +206,18 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
210DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 206DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
211 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, 207 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
212 .dynticks = ATOMIC_INIT(1), 208 .dynticks = ATOMIC_INIT(1),
209#if defined(CONFIG_RCU_USER_QS) && !defined(CONFIG_RCU_USER_QS_FORCE)
210 .ignore_user_qs = true,
211#endif
213}; 212};
214 213
215static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 214static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */
216static int qhimark = 10000; /* If this many pending, ignore blimit. */ 215static int qhimark = 10000; /* If this many pending, ignore blimit. */
217static int qlowmark = 100; /* Once only this many pending, use blimit. */ 216static int qlowmark = 100; /* Once only this many pending, use blimit. */
218 217
219module_param(blimit, int, 0); 218module_param(blimit, int, 0444);
220module_param(qhimark, int, 0); 219module_param(qhimark, int, 0444);
221module_param(qlowmark, int, 0); 220module_param(qlowmark, int, 0444);
222 221
223int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ 222int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
224int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; 223int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
@@ -226,7 +225,14 @@ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
226module_param(rcu_cpu_stall_suppress, int, 0644); 225module_param(rcu_cpu_stall_suppress, int, 0644);
227module_param(rcu_cpu_stall_timeout, int, 0644); 226module_param(rcu_cpu_stall_timeout, int, 0644);
228 227
229static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 228static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS;
229static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;
230
231module_param(jiffies_till_first_fqs, ulong, 0644);
232module_param(jiffies_till_next_fqs, ulong, 0644);
233
234static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *));
235static void force_quiescent_state(struct rcu_state *rsp);
230static int rcu_pending(int cpu); 236static int rcu_pending(int cpu);
231 237
232/* 238/*
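Two knob-related details in the hunks above: the existing blimit/qhimark/qlowmark parameters move from permission 0 to 0444, and the new jiffies_till_first_fqs/jiffies_till_next_fqs parameters are created 0644. The third argument of module_param() is the sysfs mode: 0 keeps the parameter out of sysfs entirely, 0444 exposes it read-only under /sys/module/<name>/parameters/, and 0644 additionally lets root change it at runtime. A hedged sketch with hypothetical parameters:

#include <linux/module.h>
#include <linux/moduleparam.h>

static int batch_limit = 10;            /* visible, but read-only via sysfs */
module_param(batch_limit, int, 0444);
MODULE_PARM_DESC(batch_limit, "Maximum callbacks per batch");

static ulong first_fqs_delay = 3;       /* root can retune this at runtime */
module_param(first_fqs_delay, ulong, 0644);
MODULE_PARM_DESC(first_fqs_delay, "Jiffies to wait before the first forcing pass");

MODULE_LICENSE("GPL");

With mode 0644 a write such as echo 5 > /sys/module/<name>/parameters/first_fqs_delay takes effect the next time the variable is read, matching how rcu_cpu_stall_suppress and rcu_cpu_stall_timeout above are already exported.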
@@ -252,7 +258,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
252 */ 258 */
253void rcu_bh_force_quiescent_state(void) 259void rcu_bh_force_quiescent_state(void)
254{ 260{
255 force_quiescent_state(&rcu_bh_state, 0); 261 force_quiescent_state(&rcu_bh_state);
256} 262}
257EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); 263EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
258 264
@@ -286,7 +292,7 @@ EXPORT_SYMBOL_GPL(rcutorture_record_progress);
286 */ 292 */
287void rcu_sched_force_quiescent_state(void) 293void rcu_sched_force_quiescent_state(void)
288{ 294{
289 force_quiescent_state(&rcu_sched_state, 0); 295 force_quiescent_state(&rcu_sched_state);
290} 296}
291EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); 297EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
292 298
@@ -305,7 +311,9 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
305static int 311static int
306cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 312cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
307{ 313{
308 return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp); 314 return *rdp->nxttail[RCU_DONE_TAIL +
315 ACCESS_ONCE(rsp->completed) != rdp->completed] &&
316 !rcu_gp_in_progress(rsp);
309} 317}
310 318
311/* 319/*
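The reworked cpu_needs_another_gp() above leans on a C idiom: a comparison such as rsp->completed != rdp->completed evaluates to exactly 0 or 1, so adding it to RCU_DONE_TAIL selects either the done segment or the following wait segment of the callback list. A tiny standalone illustration of the idiom in plain userspace C, with the segment names used only as labels:

#include <stdio.h>

int main(void)
{
        const char *segment[2] = { "RCU_DONE_TAIL", "RCU_WAIT_TAIL" };
        unsigned long rsp_completed = 42, rdp_completed = 41;

        /* (a != b) is 0 when the values match and 1 otherwise,
         * so it can be used directly as an array index. */
        printf("checking tail: %s\n", segment[rsp_completed != rdp_completed]);
        return 0;
}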
@@ -317,45 +325,17 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
317} 325}
318 326
319/* 327/*
320 * If the specified CPU is offline, tell the caller that it is in 328 * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state
321 * a quiescent state. Otherwise, whack it with a reschedule IPI.
322 * Grace periods can end up waiting on an offline CPU when that
323 * CPU is in the process of coming online -- it will be added to the
324 * rcu_node bitmasks before it actually makes it online. The same thing
325 * can happen while a CPU is in the process of coming online. Because this
326 * race is quite rare, we check for it after detecting that the grace
327 * period has been delayed rather than checking each and every CPU
328 * each and every time we start a new grace period.
329 */
330static int rcu_implicit_offline_qs(struct rcu_data *rdp)
331{
332 /*
333 * If the CPU is offline for more than a jiffy, it is in a quiescent
334 * state. We can trust its state not to change because interrupts
335 * are disabled. The reason for the jiffy's worth of slack is to
336 * handle CPUs initializing on the way up and finding their way
337 * to the idle loop on the way down.
338 */
339 if (cpu_is_offline(rdp->cpu) &&
340 ULONG_CMP_LT(rdp->rsp->gp_start + 2, jiffies)) {
341 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
342 rdp->offline_fqs++;
343 return 1;
344 }
345 return 0;
346}
347
348/*
349 * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle
350 * 329 *
351 * If the new value of the ->dynticks_nesting counter now is zero, 330 * If the new value of the ->dynticks_nesting counter now is zero,
352 * we really have entered idle, and must do the appropriate accounting. 331 * we really have entered idle, and must do the appropriate accounting.
353 * The caller must have disabled interrupts. 332 * The caller must have disabled interrupts.
354 */ 333 */
355static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) 334static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
335 bool user)
356{ 336{
357 trace_rcu_dyntick("Start", oldval, 0); 337 trace_rcu_dyntick("Start", oldval, 0);
358 if (!is_idle_task(current)) { 338 if (!user && !is_idle_task(current)) {
359 struct task_struct *idle = idle_task(smp_processor_id()); 339 struct task_struct *idle = idle_task(smp_processor_id());
360 340
361 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); 341 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
@@ -372,7 +352,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
372 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 352 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
373 353
374 /* 354 /*
375 * The idle task is not permitted to enter the idle loop while 355 * It is illegal to enter an extended quiescent state while
376 * in an RCU read-side critical section. 356 * in an RCU read-side critical section.
377 */ 357 */
378 rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), 358 rcu_lockdep_assert(!lock_is_held(&rcu_lock_map),
@@ -383,6 +363,25 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
383 "Illegal idle entry in RCU-sched read-side critical section."); 363 "Illegal idle entry in RCU-sched read-side critical section.");
384} 364}
385 365
366/*
367 * Enter an RCU extended quiescent state, which can be either the
368 * idle loop or adaptive-tickless usermode execution.
369 */
370static void rcu_eqs_enter(bool user)
371{
372 long long oldval;
373 struct rcu_dynticks *rdtp;
374
375 rdtp = &__get_cpu_var(rcu_dynticks);
376 oldval = rdtp->dynticks_nesting;
377 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
378 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
379 rdtp->dynticks_nesting = 0;
380 else
381 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
382 rcu_eqs_enter_common(rdtp, oldval, user);
383}
384
386/** 385/**
387 * rcu_idle_enter - inform RCU that current CPU is entering idle 386 * rcu_idle_enter - inform RCU that current CPU is entering idle
388 * 387 *
@@ -398,21 +397,70 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
398void rcu_idle_enter(void) 397void rcu_idle_enter(void)
399{ 398{
400 unsigned long flags; 399 unsigned long flags;
401 long long oldval; 400
401 local_irq_save(flags);
402 rcu_eqs_enter(false);
403 local_irq_restore(flags);
404}
405EXPORT_SYMBOL_GPL(rcu_idle_enter);
406
407#ifdef CONFIG_RCU_USER_QS
408/**
409 * rcu_user_enter - inform RCU that we are resuming userspace.
410 *
411 * Enter RCU idle mode right before resuming userspace. No use of RCU
412 * is permitted between this call and rcu_user_exit(). This way the
413 * CPU doesn't need to maintain the tick for RCU maintenance purposes
414 * when the CPU runs in userspace.
415 */
416void rcu_user_enter(void)
417{
418 unsigned long flags;
402 struct rcu_dynticks *rdtp; 419 struct rcu_dynticks *rdtp;
403 420
421 /*
422 * Some contexts may involve an exception occurring in an irq,
423 * leading to that nesting:
424 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
425 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
426 * helpers are enough to protect RCU uses inside the exception. So
427 * just return immediately if we detect we are in an IRQ.
428 */
429 if (in_interrupt())
430 return;
431
432 WARN_ON_ONCE(!current->mm);
433
404 local_irq_save(flags); 434 local_irq_save(flags);
405 rdtp = &__get_cpu_var(rcu_dynticks); 435 rdtp = &__get_cpu_var(rcu_dynticks);
406 oldval = rdtp->dynticks_nesting; 436 if (!rdtp->ignore_user_qs && !rdtp->in_user) {
407 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); 437 rdtp->in_user = true;
408 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) 438 rcu_eqs_enter(true);
409 rdtp->dynticks_nesting = 0; 439 }
410 else
411 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
412 rcu_idle_enter_common(rdtp, oldval);
413 local_irq_restore(flags); 440 local_irq_restore(flags);
414} 441}
415EXPORT_SYMBOL_GPL(rcu_idle_enter); 442
443/**
444 * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace
445 * after the current irq returns.
446 *
447 * This is similar to rcu_user_enter() but in the context of a non-nesting
448 * irq. After this call, RCU enters into idle mode when the interrupt
449 * returns.
450 */
451void rcu_user_enter_after_irq(void)
452{
453 unsigned long flags;
454 struct rcu_dynticks *rdtp;
455
456 local_irq_save(flags);
457 rdtp = &__get_cpu_var(rcu_dynticks);
458 /* Ensure this irq is interrupting a non-idle RCU state. */
459 WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK));
460 rdtp->dynticks_nesting = 1;
461 local_irq_restore(flags);
462}
463#endif /* CONFIG_RCU_USER_QS */
416 464
417/** 465/**
418 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle 466 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
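rcu_user_enter() above (with rcu_user_exit() further down) gives architectures a way to tell RCU that a CPU is about to run, or has just stopped running, userspace code, so user-mode execution can be treated as an extended quiescent state. The sketch below is schematic only: the demo_* hooks are hypothetical stand-ins for the x86 syscall/exception glue added elsewhere in this series, and it assumes the declarations are visible via linux/rcupdate.h:

#include <linux/rcupdate.h>

/* Hypothetical arch glue; real kernels wire this into the actual
 * syscall and exception entry/exit paths instead. */
static void demo_enter_kernel_from_user(void)
{
        rcu_user_exit();        /* RCU is watching again; rcu_read_lock() is legal */
}

static void demo_return_to_user(void)
{
        /* After this point, and until the next rcu_user_exit(), no RCU
         * read-side critical section may run on this CPU. */
        rcu_user_enter();
}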
@@ -444,18 +492,19 @@ void rcu_irq_exit(void)
444 if (rdtp->dynticks_nesting) 492 if (rdtp->dynticks_nesting)
445 trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); 493 trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting);
446 else 494 else
447 rcu_idle_enter_common(rdtp, oldval); 495 rcu_eqs_enter_common(rdtp, oldval, true);
448 local_irq_restore(flags); 496 local_irq_restore(flags);
449} 497}
450 498
451/* 499/*
452 * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle 500 * rcu_eqs_exit_common - current CPU moving away from extended quiescent state
453 * 501 *
454 * If the new value of the ->dynticks_nesting counter was previously zero, 502 * If the new value of the ->dynticks_nesting counter was previously zero,
455 * we really have exited idle, and must do the appropriate accounting. 503 * we really have exited idle, and must do the appropriate accounting.
456 * The caller must have disabled interrupts. 504 * The caller must have disabled interrupts.
457 */ 505 */
458static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) 506static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
507 int user)
459{ 508{
460 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ 509 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
461 atomic_inc(&rdtp->dynticks); 510 atomic_inc(&rdtp->dynticks);
@@ -464,7 +513,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
464 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 513 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
465 rcu_cleanup_after_idle(smp_processor_id()); 514 rcu_cleanup_after_idle(smp_processor_id());
466 trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); 515 trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting);
467 if (!is_idle_task(current)) { 516 if (!user && !is_idle_task(current)) {
468 struct task_struct *idle = idle_task(smp_processor_id()); 517 struct task_struct *idle = idle_task(smp_processor_id());
469 518
470 trace_rcu_dyntick("Error on exit: not idle task", 519 trace_rcu_dyntick("Error on exit: not idle task",
@@ -476,6 +525,25 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
476 } 525 }
477} 526}
478 527
528/*
529 * Exit an RCU extended quiescent state, which can be either the
530 * idle loop or adaptive-tickless usermode execution.
531 */
532static void rcu_eqs_exit(bool user)
533{
534 struct rcu_dynticks *rdtp;
535 long long oldval;
536
537 rdtp = &__get_cpu_var(rcu_dynticks);
538 oldval = rdtp->dynticks_nesting;
539 WARN_ON_ONCE(oldval < 0);
540 if (oldval & DYNTICK_TASK_NEST_MASK)
541 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
542 else
543 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
544 rcu_eqs_exit_common(rdtp, oldval, user);
545}
546
479/** 547/**
480 * rcu_idle_exit - inform RCU that current CPU is leaving idle 548 * rcu_idle_exit - inform RCU that current CPU is leaving idle
481 * 549 *
@@ -490,21 +558,67 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
490void rcu_idle_exit(void) 558void rcu_idle_exit(void)
491{ 559{
492 unsigned long flags; 560 unsigned long flags;
561
562 local_irq_save(flags);
563 rcu_eqs_exit(false);
564 local_irq_restore(flags);
565}
566EXPORT_SYMBOL_GPL(rcu_idle_exit);
567
568#ifdef CONFIG_RCU_USER_QS
569/**
570 * rcu_user_exit - inform RCU that we are exiting userspace.
571 *
572 * Exit RCU idle mode while entering the kernel because it can
573 * run an RCU read-side critical section at any time.
574 */
575void rcu_user_exit(void)
576{
577 unsigned long flags;
493 struct rcu_dynticks *rdtp; 578 struct rcu_dynticks *rdtp;
494 long long oldval; 579
580 /*
581 * Some contexts may involve an exception occurring in an irq,
582 * leading to that nesting:
583 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
584 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
585 * helpers are enough to protect RCU uses inside the exception. So
586 * just return immediately if we detect we are in an IRQ.
587 */
588 if (in_interrupt())
589 return;
495 590
496 local_irq_save(flags); 591 local_irq_save(flags);
497 rdtp = &__get_cpu_var(rcu_dynticks); 592 rdtp = &__get_cpu_var(rcu_dynticks);
498 oldval = rdtp->dynticks_nesting; 593 if (rdtp->in_user) {
499 WARN_ON_ONCE(oldval < 0); 594 rdtp->in_user = false;
500 if (oldval & DYNTICK_TASK_NEST_MASK) 595 rcu_eqs_exit(true);
501 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; 596 }
502 else
503 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
504 rcu_idle_exit_common(rdtp, oldval);
505 local_irq_restore(flags); 597 local_irq_restore(flags);
506} 598}
507EXPORT_SYMBOL_GPL(rcu_idle_exit); 599
600/**
601 * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace
602 * idle mode after the current non-nesting irq returns.
603 *
604 * This is similar to rcu_user_exit() but in the context of an irq.
605 * This is called when the irq has interrupted a userspace RCU idle mode
606 * context. When the current non-nesting interrupt returns after this call,
607 * the CPU won't restore the RCU idle mode.
608 */
609void rcu_user_exit_after_irq(void)
610{
611 unsigned long flags;
612 struct rcu_dynticks *rdtp;
613
614 local_irq_save(flags);
615 rdtp = &__get_cpu_var(rcu_dynticks);
616 /* Ensure we are interrupting an RCU idle mode. */
617 WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK);
618 rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE;
619 local_irq_restore(flags);
620}
621#endif /* CONFIG_RCU_USER_QS */
508 622
509/** 623/**
510 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle 624 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
@@ -539,7 +653,7 @@ void rcu_irq_enter(void)
539 if (oldval) 653 if (oldval)
540 trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); 654 trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting);
541 else 655 else
542 rcu_idle_exit_common(rdtp, oldval); 656 rcu_eqs_exit_common(rdtp, oldval, true);
543 local_irq_restore(flags); 657 local_irq_restore(flags);
544} 658}
545 659
@@ -603,6 +717,21 @@ int rcu_is_cpu_idle(void)
603} 717}
604EXPORT_SYMBOL(rcu_is_cpu_idle); 718EXPORT_SYMBOL(rcu_is_cpu_idle);
605 719
720#ifdef CONFIG_RCU_USER_QS
721void rcu_user_hooks_switch(struct task_struct *prev,
722 struct task_struct *next)
723{
724 struct rcu_dynticks *rdtp;
725
726 /* Interrupts are disabled in context switch */
727 rdtp = &__get_cpu_var(rcu_dynticks);
728 if (!rdtp->ignore_user_qs) {
729 clear_tsk_thread_flag(prev, TIF_NOHZ);
730 set_tsk_thread_flag(next, TIF_NOHZ);
731 }
732}
733#endif /* #ifdef CONFIG_RCU_USER_QS */
734
606#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) 735#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
607 736
608/* 737/*
@@ -673,7 +802,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
673 * Return true if the specified CPU has passed through a quiescent 802 * Return true if the specified CPU has passed through a quiescent
674 * state by virtue of being in or having passed through a dynticks 803 * state by virtue of being in or having passed through a dynticks
675 * idle state since the last call to dyntick_save_progress_counter() 804 * idle state since the last call to dyntick_save_progress_counter()
676 * for this same CPU. 805 * for this same CPU, or by virtue of having been offline.
677 */ 806 */
678static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 807static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
679{ 808{
@@ -697,8 +826,26 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
697 return 1; 826 return 1;
698 } 827 }
699 828
700 /* Go check for the CPU being offline. */ 829 /*
701 return rcu_implicit_offline_qs(rdp); 830 * Check for the CPU being offline, but only if the grace period
831 * is old enough. We don't need to worry about the CPU changing
832 * state: If we see it offline even once, it has been through a
833 * quiescent state.
834 *
835 * The reason for insisting that the grace period be at least
836 * one jiffy old is that CPUs that are not quite online and that
837 * have just gone offline can still execute RCU read-side critical
838 * sections.
839 */
840 if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies))
841 return 0; /* Grace period is not old enough. */
842 barrier();
843 if (cpu_is_offline(rdp->cpu)) {
844 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
845 rdp->offline_fqs++;
846 return 1;
847 }
848 return 0;
702} 849}
703 850
704static int jiffies_till_stall_check(void) 851static int jiffies_till_stall_check(void)
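The dyntick checks above depend on the ->dynticks counter protocol: the counter is kept even while the CPU is in an extended quiescent state and odd while it is not, so the grace-period machinery can snapshot it once (dyntick_save_progress_counter()) and later credit the CPU with a quiescent state if the value was even or has moved on. A self-contained userspace model of that protocol; this illustrates the counting scheme only, not the kernel's rcu_dynticks structure:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int dynticks = 1;         /* odd: the "CPU" is active (not in EQS) */

static void eqs_enter(void) { atomic_fetch_add(&dynticks, 1); }  /* -> even */
static void eqs_exit(void)  { atomic_fetch_add(&dynticks, 1); }  /* -> odd  */

/* Phase 1: remember the counter (dyntick_save_progress_counter()). */
static int save_snapshot(void) { return atomic_load(&dynticks); }

/* Phase 2: the CPU passed a quiescent state if it is in EQS right now
 * (even value) or has been through one since the snapshot (value moved). */
static bool passed_qs(int snap)
{
        int cur = atomic_load(&dynticks);
        return (cur & 1) == 0 || cur != snap;
}

int main(void)
{
        int snap = save_snapshot();
        printf("immediately: %d\n", passed_qs(snap));   /* 0: still active */
        eqs_enter();                                    /* e.g. goes idle  */
        eqs_exit();
        printf("after idle:  %d\n", passed_qs(snap));   /* 1: QS observed  */
        return 0;
}

In the kernel the same idea is expressed with atomic_inc()/atomic_read() plus memory barriers; the model leaves the ordering details out.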
@@ -755,14 +902,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
755 rcu_for_each_leaf_node(rsp, rnp) { 902 rcu_for_each_leaf_node(rsp, rnp) {
756 raw_spin_lock_irqsave(&rnp->lock, flags); 903 raw_spin_lock_irqsave(&rnp->lock, flags);
757 ndetected += rcu_print_task_stall(rnp); 904 ndetected += rcu_print_task_stall(rnp);
905 if (rnp->qsmask != 0) {
906 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
907 if (rnp->qsmask & (1UL << cpu)) {
908 print_cpu_stall_info(rsp,
909 rnp->grplo + cpu);
910 ndetected++;
911 }
912 }
758 raw_spin_unlock_irqrestore(&rnp->lock, flags); 913 raw_spin_unlock_irqrestore(&rnp->lock, flags);
759 if (rnp->qsmask == 0)
760 continue;
761 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
762 if (rnp->qsmask & (1UL << cpu)) {
763 print_cpu_stall_info(rsp, rnp->grplo + cpu);
764 ndetected++;
765 }
766 } 914 }
767 915
768 /* 916 /*
@@ -782,11 +930,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
782 else if (!trigger_all_cpu_backtrace()) 930 else if (!trigger_all_cpu_backtrace())
783 dump_stack(); 931 dump_stack();
784 932
785 /* If so configured, complain about tasks blocking the grace period. */ 933 /* Complain about tasks blocking the grace period. */
786 934
787 rcu_print_detail_task_stall(rsp); 935 rcu_print_detail_task_stall(rsp);
788 936
789 force_quiescent_state(rsp, 0); /* Kick them all. */ 937 force_quiescent_state(rsp); /* Kick them all. */
790} 938}
791 939
792static void print_cpu_stall(struct rcu_state *rsp) 940static void print_cpu_stall(struct rcu_state *rsp)
@@ -827,7 +975,8 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
827 j = ACCESS_ONCE(jiffies); 975 j = ACCESS_ONCE(jiffies);
828 js = ACCESS_ONCE(rsp->jiffies_stall); 976 js = ACCESS_ONCE(rsp->jiffies_stall);
829 rnp = rdp->mynode; 977 rnp = rdp->mynode;
830 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { 978 if (rcu_gp_in_progress(rsp) &&
979 (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
831 980
832 /* We haven't checked in, so go dump stack. */ 981 /* We haven't checked in, so go dump stack. */
833 print_cpu_stall(rsp); 982 print_cpu_stall(rsp);
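The stall check above compares jiffies values with ULONG_CMP_GE() rather than a plain >= so that the test survives counter wrap: the unsigned difference (a - b) is at most half the unsigned range exactly when a is at or past b. A standalone demonstration; the two macros are re-created here after the pattern of the kernel's rcupdate.h helpers:

#include <limits.h>
#include <stdio.h>

#define ULONG_CMP_GE(a, b)      (ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)      (ULONG_MAX / 2 < (a) - (b))

int main(void)
{
        unsigned long js = ULONG_MAX - 5;       /* stall deadline just before wrap */
        unsigned long j  = 10;                  /* "now", after jiffies wrapped    */

        printf("plain >=     : %d\n", j >= js);                 /* 0 -- wrong   */
        printf("ULONG_CMP_GE : %d\n", ULONG_CMP_GE(j, js));     /* 1 -- correct */
        printf("ULONG_CMP_LT : %d\n", ULONG_CMP_LT(js, j));     /* 1            */
        return 0;
}

With the deadline stored just before the wrap point, the naive comparison claims the deadline was never reached, while the wrap-safe form gets it right.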
@@ -889,12 +1038,8 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
889 */ 1038 */
890 rdp->gpnum = rnp->gpnum; 1039 rdp->gpnum = rnp->gpnum;
891 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); 1040 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
892 if (rnp->qsmask & rdp->grpmask) { 1041 rdp->passed_quiesce = 0;
893 rdp->qs_pending = 1; 1042 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
894 rdp->passed_quiesce = 0;
895 } else {
896 rdp->qs_pending = 0;
897 }
898 zero_cpu_stall_ticks(rdp); 1043 zero_cpu_stall_ticks(rdp);
899 } 1044 }
900} 1045}
@@ -974,10 +1119,13 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
974 * our behalf. Catch up with this state to avoid noting 1119 * our behalf. Catch up with this state to avoid noting
975 * spurious new grace periods. If another grace period 1120 * spurious new grace periods. If another grace period
976 * has started, then rnp->gpnum will have advanced, so 1121 * has started, then rnp->gpnum will have advanced, so
977 * we will detect this later on. 1122 * we will detect this later on. Of course, any quiescent
1123 * states we found for the old GP are now invalid.
978 */ 1124 */
979 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) 1125 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) {
980 rdp->gpnum = rdp->completed; 1126 rdp->gpnum = rdp->completed;
1127 rdp->passed_quiesce = 0;
1128 }
981 1129
982 /* 1130 /*
983 * If RCU does not need a quiescent state from this CPU, 1131 * If RCU does not need a quiescent state from this CPU,
@@ -1021,97 +1169,56 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
1021 /* Prior grace period ended, so advance callbacks for current CPU. */ 1169 /* Prior grace period ended, so advance callbacks for current CPU. */
1022 __rcu_process_gp_end(rsp, rnp, rdp); 1170 __rcu_process_gp_end(rsp, rnp, rdp);
1023 1171
1024 /*
1025 * Because this CPU just now started the new grace period, we know
1026 * that all of its callbacks will be covered by this upcoming grace
1027 * period, even the ones that were registered arbitrarily recently.
1028 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
1029 *
1030 * Other CPUs cannot be sure exactly when the grace period started.
1031 * Therefore, their recently registered callbacks must pass through
1032 * an additional RCU_NEXT_READY stage, so that they will be handled
1033 * by the next RCU grace period.
1034 */
1035 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1036 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1037
1038 /* Set state so that this CPU will detect the next quiescent state. */ 1172 /* Set state so that this CPU will detect the next quiescent state. */
1039 __note_new_gpnum(rsp, rnp, rdp); 1173 __note_new_gpnum(rsp, rnp, rdp);
1040} 1174}
1041 1175
1042/* 1176/*
1043 * Start a new RCU grace period if warranted, re-initializing the hierarchy 1177 * Initialize a new grace period.
1044 * in preparation for detecting the next grace period. The caller must hold
1045 * the root node's ->lock, which is released before return. Hard irqs must
1046 * be disabled.
1047 *
1048 * Note that it is legal for a dying CPU (which is marked as offline) to
1049 * invoke this function. This can happen when the dying CPU reports its
1050 * quiescent state.
1051 */ 1178 */
1052static void 1179static int rcu_gp_init(struct rcu_state *rsp)
1053rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
1054 __releases(rcu_get_root(rsp)->lock)
1055{ 1180{
1056 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1181 struct rcu_data *rdp;
1057 struct rcu_node *rnp = rcu_get_root(rsp); 1182 struct rcu_node *rnp = rcu_get_root(rsp);
1058 1183
1059 if (!rcu_scheduler_fully_active || 1184 raw_spin_lock_irq(&rnp->lock);
1060 !cpu_needs_another_gp(rsp, rdp)) { 1185 rsp->gp_flags = 0; /* Clear all flags: New grace period. */
1061 /*
1062 * Either the scheduler hasn't yet spawned the first
1063 * non-idle task or this CPU does not need another
1064 * grace period. Either way, don't start a new grace
1065 * period.
1066 */
1067 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1068 return;
1069 }
1070 1186
1071 if (rsp->fqs_active) { 1187 if (rcu_gp_in_progress(rsp)) {
1072 /* 1188 /* Grace period already in progress, don't start another. */
1073 * This CPU needs a grace period, but force_quiescent_state() 1189 raw_spin_unlock_irq(&rnp->lock);
1074 * is running. Tell it to start one on this CPU's behalf. 1190 return 0;
1075 */
1076 rsp->fqs_need_gp = 1;
1077 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1078 return;
1079 } 1191 }
1080 1192
1081 /* Advance to a new grace period and initialize state. */ 1193 /* Advance to a new grace period and initialize state. */
1082 rsp->gpnum++; 1194 rsp->gpnum++;
1083 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); 1195 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
1084 WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT);
1085 rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */
1086 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1087 record_gp_stall_check_time(rsp); 1196 record_gp_stall_check_time(rsp);
1088 raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ 1197 raw_spin_unlock_irq(&rnp->lock);
1089 1198
1090 /* Exclude any concurrent CPU-hotplug operations. */ 1199 /* Exclude any concurrent CPU-hotplug operations. */
1091 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1200 get_online_cpus();
1092 1201
1093 /* 1202 /*
1094 * Set the quiescent-state-needed bits in all the rcu_node 1203 * Set the quiescent-state-needed bits in all the rcu_node
1095 * structures for all currently online CPUs in breadth-first 1204 * structures for all currently online CPUs in breadth-first order,
1096 * order, starting from the root rcu_node structure. This 1205 * starting from the root rcu_node structure, relying on the layout
1097 * operation relies on the layout of the hierarchy within the 1206 * of the tree within the rsp->node[] array. Note that other CPUs
1098 * rsp->node[] array. Note that other CPUs will access only 1207 * will access only the leaves of the hierarchy, thus seeing that no
1099 * the leaves of the hierarchy, which still indicate that no
1100 * grace period is in progress, at least until the corresponding 1208 * grace period is in progress, at least until the corresponding
1101 * leaf node has been initialized. In addition, we have excluded 1209 * leaf node has been initialized. In addition, we have excluded
1102 * CPU-hotplug operations. 1210 * CPU-hotplug operations.
1103 * 1211 *
1104 * Note that the grace period cannot complete until we finish 1212 * The grace period cannot complete until the initialization
1105 * the initialization process, as there will be at least one 1213 * process finishes, because this kthread handles both.
1106 * qsmask bit set in the root node until that time, namely the
1107 * one corresponding to this CPU, due to the fact that we have
1108 * irqs disabled.
1109 */ 1214 */
1110 rcu_for_each_node_breadth_first(rsp, rnp) { 1215 rcu_for_each_node_breadth_first(rsp, rnp) {
1111 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1216 raw_spin_lock_irq(&rnp->lock);
1217 rdp = this_cpu_ptr(rsp->rda);
1112 rcu_preempt_check_blocked_tasks(rnp); 1218 rcu_preempt_check_blocked_tasks(rnp);
1113 rnp->qsmask = rnp->qsmaskinit; 1219 rnp->qsmask = rnp->qsmaskinit;
1114 rnp->gpnum = rsp->gpnum; 1220 rnp->gpnum = rsp->gpnum;
1221 WARN_ON_ONCE(rnp->completed != rsp->completed);
1115 rnp->completed = rsp->completed; 1222 rnp->completed = rsp->completed;
1116 if (rnp == rdp->mynode) 1223 if (rnp == rdp->mynode)
1117 rcu_start_gp_per_cpu(rsp, rnp, rdp); 1224 rcu_start_gp_per_cpu(rsp, rnp, rdp);
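rcu_gp_init() above walks the tree with rcu_for_each_node_breadth_first(), which, as the comment notes, is just a linear scan of rsp->node[]: the rcu_node tree is stored level by level in a single array, so index order is breadth-first order and every internal node is initialized before the leaves that other CPUs actually look at. A compact userspace model of that layout; the seven-node geometry here is made up, the real shape comes from RCU_NUM_LVLS and NUM_RCU_LVL_*:

#include <stdio.h>

#define NODES 7                         /* 1 root + 2 internal + 4 leaves */

struct node {
        int level;
        int gpnum;                      /* stand-in for ->gpnum */
};

/* Level-by-level layout: index 0 is the root, 1-2 the internal row,
 * 3-6 the leaves.  A plain index walk is therefore breadth-first. */
static struct node tree[NODES] = {
        { .level = 0 },
        { .level = 1 }, { .level = 1 },
        { .level = 2 }, { .level = 2 }, { .level = 2 }, { .level = 2 },
};

int main(void)
{
        int new_gp = 1;

        for (int i = 0; i < NODES; i++) {       /* "rcu_for_each_node_breadth_first" */
                tree[i].gpnum = new_gp;         /* parents updated before leaves */
                printf("node %d (level %d) now sees gpnum %d\n",
                       i, tree[i].level, tree[i].gpnum);
        }
        return 0;
}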
@@ -1119,37 +1226,54 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
1119 trace_rcu_grace_period_init(rsp->name, rnp->gpnum, 1226 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
1120 rnp->level, rnp->grplo, 1227 rnp->level, rnp->grplo,
1121 rnp->grphi, rnp->qsmask); 1228 rnp->grphi, rnp->qsmask);
1122 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1229 raw_spin_unlock_irq(&rnp->lock);
1230#ifdef CONFIG_PROVE_RCU_DELAY
1231 if ((random32() % (rcu_num_nodes * 8)) == 0)
1232 schedule_timeout_uninterruptible(2);
1233#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
1234 cond_resched();
1123 } 1235 }
1124 1236
1125 rnp = rcu_get_root(rsp); 1237 put_online_cpus();
1126 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1238 return 1;
1127 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
1128 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1129 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1130} 1239}
1131 1240
1132/* 1241/*
1133 * Report a full set of quiescent states to the specified rcu_state 1242 * Do one round of quiescent-state forcing.
1134 * data structure. This involves cleaning up after the prior grace
1135 * period and letting rcu_start_gp() start up the next grace period
1136 * if one is needed. Note that the caller must hold rnp->lock, as
1137 * required by rcu_start_gp(), which will release it.
1138 */ 1243 */
1139static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 1244int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1140 __releases(rcu_get_root(rsp)->lock)
1141{ 1245{
1142 unsigned long gp_duration; 1246 int fqs_state = fqs_state_in;
1143 struct rcu_node *rnp = rcu_get_root(rsp); 1247 struct rcu_node *rnp = rcu_get_root(rsp);
1144 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1145 1248
1146 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 1249 rsp->n_force_qs++;
1250 if (fqs_state == RCU_SAVE_DYNTICK) {
1251 /* Collect dyntick-idle snapshots. */
1252 force_qs_rnp(rsp, dyntick_save_progress_counter);
1253 fqs_state = RCU_FORCE_QS;
1254 } else {
1255 /* Handle dyntick-idle and offline CPUs. */
1256 force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
1257 }
1258 /* Clear flag to prevent immediate re-entry. */
1259 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
1260 raw_spin_lock_irq(&rnp->lock);
1261 rsp->gp_flags &= ~RCU_GP_FLAG_FQS;
1262 raw_spin_unlock_irq(&rnp->lock);
1263 }
1264 return fqs_state;
1265}
1147 1266
1148 /* 1267/*
1149 * Ensure that all grace-period and pre-grace-period activity 1268 * Clean up after the old grace period.
1150 * is seen before the assignment to rsp->completed. 1269 */
1151 */ 1270static void rcu_gp_cleanup(struct rcu_state *rsp)
1152 smp_mb(); /* See above block comment. */ 1271{
1272 unsigned long gp_duration;
1273 struct rcu_data *rdp;
1274 struct rcu_node *rnp = rcu_get_root(rsp);
1275
1276 raw_spin_lock_irq(&rnp->lock);
1153 gp_duration = jiffies - rsp->gp_start; 1277 gp_duration = jiffies - rsp->gp_start;
1154 if (gp_duration > rsp->gp_max) 1278 if (gp_duration > rsp->gp_max)
1155 rsp->gp_max = gp_duration; 1279 rsp->gp_max = gp_duration;
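Another consequence of moving this work into a kthread: rcu_gp_init() can now block, so it excludes CPU hotplug with get_online_cpus()/put_online_cpus() (a sleepable guard) instead of the old ->onofflock spinlock, and it can cond_resched() between nodes. A short hedged sketch of that guard with a hypothetical walker function:

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/kernel.h>
#include <linux/sched.h>

static void demo_walk_online_cpus(void)
{
        int cpu;

        get_online_cpus();              /* no CPU may come or go until we drop this */
        for_each_online_cpu(cpu) {
                pr_info("visiting cpu %d\n", cpu);
                cond_resched();         /* sleeping here is fine, unlike under a spinlock */
        }
        put_online_cpus();
}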
@@ -1161,35 +1285,149 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
1161 * they can do to advance the grace period. It is therefore 1285 * they can do to advance the grace period. It is therefore
1162 * safe for us to drop the lock in order to mark the grace 1286 * safe for us to drop the lock in order to mark the grace
1163 * period as completed in all of the rcu_node structures. 1287 * period as completed in all of the rcu_node structures.
1164 *
1165 * But if this CPU needs another grace period, it will take
1166 * care of this while initializing the next grace period.
1167 * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL
1168 * because the callbacks have not yet been advanced: Those
1169 * callbacks are waiting on the grace period that just now
1170 * completed.
1171 */ 1288 */
1172 if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { 1289 raw_spin_unlock_irq(&rnp->lock);
1173 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1174 1290
1175 /* 1291 /*
1176 * Propagate new ->completed value to rcu_node structures 1292 * Propagate new ->completed value to rcu_node structures so
1177 * so that other CPUs don't have to wait until the start 1293 * that other CPUs don't have to wait until the start of the next
1178 * of the next grace period to process their callbacks. 1294 * grace period to process their callbacks. This also avoids
1179 */ 1295 * some nasty RCU grace-period initialization races by forcing
1180 rcu_for_each_node_breadth_first(rsp, rnp) { 1296 * the end of the current grace period to be completely recorded in
1181 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1297 * all of the rcu_node structures before the beginning of the next
1182 rnp->completed = rsp->gpnum; 1298 * grace period is recorded in any of the rcu_node structures.
1183 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1299 */
1184 } 1300 rcu_for_each_node_breadth_first(rsp, rnp) {
1185 rnp = rcu_get_root(rsp); 1301 raw_spin_lock_irq(&rnp->lock);
1186 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1302 rnp->completed = rsp->gpnum;
1303 raw_spin_unlock_irq(&rnp->lock);
1304 cond_resched();
1187 } 1305 }
1306 rnp = rcu_get_root(rsp);
1307 raw_spin_lock_irq(&rnp->lock);
1188 1308
1189 rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ 1309 rsp->completed = rsp->gpnum; /* Declare grace period done. */
1190 trace_rcu_grace_period(rsp->name, rsp->completed, "end"); 1310 trace_rcu_grace_period(rsp->name, rsp->completed, "end");
1191 rsp->fqs_state = RCU_GP_IDLE; 1311 rsp->fqs_state = RCU_GP_IDLE;
1192 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 1312 rdp = this_cpu_ptr(rsp->rda);
1313 if (cpu_needs_another_gp(rsp, rdp))
1314 rsp->gp_flags = 1;
1315 raw_spin_unlock_irq(&rnp->lock);
1316}
1317
1318/*
1319 * Body of kthread that handles grace periods.
1320 */
1321static int __noreturn rcu_gp_kthread(void *arg)
1322{
1323 int fqs_state;
1324 unsigned long j;
1325 int ret;
1326 struct rcu_state *rsp = arg;
1327 struct rcu_node *rnp = rcu_get_root(rsp);
1328
1329 for (;;) {
1330
1331 /* Handle grace-period start. */
1332 for (;;) {
1333 wait_event_interruptible(rsp->gp_wq,
1334 rsp->gp_flags &
1335 RCU_GP_FLAG_INIT);
1336 if ((rsp->gp_flags & RCU_GP_FLAG_INIT) &&
1337 rcu_gp_init(rsp))
1338 break;
1339 cond_resched();
1340 flush_signals(current);
1341 }
1342
1343 /* Handle quiescent-state forcing. */
1344 fqs_state = RCU_SAVE_DYNTICK;
1345 j = jiffies_till_first_fqs;
1346 if (j > HZ) {
1347 j = HZ;
1348 jiffies_till_first_fqs = HZ;
1349 }
1350 for (;;) {
1351 rsp->jiffies_force_qs = jiffies + j;
1352 ret = wait_event_interruptible_timeout(rsp->gp_wq,
1353 (rsp->gp_flags & RCU_GP_FLAG_FQS) ||
1354 (!ACCESS_ONCE(rnp->qsmask) &&
1355 !rcu_preempt_blocked_readers_cgp(rnp)),
1356 j);
1357 /* If grace period done, leave loop. */
1358 if (!ACCESS_ONCE(rnp->qsmask) &&
1359 !rcu_preempt_blocked_readers_cgp(rnp))
1360 break;
1361 /* If time for quiescent-state forcing, do it. */
1362 if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) {
1363 fqs_state = rcu_gp_fqs(rsp, fqs_state);
1364 cond_resched();
1365 } else {
1366 /* Deal with stray signal. */
1367 cond_resched();
1368 flush_signals(current);
1369 }
1370 j = jiffies_till_next_fqs;
1371 if (j > HZ) {
1372 j = HZ;
1373 jiffies_till_next_fqs = HZ;
1374 } else if (j < 1) {
1375 j = 1;
1376 jiffies_till_next_fqs = 1;
1377 }
1378 }
1379
1380 /* Handle grace-period end. */
1381 rcu_gp_cleanup(rsp);
1382 }
1383}
1384
1385/*
1386 * Start a new RCU grace period if warranted, re-initializing the hierarchy
1387 * in preparation for detecting the next grace period. The caller must hold
1388 * the root node's ->lock, which is released before return. Hard irqs must
1389 * be disabled.
1390 *
1391 * Note that it is legal for a dying CPU (which is marked as offline) to
1392 * invoke this function. This can happen when the dying CPU reports its
1393 * quiescent state.
1394 */
1395static void
1396rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
1397 __releases(rcu_get_root(rsp)->lock)
1398{
1399 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1400 struct rcu_node *rnp = rcu_get_root(rsp);
1401
1402 if (!rsp->gp_kthread ||
1403 !cpu_needs_another_gp(rsp, rdp)) {
1404 /*
1405 * Either we have not yet spawned the grace-period
1406 * task or this CPU does not need another grace period.
1407 * Either way, don't start a new grace period.
1408 */
1409 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1410 return;
1411 }
1412
1413 rsp->gp_flags = RCU_GP_FLAG_INIT;
1414 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1415 wake_up(&rsp->gp_wq);
1416}
1417
1418/*
1419 * Report a full set of quiescent states to the specified rcu_state
1420 * data structure. This involves cleaning up after the prior grace
1421 * period and letting rcu_start_gp() start up the next grace period
1422 * if one is needed. Note that the caller must hold rnp->lock, as
1423 * required by rcu_start_gp(), which will release it.
1424 */
1425static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
1426 __releases(rcu_get_root(rsp)->lock)
1427{
1428 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
1429 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
1430 wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */
1193} 1431}
1194 1432
1195/* 1433/*
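After this restructuring, starting a grace period is a flag-plus-waitqueue handshake: rcu_start_gp() sets RCU_GP_FLAG_INIT in rsp->gp_flags and wakes rsp->gp_wq, and rcu_gp_kthread() sleeps in wait_event_interruptible() until the flag shows up, re-checking after signals or spurious wakeups. A minimal sketch of the same handshake with hypothetical demo_* names; the locking and ordering that the real code gets from rnp->lock are omitted:

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static unsigned long demo_flags;        /* bit 0: "please run one grace period" */

static int demo_gp_kthread(void *unused)
{
        while (!kthread_should_stop()) {
                /* Sleep until a request arrives (or we are asked to stop). */
                wait_event_interruptible(demo_wq,
                                         (demo_flags & 0x1) ||
                                         kthread_should_stop());
                if (!(demo_flags & 0x1))
                        continue;       /* stop request, signal, or spurious wakeup */
                demo_flags &= ~0x1;
                /* ... initialize, force quiescent states, clean up ... */
        }
        return 0;
}

static void demo_request_gp(void)
{
        demo_flags |= 0x1;
        wake_up(&demo_wq);      /* memory barrier implied by the wake_up() path */
}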
@@ -1258,7 +1496,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
1258 * based on quiescent states detected in an earlier grace period! 1496 * based on quiescent states detected in an earlier grace period!
1259 */ 1497 */
1260static void 1498static void
1261rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp) 1499rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
1262{ 1500{
1263 unsigned long flags; 1501 unsigned long flags;
1264 unsigned long mask; 1502 unsigned long mask;
@@ -1266,7 +1504,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
1266 1504
1267 rnp = rdp->mynode; 1505 rnp = rdp->mynode;
1268 raw_spin_lock_irqsave(&rnp->lock, flags); 1506 raw_spin_lock_irqsave(&rnp->lock, flags);
1269 if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) { 1507 if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum ||
1508 rnp->completed == rnp->gpnum) {
1270 1509
1271 /* 1510 /*
1272 * The grace period in which this quiescent state was 1511 * The grace period in which this quiescent state was
@@ -1325,7 +1564,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1325 * Tell RCU we are done (but rcu_report_qs_rdp() will be the 1564 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
1326 * judge of that). 1565 * judge of that).
1327 */ 1566 */
1328 rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum); 1567 rcu_report_qs_rdp(rdp->cpu, rsp, rdp);
1329} 1568}
1330 1569
1331#ifdef CONFIG_HOTPLUG_CPU 1570#ifdef CONFIG_HOTPLUG_CPU
@@ -1390,17 +1629,6 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1390 int i; 1629 int i;
1391 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 1630 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
1392 1631
1393 /*
1394 * If there is an rcu_barrier() operation in progress, then
1395 * only the task doing that operation is permitted to adopt
1396 * callbacks. To do otherwise breaks rcu_barrier() and friends
1397 * by causing them to fail to wait for the callbacks in the
1398 * orphanage.
1399 */
1400 if (rsp->rcu_barrier_in_progress &&
1401 rsp->rcu_barrier_in_progress != current)
1402 return;
1403
1404 /* Do the accounting first. */ 1632 /* Do the accounting first. */
1405 rdp->qlen_lazy += rsp->qlen_lazy; 1633 rdp->qlen_lazy += rsp->qlen_lazy;
1406 rdp->qlen += rsp->qlen; 1634 rdp->qlen += rsp->qlen;
@@ -1455,9 +1683,8 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1455 * The CPU has been completely removed, and some other CPU is reporting 1683 * The CPU has been completely removed, and some other CPU is reporting
1456 * this fact from process context. Do the remainder of the cleanup, 1684 * this fact from process context. Do the remainder of the cleanup,
1457 * including orphaning the outgoing CPU's RCU callbacks, and also 1685 * including orphaning the outgoing CPU's RCU callbacks, and also
1458 * adopting them, if there is no _rcu_barrier() instance running. 1686 * adopting them. There can only be one CPU hotplug operation at a time,
1459 * There can only be one CPU hotplug operation at a time, so no other 1687 * so no other CPU can be attempting to update rcu_cpu_kthread_task.
1460 * CPU can be attempting to update rcu_cpu_kthread_task.
1461 */ 1688 */
1462static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) 1689static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1463{ 1690{
@@ -1468,8 +1695,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1468 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ 1695 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
1469 1696
1470 /* Adjust any no-longer-needed kthreads. */ 1697 /* Adjust any no-longer-needed kthreads. */
1471 rcu_stop_cpu_kthread(cpu); 1698 rcu_boost_kthread_setaffinity(rnp, -1);
1472 rcu_node_kthread_setaffinity(rnp, -1);
1473 1699
1474 /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ 1700 /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
1475 1701
@@ -1515,14 +1741,13 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1515 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, 1741 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
1516 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", 1742 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
1517 cpu, rdp->qlen, rdp->nxtlist); 1743 cpu, rdp->qlen, rdp->nxtlist);
1744 init_callback_list(rdp);
1745 /* Disallow further callbacks on this CPU. */
1746 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
1518} 1747}
1519 1748
1520#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1749#else /* #ifdef CONFIG_HOTPLUG_CPU */
1521 1750
1522static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1523{
1524}
1525
1526static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 1751static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1527{ 1752{
1528} 1753}
@@ -1687,6 +1912,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1687 struct rcu_node *rnp; 1912 struct rcu_node *rnp;
1688 1913
1689 rcu_for_each_leaf_node(rsp, rnp) { 1914 rcu_for_each_leaf_node(rsp, rnp) {
1915 cond_resched();
1690 mask = 0; 1916 mask = 0;
1691 raw_spin_lock_irqsave(&rnp->lock, flags); 1917 raw_spin_lock_irqsave(&rnp->lock, flags);
1692 if (!rcu_gp_in_progress(rsp)) { 1918 if (!rcu_gp_in_progress(rsp)) {
@@ -1723,72 +1949,39 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1723 * Force quiescent states on reluctant CPUs, and also detect which 1949 * Force quiescent states on reluctant CPUs, and also detect which
1724 * CPUs are in dyntick-idle mode. 1950 * CPUs are in dyntick-idle mode.
1725 */ 1951 */
1726static void force_quiescent_state(struct rcu_state *rsp, int relaxed) 1952static void force_quiescent_state(struct rcu_state *rsp)
1727{ 1953{
1728 unsigned long flags; 1954 unsigned long flags;
1729 struct rcu_node *rnp = rcu_get_root(rsp); 1955 bool ret;
1730 1956 struct rcu_node *rnp;
1731 trace_rcu_utilization("Start fqs"); 1957 struct rcu_node *rnp_old = NULL;
1732 if (!rcu_gp_in_progress(rsp)) { 1958
1733 trace_rcu_utilization("End fqs"); 1959 /* Funnel through hierarchy to reduce memory contention. */
1734 return; /* No grace period in progress, nothing to force. */ 1960 rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;
1735 } 1961 for (; rnp != NULL; rnp = rnp->parent) {
1736 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { 1962 ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
1737 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ 1963 !raw_spin_trylock(&rnp->fqslock);
1738 trace_rcu_utilization("End fqs"); 1964 if (rnp_old != NULL)
1739 return; /* Someone else is already on the job. */ 1965 raw_spin_unlock(&rnp_old->fqslock);
1740 } 1966 if (ret) {
1741 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) 1967 rsp->n_force_qs_lh++;
1742 goto unlock_fqs_ret; /* no emergency and done recently. */ 1968 return;
1743 rsp->n_force_qs++; 1969 }
1744 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 1970 rnp_old = rnp;
1745 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1746 if(!rcu_gp_in_progress(rsp)) {
1747 rsp->n_force_qs_ngp++;
1748 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1749 goto unlock_fqs_ret; /* no GP in progress, time updated. */
1750 }
1751 rsp->fqs_active = 1;
1752 switch (rsp->fqs_state) {
1753 case RCU_GP_IDLE:
1754 case RCU_GP_INIT:
1755
1756 break; /* grace period idle or initializing, ignore. */
1757
1758 case RCU_SAVE_DYNTICK:
1759
1760 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1761
1762 /* Record dyntick-idle state. */
1763 force_qs_rnp(rsp, dyntick_save_progress_counter);
1764 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1765 if (rcu_gp_in_progress(rsp))
1766 rsp->fqs_state = RCU_FORCE_QS;
1767 break;
1768
1769 case RCU_FORCE_QS:
1770
1771 /* Check dyntick-idle state, send IPI to laggarts. */
1772 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1773 force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
1774
1775 /* Leave state in case more forcing is required. */
1776
1777 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1778 break;
1779 } 1971 }
1780 rsp->fqs_active = 0; 1972 /* rnp_old == rcu_get_root(rsp), rnp == NULL. */
1781 if (rsp->fqs_need_gp) { 1973
1782 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ 1974 /* Reached the root of the rcu_node tree, acquire lock. */
1783 rsp->fqs_need_gp = 0; 1975 raw_spin_lock_irqsave(&rnp_old->lock, flags);
1784 rcu_start_gp(rsp, flags); /* releases rnp->lock */ 1976 raw_spin_unlock(&rnp_old->fqslock);
1785 trace_rcu_utilization("End fqs"); 1977 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
1786 return; 1978 rsp->n_force_qs_lh++;
1979 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
1980 return; /* Someone beat us to it. */
1787 } 1981 }
1788 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 1982 rsp->gp_flags |= RCU_GP_FLAG_FQS;
1789unlock_fqs_ret: 1983 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
1790 raw_spin_unlock_irqrestore(&rsp->fqslock, flags); 1984 wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */
1791 trace_rcu_utilization("End fqs");
1792} 1985}
1793 1986
1794/* 1987/*
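The rewritten force_quiescent_state() above uses funnel locking: a caller starts at its own leaf rcu_node, trylocks ->fqslock level by level toward the root, releases the lower lock as soon as it holds (or fails to take) the next one, and gives up entirely if a trylock fails or the FQS flag is already set, so at most one request per subtree ever reaches the root. A self-contained pthread model of the idea, reduced to a two-level tree; this illustrates the shape of the algorithm and is not kernel code:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define LEAVES 4

/* Index 0 is the root; 1..LEAVES are the leaves (two-level "tree"). */
static pthread_mutex_t fqslock[LEAVES + 1];
static atomic_bool fqs_requested;
static atomic_int  gave_up;

static void *request_fqs(void *arg)
{
        int leaf = 1 + (int)(long)arg;
        int path[2] = { leaf, 0 };              /* leaf first, then the root */
        int held = -1;

        for (int i = 0; i < 2; i++) {
                int ok = !atomic_load(&fqs_requested) &&
                         pthread_mutex_trylock(&fqslock[path[i]]) == 0;
                if (held >= 0)
                        pthread_mutex_unlock(&fqslock[held]);   /* drop the lower level */
                if (!ok) {
                        atomic_fetch_add(&gave_up, 1);  /* someone else will do it */
                        return NULL;
                }
                held = path[i];
        }
        atomic_store(&fqs_requested, true);     /* reached the root: set the flag */
        pthread_mutex_unlock(&fqslock[held]);
        return NULL;
}

int main(void)
{
        pthread_t tid[LEAVES];

        for (int i = 0; i <= LEAVES; i++)
                pthread_mutex_init(&fqslock[i], NULL);
        for (long i = 0; i < LEAVES; i++)
                pthread_create(&tid[i], NULL, request_fqs, (void *)i);
        for (int i = 0; i < LEAVES; i++)
                pthread_join(tid[i], NULL);

        printf("flag set: %d, requests that backed off: %d\n",
               atomic_load(&fqs_requested), atomic_load(&gave_up));
        return 0;
}

Built with -pthread, this should report the flag being set exactly once while the other requests back off early, which is the point: contention on the root lock stays bounded no matter how many CPUs ask at once.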
@@ -1805,13 +1998,6 @@ __rcu_process_callbacks(struct rcu_state *rsp)
1805 WARN_ON_ONCE(rdp->beenonline == 0); 1998 WARN_ON_ONCE(rdp->beenonline == 0);
1806 1999
1807 /* 2000 /*
1808 * If an RCU GP has gone long enough, go check for dyntick
1809 * idle CPUs and, if needed, send resched IPIs.
1810 */
1811 if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1812 force_quiescent_state(rsp, 1);
1813
1814 /*
1815 * Advance callbacks in response to end of earlier grace 2001 * Advance callbacks in response to end of earlier grace
1816 * period that some other CPU ended. 2002 * period that some other CPU ended.
1817 */ 2003 */
@@ -1838,6 +2024,8 @@ static void rcu_process_callbacks(struct softirq_action *unused)
1838{ 2024{
1839 struct rcu_state *rsp; 2025 struct rcu_state *rsp;
1840 2026
2027 if (cpu_is_offline(smp_processor_id()))
2028 return;
1841 trace_rcu_utilization("Start RCU core"); 2029 trace_rcu_utilization("Start RCU core");
1842 for_each_rcu_flavor(rsp) 2030 for_each_rcu_flavor(rsp)
1843 __rcu_process_callbacks(rsp); 2031 __rcu_process_callbacks(rsp);
@@ -1909,12 +2097,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
1909 rdp->blimit = LONG_MAX; 2097 rdp->blimit = LONG_MAX;
1910 if (rsp->n_force_qs == rdp->n_force_qs_snap && 2098 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1911 *rdp->nxttail[RCU_DONE_TAIL] != head) 2099 *rdp->nxttail[RCU_DONE_TAIL] != head)
1912 force_quiescent_state(rsp, 0); 2100 force_quiescent_state(rsp);
1913 rdp->n_force_qs_snap = rsp->n_force_qs; 2101 rdp->n_force_qs_snap = rsp->n_force_qs;
1914 rdp->qlen_last_fqs_check = rdp->qlen; 2102 rdp->qlen_last_fqs_check = rdp->qlen;
1915 } 2103 }
1916 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) 2104 }
1917 force_quiescent_state(rsp, 1);
1918} 2105}
1919 2106
1920static void 2107static void
@@ -1929,8 +2116,6 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1929 head->func = func; 2116 head->func = func;
1930 head->next = NULL; 2117 head->next = NULL;
1931 2118
1932 smp_mb(); /* Ensure RCU update seen before callback registry. */
1933
1934 /* 2119 /*
1935 * Opportunistically note grace-period endings and beginnings. 2120 * Opportunistically note grace-period endings and beginnings.
1936 * Note that we might see a beginning right after we see an 2121 * Note that we might see a beginning right after we see an
@@ -1941,6 +2126,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1941 rdp = this_cpu_ptr(rsp->rda); 2126 rdp = this_cpu_ptr(rsp->rda);
1942 2127
1943 /* Add the callback to our list. */ 2128 /* Add the callback to our list. */
2129 if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL)) {
2130 /* _call_rcu() is illegal on offline CPU; leak the callback. */
2131 WARN_ON_ONCE(1);
2132 local_irq_restore(flags);
2133 return;
2134 }
1944 ACCESS_ONCE(rdp->qlen)++; 2135 ACCESS_ONCE(rdp->qlen)++;
1945 if (lazy) 2136 if (lazy)
1946 rdp->qlen_lazy++; 2137 rdp->qlen_lazy++;
@@ -2195,17 +2386,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
2195 /* Is the RCU core waiting for a quiescent state from this CPU? */ 2386 /* Is the RCU core waiting for a quiescent state from this CPU? */
2196 if (rcu_scheduler_fully_active && 2387 if (rcu_scheduler_fully_active &&
2197 rdp->qs_pending && !rdp->passed_quiesce) { 2388 rdp->qs_pending && !rdp->passed_quiesce) {
2198
2199 /*
2200 * If force_quiescent_state() coming soon and this CPU
2201 * needs a quiescent state, and this is either RCU-sched
2202 * or RCU-bh, force a local reschedule.
2203 */
2204 rdp->n_rp_qs_pending++; 2389 rdp->n_rp_qs_pending++;
2205 if (!rdp->preemptible &&
2206 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
2207 jiffies))
2208 set_need_resched();
2209 } else if (rdp->qs_pending && rdp->passed_quiesce) { 2390 } else if (rdp->qs_pending && rdp->passed_quiesce) {
2210 rdp->n_rp_report_qs++; 2391 rdp->n_rp_report_qs++;
2211 return 1; 2392 return 1;
@@ -2235,13 +2416,6 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
2235 return 1; 2416 return 1;
2236 } 2417 }
2237 2418
2238 /* Has an RCU GP gone long enough to send resched IPIs &c? */
2239 if (rcu_gp_in_progress(rsp) &&
2240 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) {
2241 rdp->n_rp_need_fqs++;
2242 return 1;
2243 }
2244
2245 /* nothing to do */ 2419 /* nothing to do */
2246 rdp->n_rp_need_nothing++; 2420 rdp->n_rp_need_nothing++;
2247 return 0; 2421 return 0;
@@ -2326,13 +2500,10 @@ static void rcu_barrier_func(void *type)
2326static void _rcu_barrier(struct rcu_state *rsp) 2500static void _rcu_barrier(struct rcu_state *rsp)
2327{ 2501{
2328 int cpu; 2502 int cpu;
2329 unsigned long flags;
2330 struct rcu_data *rdp; 2503 struct rcu_data *rdp;
2331 struct rcu_data rd;
2332 unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done); 2504 unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
2333 unsigned long snap_done; 2505 unsigned long snap_done;
2334 2506
2335 init_rcu_head_on_stack(&rd.barrier_head);
2336 _rcu_barrier_trace(rsp, "Begin", -1, snap); 2507 _rcu_barrier_trace(rsp, "Begin", -1, snap);
2337 2508
2338 /* Take mutex to serialize concurrent rcu_barrier() requests. */ 2509 /* Take mutex to serialize concurrent rcu_barrier() requests. */
@@ -2372,70 +2543,30 @@ static void _rcu_barrier(struct rcu_state *rsp)
2372 /* 2543 /*
2373 * Initialize the count to one rather than to zero in order to 2544 * Initialize the count to one rather than to zero in order to
2374 * avoid a too-soon return to zero in case of a short grace period 2545 * avoid a too-soon return to zero in case of a short grace period
2375 * (or preemption of this task). Also flag this task as doing 2546 * (or preemption of this task). Exclude CPU-hotplug operations
2376 * an rcu_barrier(). This will prevent anyone else from adopting 2547 * to ensure that no offline CPU has callbacks queued.
2377 * orphaned callbacks, which could cause otherwise failure if a
2378 * CPU went offline and quickly came back online. To see this,
2379 * consider the following sequence of events:
2380 *
2381 * 1. We cause CPU 0 to post an rcu_barrier_callback() callback.
2382 * 2. CPU 1 goes offline, orphaning its callbacks.
2383 * 3. CPU 0 adopts CPU 1's orphaned callbacks.
2384 * 4. CPU 1 comes back online.
2385 * 5. We cause CPU 1 to post an rcu_barrier_callback() callback.
2386 * 6. Both rcu_barrier_callback() callbacks are invoked, awakening
2387 * us -- but before CPU 1's orphaned callbacks are invoked!!!
2388 */ 2548 */
2389 init_completion(&rsp->barrier_completion); 2549 init_completion(&rsp->barrier_completion);
2390 atomic_set(&rsp->barrier_cpu_count, 1); 2550 atomic_set(&rsp->barrier_cpu_count, 1);
2391 raw_spin_lock_irqsave(&rsp->onofflock, flags); 2551 get_online_cpus();
2392 rsp->rcu_barrier_in_progress = current;
2393 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2394 2552
2395 /* 2553 /*
2396 * Force every CPU with callbacks to register a new callback 2554 * Force each CPU with callbacks to register a new callback.
2397 * that will tell us when all the preceding callbacks have 2555 * When that callback is invoked, we will know that all of the
2398 * been invoked. If an offline CPU has callbacks, wait for 2556 * corresponding CPU's preceding callbacks have been invoked.
2399 * it to either come back online or to finish orphaning those
2400 * callbacks.
2401 */ 2557 */
2402 for_each_possible_cpu(cpu) { 2558 for_each_online_cpu(cpu) {
2403 preempt_disable();
2404 rdp = per_cpu_ptr(rsp->rda, cpu); 2559 rdp = per_cpu_ptr(rsp->rda, cpu);
2405 if (cpu_is_offline(cpu)) { 2560 if (ACCESS_ONCE(rdp->qlen)) {
2406 _rcu_barrier_trace(rsp, "Offline", cpu,
2407 rsp->n_barrier_done);
2408 preempt_enable();
2409 while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen))
2410 schedule_timeout_interruptible(1);
2411 } else if (ACCESS_ONCE(rdp->qlen)) {
2412 _rcu_barrier_trace(rsp, "OnlineQ", cpu, 2561 _rcu_barrier_trace(rsp, "OnlineQ", cpu,
2413 rsp->n_barrier_done); 2562 rsp->n_barrier_done);
2414 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); 2563 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
2415 preempt_enable();
2416 } else { 2564 } else {
2417 _rcu_barrier_trace(rsp, "OnlineNQ", cpu, 2565 _rcu_barrier_trace(rsp, "OnlineNQ", cpu,
2418 rsp->n_barrier_done); 2566 rsp->n_barrier_done);
2419 preempt_enable();
2420 } 2567 }
2421 } 2568 }
2422 2569 put_online_cpus();
2423 /*
2424 * Now that all online CPUs have rcu_barrier_callback() callbacks
2425 * posted, we can adopt all of the orphaned callbacks and place
2426 * an rcu_barrier_callback() callback after them. When that is done,
2427 * we are guaranteed to have an rcu_barrier_callback() callback
2428 * following every callback that could possibly have been
2429 * registered before _rcu_barrier() was called.
2430 */
2431 raw_spin_lock_irqsave(&rsp->onofflock, flags);
2432 rcu_adopt_orphan_cbs(rsp);
2433 rsp->rcu_barrier_in_progress = NULL;
2434 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2435 atomic_inc(&rsp->barrier_cpu_count);
2436 smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */
2437 rd.rsp = rsp;
2438 rsp->call(&rd.barrier_head, rcu_barrier_callback);
2439 2570
2440 /* 2571 /*
2441 * Now that we have an rcu_barrier_callback() callback on each 2572 * Now that we have an rcu_barrier_callback() callback on each
@@ -2456,8 +2587,6 @@ static void _rcu_barrier(struct rcu_state *rsp)
2456 2587
2457 /* Other rcu_barrier() invocations can now safely proceed. */ 2588 /* Other rcu_barrier() invocations can now safely proceed. */
2458 mutex_unlock(&rsp->barrier_mutex); 2589 mutex_unlock(&rsp->barrier_mutex);
2459
2460 destroy_rcu_head_on_stack(&rd.barrier_head);
2461} 2590}
2462 2591
2463/** 2592/**
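
The simplified barrier follows the usual completion-counting pattern: ->barrier_cpu_count is primed to 1 so the completion cannot fire while callbacks are still being posted, each online CPU that actually has callbacks queued contributes one more when rcu_barrier_func() posts its rcu_barrier_callback(), and only after the loop is the priming reference dropped so ->barrier_completion can trigger once every posted callback has run. With three online CPUs of which two hold callbacks, the count climbs 1 -> 2 -> 3 during the loop and then falls to zero as the two barrier callbacks and the final release arrive. Holding get_online_cpus() across the loop is what lets the old orphaned-callback choreography go away: no CPU can go offline, and have its callbacks adopted elsewhere, between the for_each_online_cpu() test and the smp_call_function_single().
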
@@ -2497,6 +2626,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
2497 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 2626 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
2498 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); 2627 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
2499 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); 2628 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
2629#ifdef CONFIG_RCU_USER_QS
2630 WARN_ON_ONCE(rdp->dynticks->in_user);
2631#endif
2500 rdp->cpu = cpu; 2632 rdp->cpu = cpu;
2501 rdp->rsp = rsp; 2633 rdp->rsp = rsp;
2502 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2634 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -2523,6 +2655,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2523 rdp->qlen_last_fqs_check = 0; 2655 rdp->qlen_last_fqs_check = 0;
2524 rdp->n_force_qs_snap = rsp->n_force_qs; 2656 rdp->n_force_qs_snap = rsp->n_force_qs;
2525 rdp->blimit = blimit; 2657 rdp->blimit = blimit;
2658 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
2526 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 2659 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
2527 atomic_set(&rdp->dynticks->dynticks, 2660 atomic_set(&rdp->dynticks->dynticks,
2528 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); 2661 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
@@ -2555,7 +2688,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2555 rdp->completed = rnp->completed; 2688 rdp->completed = rnp->completed;
2556 rdp->passed_quiesce = 0; 2689 rdp->passed_quiesce = 0;
2557 rdp->qs_pending = 0; 2690 rdp->qs_pending = 0;
2558 rdp->passed_quiesce_gpnum = rnp->gpnum - 1;
2559 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); 2691 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl");
2560 } 2692 }
2561 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ 2693 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
@@ -2594,12 +2726,10 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2594 break; 2726 break;
2595 case CPU_ONLINE: 2727 case CPU_ONLINE:
2596 case CPU_DOWN_FAILED: 2728 case CPU_DOWN_FAILED:
2597 rcu_node_kthread_setaffinity(rnp, -1); 2729 rcu_boost_kthread_setaffinity(rnp, -1);
2598 rcu_cpu_kthread_setrt(cpu, 1);
2599 break; 2730 break;
2600 case CPU_DOWN_PREPARE: 2731 case CPU_DOWN_PREPARE:
2601 rcu_node_kthread_setaffinity(rnp, cpu); 2732 rcu_boost_kthread_setaffinity(rnp, cpu);
2602 rcu_cpu_kthread_setrt(cpu, 0);
2603 break; 2733 break;
2604 case CPU_DYING: 2734 case CPU_DYING:
2605 case CPU_DYING_FROZEN: 2735 case CPU_DYING_FROZEN:
@@ -2627,6 +2757,28 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2627} 2757}
2628 2758
2629/* 2759/*
2760 * Spawn the kthread that handles this RCU flavor's grace periods.
2761 */
2762static int __init rcu_spawn_gp_kthread(void)
2763{
2764 unsigned long flags;
2765 struct rcu_node *rnp;
2766 struct rcu_state *rsp;
2767 struct task_struct *t;
2768
2769 for_each_rcu_flavor(rsp) {
2770 t = kthread_run(rcu_gp_kthread, rsp, rsp->name);
2771 BUG_ON(IS_ERR(t));
2772 rnp = rcu_get_root(rsp);
2773 raw_spin_lock_irqsave(&rnp->lock, flags);
2774 rsp->gp_kthread = t;
2775 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2776 }
2777 return 0;
2778}
2779early_initcall(rcu_spawn_gp_kthread);
2780
2781/*
2630 * This function is invoked towards the end of the scheduler's initialization 2782 * This function is invoked towards the end of the scheduler's initialization
2631 * process. Before this is called, the idle task might contain 2783 * process. Before this is called, the idle task might contain
2632 * RCU read-side critical sections (during which time, this idle 2784 * RCU read-side critical sections (during which time, this idle
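
The initcall above creates one grace-period kthread per flavor with kthread_run() and publishes it in ->gp_kthread under the root rcu_node lock. The consuming side is not in this hunk; purely as a sketch of the handshake (the real rcu_gp_kthread() elsewhere in this patch initializes, drives, and cleans up each grace period), the kthread sleeps on ->gp_wq until ->gp_flags requests work:

/* Illustrative wait loop only; not the actual rcu_gp_kthread() body. */
static int demo_gp_kthread(void *arg)
{
	struct rcu_state *rsp = arg;

	for (;;) {
		/* Sleep until a caller sets RCU_GP_FLAG_INIT and wakes gp_wq. */
		wait_event_interruptible(rsp->gp_wq,
					 ACCESS_ONCE(rsp->gp_flags) &
					 RCU_GP_FLAG_INIT);
		/* ... initialize the new grace period, wait for quiescent
		 * states (forcing them when RCU_GP_FLAG_FQS is set), then
		 * clean up and clear the flags ... */
	}
	return 0;
}
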
@@ -2661,7 +2813,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
2661 int cprv; 2813 int cprv;
2662 int i; 2814 int i;
2663 2815
2664 cprv = NR_CPUS; 2816 cprv = nr_cpu_ids;
2665 for (i = rcu_num_lvls - 1; i >= 0; i--) { 2817 for (i = rcu_num_lvls - 1; i >= 0; i--) {
2666 ccur = rsp->levelcnt[i]; 2818 ccur = rsp->levelcnt[i];
2667 rsp->levelspread[i] = (cprv + ccur - 1) / ccur; 2819 rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
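
Seeding the spread computation with nr_cpu_ids instead of NR_CPUS sizes the rcu_node tree for the CPUs actually detected at boot rather than the compile-time maximum. The per-level value is a ceiling division: with nr_cpu_ids = 36 and three leaf rcu_node structures, (36 + 3 - 1) / 3 = 12, so each leaf covers at most 12 CPUs, whereas a spread derived from a much larger NR_CPUS would tend to crowd all 36 real CPUs into the first leaf and leave the rest of the tree unused.
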
@@ -2676,10 +2828,14 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
2676static void __init rcu_init_one(struct rcu_state *rsp, 2828static void __init rcu_init_one(struct rcu_state *rsp,
2677 struct rcu_data __percpu *rda) 2829 struct rcu_data __percpu *rda)
2678{ 2830{
2679 static char *buf[] = { "rcu_node_level_0", 2831 static char *buf[] = { "rcu_node_0",
2680 "rcu_node_level_1", 2832 "rcu_node_1",
2681 "rcu_node_level_2", 2833 "rcu_node_2",
2682 "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */ 2834 "rcu_node_3" }; /* Match MAX_RCU_LVLS */
2835 static char *fqs[] = { "rcu_node_fqs_0",
2836 "rcu_node_fqs_1",
2837 "rcu_node_fqs_2",
2838 "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
2683 int cpustride = 1; 2839 int cpustride = 1;
2684 int i; 2840 int i;
2685 int j; 2841 int j;
@@ -2704,7 +2860,11 @@ static void __init rcu_init_one(struct rcu_state *rsp,
2704 raw_spin_lock_init(&rnp->lock); 2860 raw_spin_lock_init(&rnp->lock);
2705 lockdep_set_class_and_name(&rnp->lock, 2861 lockdep_set_class_and_name(&rnp->lock,
2706 &rcu_node_class[i], buf[i]); 2862 &rcu_node_class[i], buf[i]);
2707 rnp->gpnum = 0; 2863 raw_spin_lock_init(&rnp->fqslock);
2864 lockdep_set_class_and_name(&rnp->fqslock,
2865 &rcu_fqs_class[i], fqs[i]);
2866 rnp->gpnum = rsp->gpnum;
2867 rnp->completed = rsp->completed;
2708 rnp->qsmask = 0; 2868 rnp->qsmask = 0;
2709 rnp->qsmaskinit = 0; 2869 rnp->qsmaskinit = 0;
2710 rnp->grplo = j * cpustride; 2870 rnp->grplo = j * cpustride;
@@ -2727,6 +2887,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
2727 } 2887 }
2728 2888
2729 rsp->rda = rda; 2889 rsp->rda = rda;
2890 init_waitqueue_head(&rsp->gp_wq);
2730 rnp = rsp->level[rcu_num_lvls - 1]; 2891 rnp = rsp->level[rcu_num_lvls - 1];
2731 for_each_possible_cpu(i) { 2892 for_each_possible_cpu(i) {
2732 while (i > rnp->grphi) 2893 while (i > rnp->grphi)
@@ -2750,7 +2911,8 @@ static void __init rcu_init_geometry(void)
2750 int rcu_capacity[MAX_RCU_LVLS + 1]; 2911 int rcu_capacity[MAX_RCU_LVLS + 1];
2751 2912
2752 /* If the compile-time values are accurate, just leave. */ 2913 /* If the compile-time values are accurate, just leave. */
2753 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF) 2914 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&
2915 nr_cpu_ids == NR_CPUS)
2754 return; 2916 return;
2755 2917
2756 /* 2918 /*
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4d29169f2124..5faf05d68326 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -102,6 +102,10 @@ struct rcu_dynticks {
102 /* idle-period nonlazy_posted snapshot. */ 102 /* idle-period nonlazy_posted snapshot. */
103 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ 103 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
104#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 104#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
105#ifdef CONFIG_RCU_USER_QS
106 bool ignore_user_qs; /* Treat userspace as extended QS or not */
107 bool in_user; /* Is the CPU in userland from RCU POV? */
108#endif
105}; 109};
106 110
107/* RCU's kthread states for tracing. */ 111/* RCU's kthread states for tracing. */
@@ -196,12 +200,7 @@ struct rcu_node {
196 /* Refused to boost: not sure why, though. */ 200 /* Refused to boost: not sure why, though. */
197 /* This can happen due to race conditions. */ 201 /* This can happen due to race conditions. */
198#endif /* #ifdef CONFIG_RCU_BOOST */ 202#endif /* #ifdef CONFIG_RCU_BOOST */
199 struct task_struct *node_kthread_task; 203 raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
200 /* kthread that takes care of this rcu_node */
201 /* structure, for example, awakening the */
202 /* per-CPU kthreads as needed. */
203 unsigned int node_kthread_status;
204 /* State of node_kthread_task for tracing. */
205} ____cacheline_internodealigned_in_smp; 204} ____cacheline_internodealigned_in_smp;
206 205
207/* 206/*
@@ -245,8 +244,6 @@ struct rcu_data {
245 /* in order to detect GP end. */ 244 /* in order to detect GP end. */
246 unsigned long gpnum; /* Highest gp number that this CPU */ 245 unsigned long gpnum; /* Highest gp number that this CPU */
247 /* is aware of having started. */ 246 /* is aware of having started. */
248 unsigned long passed_quiesce_gpnum;
249 /* gpnum at time of quiescent state. */
250 bool passed_quiesce; /* User-mode/idle loop etc. */ 247 bool passed_quiesce; /* User-mode/idle loop etc. */
251 bool qs_pending; /* Core waits for quiesc state. */ 248 bool qs_pending; /* Core waits for quiesc state. */
252 bool beenonline; /* CPU online at least once. */ 249 bool beenonline; /* CPU online at least once. */
@@ -312,11 +309,13 @@ struct rcu_data {
312 unsigned long n_rp_cpu_needs_gp; 309 unsigned long n_rp_cpu_needs_gp;
313 unsigned long n_rp_gp_completed; 310 unsigned long n_rp_gp_completed;
314 unsigned long n_rp_gp_started; 311 unsigned long n_rp_gp_started;
315 unsigned long n_rp_need_fqs;
316 unsigned long n_rp_need_nothing; 312 unsigned long n_rp_need_nothing;
317 313
318 /* 6) _rcu_barrier() callback. */ 314 /* 6) _rcu_barrier() and OOM callbacks. */
319 struct rcu_head barrier_head; 315 struct rcu_head barrier_head;
316#ifdef CONFIG_RCU_FAST_NO_HZ
317 struct rcu_head oom_head;
318#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
320 319
321 int cpu; 320 int cpu;
322 struct rcu_state *rsp; 321 struct rcu_state *rsp;
@@ -375,20 +374,17 @@ struct rcu_state {
375 374
376 u8 fqs_state ____cacheline_internodealigned_in_smp; 375 u8 fqs_state ____cacheline_internodealigned_in_smp;
377 /* Force QS state. */ 376 /* Force QS state. */
378 u8 fqs_active; /* force_quiescent_state() */
379 /* is running. */
380 u8 fqs_need_gp; /* A CPU was prevented from */
381 /* starting a new grace */
382 /* period because */
383 /* force_quiescent_state() */
384 /* was running. */
385 u8 boost; /* Subject to priority boost. */ 377 u8 boost; /* Subject to priority boost. */
386 unsigned long gpnum; /* Current gp number. */ 378 unsigned long gpnum; /* Current gp number. */
387 unsigned long completed; /* # of last completed gp. */ 379 unsigned long completed; /* # of last completed gp. */
380 struct task_struct *gp_kthread; /* Task for grace periods. */
381 wait_queue_head_t gp_wq; /* Where GP task waits. */
382 int gp_flags; /* Commands for GP task. */
388 383
389 /* End of fields guarded by root rcu_node's lock. */ 384 /* End of fields guarded by root rcu_node's lock. */
390 385
391 raw_spinlock_t onofflock; /* exclude on/offline and */ 386 raw_spinlock_t onofflock ____cacheline_internodealigned_in_smp;
387 /* exclude on/offline and */
392 /* starting new GP. */ 388 /* starting new GP. */
393 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ 389 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */
394 /* need a grace period. */ 390 /* need a grace period. */
@@ -398,16 +394,11 @@ struct rcu_state {
398 struct rcu_head **orphan_donetail; /* Tail of above. */ 394 struct rcu_head **orphan_donetail; /* Tail of above. */
399 long qlen_lazy; /* Number of lazy callbacks. */ 395 long qlen_lazy; /* Number of lazy callbacks. */
400 long qlen; /* Total number of callbacks. */ 396 long qlen; /* Total number of callbacks. */
401 struct task_struct *rcu_barrier_in_progress;
402 /* Task doing rcu_barrier(), */
403 /* or NULL if no barrier. */
404 struct mutex barrier_mutex; /* Guards barrier fields. */ 397 struct mutex barrier_mutex; /* Guards barrier fields. */
405 atomic_t barrier_cpu_count; /* # CPUs waiting on. */ 398 atomic_t barrier_cpu_count; /* # CPUs waiting on. */
406 struct completion barrier_completion; /* Wake at barrier end. */ 399 struct completion barrier_completion; /* Wake at barrier end. */
407 unsigned long n_barrier_done; /* ++ at start and end of */ 400 unsigned long n_barrier_done; /* ++ at start and end of */
408 /* _rcu_barrier(). */ 401 /* _rcu_barrier(). */
409 raw_spinlock_t fqslock; /* Only one task forcing */
410 /* quiescent states. */
411 unsigned long jiffies_force_qs; /* Time at which to invoke */ 402 unsigned long jiffies_force_qs; /* Time at which to invoke */
412 /* force_quiescent_state(). */ 403 /* force_quiescent_state(). */
413 unsigned long n_force_qs; /* Number of calls to */ 404 unsigned long n_force_qs; /* Number of calls to */
@@ -426,6 +417,10 @@ struct rcu_state {
426 struct list_head flavors; /* List of RCU flavors. */ 417 struct list_head flavors; /* List of RCU flavors. */
427}; 418};
428 419
420/* Values for rcu_state structure's gp_flags field. */
421#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */
422#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */
423
429extern struct list_head rcu_struct_flavors; 424extern struct list_head rcu_struct_flavors;
430#define for_each_rcu_flavor(rsp) \ 425#define for_each_rcu_flavor(rsp) \
431 list_for_each_entry((rsp), &rcu_struct_flavors, flavors) 426 list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
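
gp_flags sits among the fields guarded by the root rcu_node's lock, and gp_wq is where the new grace-period kthread sleeps. As a hedged sketch of how a request reaches that kthread (an illustration of the handshake, not the exact helper this patch adds), a caller sets a flag under the root lock and then issues the wake-up:

/* Illustration only; assumes the rcu_state/rcu_node declarations above. */
static void demo_request_fqs(struct rcu_state *rsp)
{
	unsigned long flags;
	struct rcu_node *rnp = rcu_get_root(rsp);

	raw_spin_lock_irqsave(&rnp->lock, flags);
	rsp->gp_flags |= RCU_GP_FLAG_FQS;	/* guarded by root rnp->lock */
	raw_spin_unlock_irqrestore(&rnp->lock, flags);
	wake_up(&rsp->gp_wq);			/* kick the GP kthread */
}
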
@@ -468,7 +463,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
468#ifdef CONFIG_HOTPLUG_CPU 463#ifdef CONFIG_HOTPLUG_CPU
469static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 464static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
470 unsigned long flags); 465 unsigned long flags);
471static void rcu_stop_cpu_kthread(int cpu);
472#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 466#endif /* #ifdef CONFIG_HOTPLUG_CPU */
473static void rcu_print_detail_task_stall(struct rcu_state *rsp); 467static void rcu_print_detail_task_stall(struct rcu_state *rsp);
474static int rcu_print_task_stall(struct rcu_node *rnp); 468static int rcu_print_task_stall(struct rcu_node *rnp);
@@ -491,15 +485,9 @@ static void invoke_rcu_callbacks_kthread(void);
491static bool rcu_is_callbacks_kthread(void); 485static bool rcu_is_callbacks_kthread(void);
492#ifdef CONFIG_RCU_BOOST 486#ifdef CONFIG_RCU_BOOST
493static void rcu_preempt_do_callbacks(void); 487static void rcu_preempt_do_callbacks(void);
494static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
495 cpumask_var_t cm);
496static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 488static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
497 struct rcu_node *rnp, 489 struct rcu_node *rnp);
498 int rnp_index);
499static void invoke_rcu_node_kthread(struct rcu_node *rnp);
500static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
501#endif /* #ifdef CONFIG_RCU_BOOST */ 490#endif /* #ifdef CONFIG_RCU_BOOST */
502static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
503static void __cpuinit rcu_prepare_kthreads(int cpu); 491static void __cpuinit rcu_prepare_kthreads(int cpu);
504static void rcu_prepare_for_idle_init(int cpu); 492static void rcu_prepare_for_idle_init(int cpu);
505static void rcu_cleanup_after_idle(int cpu); 493static void rcu_cleanup_after_idle(int cpu);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 7f3244c0df01..f92115488187 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,8 @@
25 */ 25 */
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/oom.h>
29#include <linux/smpboot.h>
28 30
29#define RCU_KTHREAD_PRIO 1 31#define RCU_KTHREAD_PRIO 1
30 32
@@ -118,7 +120,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
118 */ 120 */
119void rcu_force_quiescent_state(void) 121void rcu_force_quiescent_state(void)
120{ 122{
121 force_quiescent_state(&rcu_preempt_state, 0); 123 force_quiescent_state(&rcu_preempt_state);
122} 124}
123EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 125EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
124 126
@@ -136,8 +138,6 @@ static void rcu_preempt_qs(int cpu)
136{ 138{
137 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 139 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
138 140
139 rdp->passed_quiesce_gpnum = rdp->gpnum;
140 barrier();
141 if (rdp->passed_quiesce == 0) 141 if (rdp->passed_quiesce == 0)
142 trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); 142 trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs");
143 rdp->passed_quiesce = 1; 143 rdp->passed_quiesce = 1;
@@ -422,9 +422,11 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
422 unsigned long flags; 422 unsigned long flags;
423 struct task_struct *t; 423 struct task_struct *t;
424 424
425 if (!rcu_preempt_blocked_readers_cgp(rnp))
426 return;
427 raw_spin_lock_irqsave(&rnp->lock, flags); 425 raw_spin_lock_irqsave(&rnp->lock, flags);
426 if (!rcu_preempt_blocked_readers_cgp(rnp)) {
427 raw_spin_unlock_irqrestore(&rnp->lock, flags);
428 return;
429 }
428 t = list_entry(rnp->gp_tasks, 430 t = list_entry(rnp->gp_tasks,
429 struct task_struct, rcu_node_entry); 431 struct task_struct, rcu_node_entry);
430 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) 432 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
@@ -584,17 +586,23 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
584 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ 586 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
585 } 587 }
586 588
589 rnp->gp_tasks = NULL;
590 rnp->exp_tasks = NULL;
587#ifdef CONFIG_RCU_BOOST 591#ifdef CONFIG_RCU_BOOST
588 /* In case root is being boosted and leaf is not. */ 592 rnp->boost_tasks = NULL;
593 /*
594 * In case root is being boosted and leaf was not. Make sure
595 * that we boost the tasks blocking the current grace period
596 * in this case.
597 */
589 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ 598 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
590 if (rnp_root->boost_tasks != NULL && 599 if (rnp_root->boost_tasks != NULL &&
591 rnp_root->boost_tasks != rnp_root->gp_tasks) 600 rnp_root->boost_tasks != rnp_root->gp_tasks &&
601 rnp_root->boost_tasks != rnp_root->exp_tasks)
592 rnp_root->boost_tasks = rnp_root->gp_tasks; 602 rnp_root->boost_tasks = rnp_root->gp_tasks;
593 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ 603 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
594#endif /* #ifdef CONFIG_RCU_BOOST */ 604#endif /* #ifdef CONFIG_RCU_BOOST */
595 605
596 rnp->gp_tasks = NULL;
597 rnp->exp_tasks = NULL;
598 return retval; 606 return retval;
599} 607}
600 608
@@ -676,7 +684,7 @@ void synchronize_rcu(void)
676EXPORT_SYMBOL_GPL(synchronize_rcu); 684EXPORT_SYMBOL_GPL(synchronize_rcu);
677 685
678static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); 686static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
679static long sync_rcu_preempt_exp_count; 687static unsigned long sync_rcu_preempt_exp_count;
680static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); 688static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
681 689
682/* 690/*
@@ -791,7 +799,7 @@ void synchronize_rcu_expedited(void)
791 unsigned long flags; 799 unsigned long flags;
792 struct rcu_node *rnp; 800 struct rcu_node *rnp;
793 struct rcu_state *rsp = &rcu_preempt_state; 801 struct rcu_state *rsp = &rcu_preempt_state;
794 long snap; 802 unsigned long snap;
795 int trycount = 0; 803 int trycount = 0;
796 804
797 smp_mb(); /* Caller's modifications seen first by other CPUs. */ 805 smp_mb(); /* Caller's modifications seen first by other CPUs. */
@@ -799,33 +807,47 @@ void synchronize_rcu_expedited(void)
799 smp_mb(); /* Above access cannot bleed into critical section. */ 807 smp_mb(); /* Above access cannot bleed into critical section. */
800 808
801 /* 809 /*
810 * Block CPU-hotplug operations. This means that any CPU-hotplug
811 * operation that finds an rcu_node structure with tasks in the
812 * process of being boosted will know that all tasks blocking
813 * this expedited grace period will already be in the process of
814 * being boosted. This simplifies the process of moving tasks
815 * from leaf to root rcu_node structures.
816 */
817 get_online_cpus();
818
819 /*
802 * Acquire lock, falling back to synchronize_rcu() if too many 820 * Acquire lock, falling back to synchronize_rcu() if too many
803 * lock-acquisition failures. Of course, if someone does the 821 * lock-acquisition failures. Of course, if someone does the
804 * expedited grace period for us, just leave. 822 * expedited grace period for us, just leave.
805 */ 823 */
806 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { 824 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
825 if (ULONG_CMP_LT(snap,
826 ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
827 put_online_cpus();
828 goto mb_ret; /* Others did our work for us. */
829 }
807 if (trycount++ < 10) { 830 if (trycount++ < 10) {
808 udelay(trycount * num_online_cpus()); 831 udelay(trycount * num_online_cpus());
809 } else { 832 } else {
833 put_online_cpus();
810 synchronize_rcu(); 834 synchronize_rcu();
811 return; 835 return;
812 } 836 }
813 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
814 goto mb_ret; /* Others did our work for us. */
815 } 837 }
816 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) 838 if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
839 put_online_cpus();
817 goto unlock_mb_ret; /* Others did our work for us. */ 840 goto unlock_mb_ret; /* Others did our work for us. */
841 }
818 842
819 /* force all RCU readers onto ->blkd_tasks lists. */ 843 /* force all RCU readers onto ->blkd_tasks lists. */
820 synchronize_sched_expedited(); 844 synchronize_sched_expedited();
821 845
822 raw_spin_lock_irqsave(&rsp->onofflock, flags);
823
824 /* Initialize ->expmask for all non-leaf rcu_node structures. */ 846 /* Initialize ->expmask for all non-leaf rcu_node structures. */
825 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { 847 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
826 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 848 raw_spin_lock_irqsave(&rnp->lock, flags);
827 rnp->expmask = rnp->qsmaskinit; 849 rnp->expmask = rnp->qsmaskinit;
828 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 850 raw_spin_unlock_irqrestore(&rnp->lock, flags);
829 } 851 }
830 852
831 /* Snapshot current state of ->blkd_tasks lists. */ 853 /* Snapshot current state of ->blkd_tasks lists. */
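
Two related changes show up in this retry loop. First, the expedited-GP counter and its snapshot become unsigned long, so the "someone else already did our work" test is ULONG_CMP_LT(snap, count), well-defined modular arithmetic, instead of the old (count - snap) > 0, which leaned on signed overflow. Second, because the function now holds the CPU-hotplug read lock for the reason given in the comment above, every early-exit path must pair it with put_online_cpus() before returning or falling back to synchronize_rcu().
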
@@ -834,7 +856,7 @@ void synchronize_rcu_expedited(void)
834 if (NUM_RCU_NODES > 1) 856 if (NUM_RCU_NODES > 1)
835 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); 857 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
836 858
837 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 859 put_online_cpus();
838 860
839 /* Wait for snapshotted ->blkd_tasks lists to drain. */ 861 /* Wait for snapshotted ->blkd_tasks lists to drain. */
840 rnp = rcu_get_root(rsp); 862 rnp = rcu_get_root(rsp);
@@ -1069,6 +1091,16 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1069 1091
1070#endif /* #else #ifdef CONFIG_RCU_TRACE */ 1092#endif /* #else #ifdef CONFIG_RCU_TRACE */
1071 1093
1094static void rcu_wake_cond(struct task_struct *t, int status)
1095{
1096 /*
1097 * If the thread is yielding, only wake it when this
1098 * is invoked from idle
1099 */
1100 if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
1101 wake_up_process(t);
1102}
1103
1072/* 1104/*
1073 * Carry out RCU priority boosting on the task indicated by ->exp_tasks 1105 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1074 * or ->boost_tasks, advancing the pointer to the next task in the 1106 * or ->boost_tasks, advancing the pointer to the next task in the
@@ -1141,17 +1173,6 @@ static int rcu_boost(struct rcu_node *rnp)
1141} 1173}
1142 1174
1143/* 1175/*
1144 * Timer handler to initiate waking up of boost kthreads that
1145 * have yielded the CPU due to excessive numbers of tasks to
1146 * boost. We wake up the per-rcu_node kthread, which in turn
1147 * will wake up the booster kthread.
1148 */
1149static void rcu_boost_kthread_timer(unsigned long arg)
1150{
1151 invoke_rcu_node_kthread((struct rcu_node *)arg);
1152}
1153
1154/*
1155 * Priority-boosting kthread. One per leaf rcu_node and one for the 1176 * Priority-boosting kthread. One per leaf rcu_node and one for the
1156 * root rcu_node. 1177 * root rcu_node.
1157 */ 1178 */
@@ -1174,8 +1195,9 @@ static int rcu_boost_kthread(void *arg)
1174 else 1195 else
1175 spincnt = 0; 1196 spincnt = 0;
1176 if (spincnt > 10) { 1197 if (spincnt > 10) {
1198 rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
1177 trace_rcu_utilization("End boost kthread@rcu_yield"); 1199 trace_rcu_utilization("End boost kthread@rcu_yield");
1178 rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); 1200 schedule_timeout_interruptible(2);
1179 trace_rcu_utilization("Start boost kthread@rcu_yield"); 1201 trace_rcu_utilization("Start boost kthread@rcu_yield");
1180 spincnt = 0; 1202 spincnt = 0;
1181 } 1203 }
@@ -1191,9 +1213,9 @@ static int rcu_boost_kthread(void *arg)
1191 * kthread to start boosting them. If there is an expedited grace 1213 * kthread to start boosting them. If there is an expedited grace
1192 * period in progress, it is always time to boost. 1214 * period in progress, it is always time to boost.
1193 * 1215 *
1194 * The caller must hold rnp->lock, which this function releases, 1216 * The caller must hold rnp->lock, which this function releases.
1195 * but irqs remain disabled. The ->boost_kthread_task is immortal, 1217 * The ->boost_kthread_task is immortal, so we don't need to worry
1196 * so we don't need to worry about it going away. 1218 * about it going away.
1197 */ 1219 */
1198static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) 1220static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1199{ 1221{
@@ -1213,8 +1235,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1213 rnp->boost_tasks = rnp->gp_tasks; 1235 rnp->boost_tasks = rnp->gp_tasks;
1214 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1236 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1215 t = rnp->boost_kthread_task; 1237 t = rnp->boost_kthread_task;
1216 if (t != NULL) 1238 if (t)
1217 wake_up_process(t); 1239 rcu_wake_cond(t, rnp->boost_kthread_status);
1218 } else { 1240 } else {
1219 rcu_initiate_boost_trace(rnp); 1241 rcu_initiate_boost_trace(rnp);
1220 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1242 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1231,8 +1253,10 @@ static void invoke_rcu_callbacks_kthread(void)
1231 local_irq_save(flags); 1253 local_irq_save(flags);
1232 __this_cpu_write(rcu_cpu_has_work, 1); 1254 __this_cpu_write(rcu_cpu_has_work, 1);
1233 if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && 1255 if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
1234 current != __this_cpu_read(rcu_cpu_kthread_task)) 1256 current != __this_cpu_read(rcu_cpu_kthread_task)) {
1235 wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); 1257 rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
1258 __this_cpu_read(rcu_cpu_kthread_status));
1259 }
1236 local_irq_restore(flags); 1260 local_irq_restore(flags);
1237} 1261}
1238 1262
@@ -1245,21 +1269,6 @@ static bool rcu_is_callbacks_kthread(void)
1245 return __get_cpu_var(rcu_cpu_kthread_task) == current; 1269 return __get_cpu_var(rcu_cpu_kthread_task) == current;
1246} 1270}
1247 1271
1248/*
1249 * Set the affinity of the boost kthread. The CPU-hotplug locks are
1250 * held, so no one should be messing with the existence of the boost
1251 * kthread.
1252 */
1253static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
1254 cpumask_var_t cm)
1255{
1256 struct task_struct *t;
1257
1258 t = rnp->boost_kthread_task;
1259 if (t != NULL)
1260 set_cpus_allowed_ptr(rnp->boost_kthread_task, cm);
1261}
1262
1263#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) 1272#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
1264 1273
1265/* 1274/*
@@ -1276,15 +1285,19 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1276 * Returns zero if all is well, a negated errno otherwise. 1285 * Returns zero if all is well, a negated errno otherwise.
1277 */ 1286 */
1278static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 1287static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1279 struct rcu_node *rnp, 1288 struct rcu_node *rnp)
1280 int rnp_index)
1281{ 1289{
1290 int rnp_index = rnp - &rsp->node[0];
1282 unsigned long flags; 1291 unsigned long flags;
1283 struct sched_param sp; 1292 struct sched_param sp;
1284 struct task_struct *t; 1293 struct task_struct *t;
1285 1294
1286 if (&rcu_preempt_state != rsp) 1295 if (&rcu_preempt_state != rsp)
1287 return 0; 1296 return 0;
1297
1298 if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0)
1299 return 0;
1300
1288 rsp->boost = 1; 1301 rsp->boost = 1;
1289 if (rnp->boost_kthread_task != NULL) 1302 if (rnp->boost_kthread_task != NULL)
1290 return 0; 1303 return 0;
@@ -1301,25 +1314,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1301 return 0; 1314 return 0;
1302} 1315}
1303 1316
1304#ifdef CONFIG_HOTPLUG_CPU
1305
1306/*
1307 * Stop the RCU's per-CPU kthread when its CPU goes offline,.
1308 */
1309static void rcu_stop_cpu_kthread(int cpu)
1310{
1311 struct task_struct *t;
1312
1313 /* Stop the CPU's kthread. */
1314 t = per_cpu(rcu_cpu_kthread_task, cpu);
1315 if (t != NULL) {
1316 per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
1317 kthread_stop(t);
1318 }
1319}
1320
1321#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1322
1323static void rcu_kthread_do_work(void) 1317static void rcu_kthread_do_work(void)
1324{ 1318{
1325 rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); 1319 rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
@@ -1327,112 +1321,22 @@ static void rcu_kthread_do_work(void)
1327 rcu_preempt_do_callbacks(); 1321 rcu_preempt_do_callbacks();
1328} 1322}
1329 1323
1330/* 1324static void rcu_cpu_kthread_setup(unsigned int cpu)
1331 * Wake up the specified per-rcu_node-structure kthread.
1332 * Because the per-rcu_node kthreads are immortal, we don't need
1333 * to do anything to keep them alive.
1334 */
1335static void invoke_rcu_node_kthread(struct rcu_node *rnp)
1336{
1337 struct task_struct *t;
1338
1339 t = rnp->node_kthread_task;
1340 if (t != NULL)
1341 wake_up_process(t);
1342}
1343
1344/*
1345 * Set the specified CPU's kthread to run RT or not, as specified by
1346 * the to_rt argument. The CPU-hotplug locks are held, so the task
1347 * is not going away.
1348 */
1349static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1350{ 1325{
1351 int policy;
1352 struct sched_param sp; 1326 struct sched_param sp;
1353 struct task_struct *t;
1354 1327
1355 t = per_cpu(rcu_cpu_kthread_task, cpu); 1328 sp.sched_priority = RCU_KTHREAD_PRIO;
1356 if (t == NULL) 1329 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1357 return;
1358 if (to_rt) {
1359 policy = SCHED_FIFO;
1360 sp.sched_priority = RCU_KTHREAD_PRIO;
1361 } else {
1362 policy = SCHED_NORMAL;
1363 sp.sched_priority = 0;
1364 }
1365 sched_setscheduler_nocheck(t, policy, &sp);
1366} 1330}
1367 1331
1368/* 1332static void rcu_cpu_kthread_park(unsigned int cpu)
1369 * Timer handler to initiate the waking up of per-CPU kthreads that
1370 * have yielded the CPU due to excess numbers of RCU callbacks.
1371 * We wake up the per-rcu_node kthread, which in turn will wake up
1372 * the booster kthread.
1373 */
1374static void rcu_cpu_kthread_timer(unsigned long arg)
1375{ 1333{
1376 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg); 1334 per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
1377 struct rcu_node *rnp = rdp->mynode;
1378
1379 atomic_or(rdp->grpmask, &rnp->wakemask);
1380 invoke_rcu_node_kthread(rnp);
1381} 1335}
1382 1336
1383/* 1337static int rcu_cpu_kthread_should_run(unsigned int cpu)
1384 * Drop to non-real-time priority and yield, but only after posting a
1385 * timer that will cause us to regain our real-time priority if we
1386 * remain preempted. Either way, we restore our real-time priority
1387 * before returning.
1388 */
1389static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1390{ 1338{
1391 struct sched_param sp; 1339 return __get_cpu_var(rcu_cpu_has_work);
1392 struct timer_list yield_timer;
1393 int prio = current->rt_priority;
1394
1395 setup_timer_on_stack(&yield_timer, f, arg);
1396 mod_timer(&yield_timer, jiffies + 2);
1397 sp.sched_priority = 0;
1398 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
1399 set_user_nice(current, 19);
1400 schedule();
1401 set_user_nice(current, 0);
1402 sp.sched_priority = prio;
1403 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1404 del_timer(&yield_timer);
1405}
1406
1407/*
1408 * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
1409 * This can happen while the corresponding CPU is either coming online
1410 * or going offline. We cannot wait until the CPU is fully online
1411 * before starting the kthread, because the various notifier functions
1412 * can wait for RCU grace periods. So we park rcu_cpu_kthread() until
1413 * the corresponding CPU is online.
1414 *
1415 * Return 1 if the kthread needs to stop, 0 otherwise.
1416 *
1417 * Caller must disable bh. This function can momentarily enable it.
1418 */
1419static int rcu_cpu_kthread_should_stop(int cpu)
1420{
1421 while (cpu_is_offline(cpu) ||
1422 !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
1423 smp_processor_id() != cpu) {
1424 if (kthread_should_stop())
1425 return 1;
1426 per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
1427 per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
1428 local_bh_enable();
1429 schedule_timeout_uninterruptible(1);
1430 if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
1431 set_cpus_allowed_ptr(current, cpumask_of(cpu));
1432 local_bh_disable();
1433 }
1434 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1435 return 0;
1436} 1340}
1437 1341
1438/* 1342/*
@@ -1440,138 +1344,35 @@ static int rcu_cpu_kthread_should_stop(int cpu)
1440 * RCU softirq used in flavors and configurations of RCU that do not 1344 * RCU softirq used in flavors and configurations of RCU that do not
1441 * support RCU priority boosting. 1345 * support RCU priority boosting.
1442 */ 1346 */
1443static int rcu_cpu_kthread(void *arg) 1347static void rcu_cpu_kthread(unsigned int cpu)
1444{ 1348{
1445 int cpu = (int)(long)arg; 1349 unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status);
1446 unsigned long flags; 1350 char work, *workp = &__get_cpu_var(rcu_cpu_has_work);
1447 int spincnt = 0; 1351 int spincnt;
1448 unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
1449 char work;
1450 char *workp = &per_cpu(rcu_cpu_has_work, cpu);
1451 1352
1452 trace_rcu_utilization("Start CPU kthread@init"); 1353 for (spincnt = 0; spincnt < 10; spincnt++) {
1453 for (;;) {
1454 *statusp = RCU_KTHREAD_WAITING;
1455 trace_rcu_utilization("End CPU kthread@rcu_wait");
1456 rcu_wait(*workp != 0 || kthread_should_stop());
1457 trace_rcu_utilization("Start CPU kthread@rcu_wait"); 1354 trace_rcu_utilization("Start CPU kthread@rcu_wait");
1458 local_bh_disable(); 1355 local_bh_disable();
1459 if (rcu_cpu_kthread_should_stop(cpu)) {
1460 local_bh_enable();
1461 break;
1462 }
1463 *statusp = RCU_KTHREAD_RUNNING; 1356 *statusp = RCU_KTHREAD_RUNNING;
1464 per_cpu(rcu_cpu_kthread_loops, cpu)++; 1357 this_cpu_inc(rcu_cpu_kthread_loops);
1465 local_irq_save(flags); 1358 local_irq_disable();
1466 work = *workp; 1359 work = *workp;
1467 *workp = 0; 1360 *workp = 0;
1468 local_irq_restore(flags); 1361 local_irq_enable();
1469 if (work) 1362 if (work)
1470 rcu_kthread_do_work(); 1363 rcu_kthread_do_work();
1471 local_bh_enable(); 1364 local_bh_enable();
1472 if (*workp != 0) 1365 if (*workp == 0) {
1473 spincnt++; 1366 trace_rcu_utilization("End CPU kthread@rcu_wait");
1474 else 1367 *statusp = RCU_KTHREAD_WAITING;
1475 spincnt = 0; 1368 return;
1476 if (spincnt > 10) {
1477 *statusp = RCU_KTHREAD_YIELDING;
1478 trace_rcu_utilization("End CPU kthread@rcu_yield");
1479 rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
1480 trace_rcu_utilization("Start CPU kthread@rcu_yield");
1481 spincnt = 0;
1482 }
1483 }
1484 *statusp = RCU_KTHREAD_STOPPED;
1485 trace_rcu_utilization("End CPU kthread@term");
1486 return 0;
1487}
1488
1489/*
1490 * Spawn a per-CPU kthread, setting up affinity and priority.
1491 * Because the CPU hotplug lock is held, no other CPU will be attempting
1492 * to manipulate rcu_cpu_kthread_task. There might be another CPU
1493 * attempting to access it during boot, but the locking in kthread_bind()
1494 * will enforce sufficient ordering.
1495 *
1496 * Please note that we cannot simply refuse to wake up the per-CPU
1497 * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state,
1498 * which can result in softlockup complaints if the task ends up being
1499 * idle for more than a couple of minutes.
1500 *
1501 * However, please note also that we cannot bind the per-CPU kthread to its
1502 * CPU until that CPU is fully online. We also cannot wait until the
1503 * CPU is fully online before we create its per-CPU kthread, as this would
1504 * deadlock the system when CPU notifiers tried waiting for grace
1505 * periods. So we bind the per-CPU kthread to its CPU only if the CPU
1506 * is online. If its CPU is not yet fully online, then the code in
1507 * rcu_cpu_kthread() will wait until it is fully online, and then do
1508 * the binding.
1509 */
1510static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
1511{
1512 struct sched_param sp;
1513 struct task_struct *t;
1514
1515 if (!rcu_scheduler_fully_active ||
1516 per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
1517 return 0;
1518 t = kthread_create_on_node(rcu_cpu_kthread,
1519 (void *)(long)cpu,
1520 cpu_to_node(cpu),
1521 "rcuc/%d", cpu);
1522 if (IS_ERR(t))
1523 return PTR_ERR(t);
1524 if (cpu_online(cpu))
1525 kthread_bind(t, cpu);
1526 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1527 WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
1528 sp.sched_priority = RCU_KTHREAD_PRIO;
1529 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1530 per_cpu(rcu_cpu_kthread_task, cpu) = t;
1531 wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */
1532 return 0;
1533}
1534
1535/*
1536 * Per-rcu_node kthread, which is in charge of waking up the per-CPU
1537 * kthreads when needed. We ignore requests to wake up kthreads
1538 * for offline CPUs, which is OK because force_quiescent_state()
1539 * takes care of this case.
1540 */
1541static int rcu_node_kthread(void *arg)
1542{
1543 int cpu;
1544 unsigned long flags;
1545 unsigned long mask;
1546 struct rcu_node *rnp = (struct rcu_node *)arg;
1547 struct sched_param sp;
1548 struct task_struct *t;
1549
1550 for (;;) {
1551 rnp->node_kthread_status = RCU_KTHREAD_WAITING;
1552 rcu_wait(atomic_read(&rnp->wakemask) != 0);
1553 rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
1554 raw_spin_lock_irqsave(&rnp->lock, flags);
1555 mask = atomic_xchg(&rnp->wakemask, 0);
1556 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1557 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
1558 if ((mask & 0x1) == 0)
1559 continue;
1560 preempt_disable();
1561 t = per_cpu(rcu_cpu_kthread_task, cpu);
1562 if (!cpu_online(cpu) || t == NULL) {
1563 preempt_enable();
1564 continue;
1565 }
1566 per_cpu(rcu_cpu_has_work, cpu) = 1;
1567 sp.sched_priority = RCU_KTHREAD_PRIO;
1568 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1569 preempt_enable();
1570 } 1369 }
1571 } 1370 }
1572 /* NOTREACHED */ 1371 *statusp = RCU_KTHREAD_YIELDING;
1573 rnp->node_kthread_status = RCU_KTHREAD_STOPPED; 1372 trace_rcu_utilization("Start CPU kthread@rcu_yield");
1574 return 0; 1373 schedule_timeout_interruptible(2);
1374 trace_rcu_utilization("End CPU kthread@rcu_yield");
1375 *statusp = RCU_KTHREAD_WAITING;
1575} 1376}
1576 1377
1577/* 1378/*
@@ -1583,17 +1384,17 @@ static int rcu_node_kthread(void *arg)
1583 * no outgoing CPU. If there are no CPUs left in the affinity set, 1384 * no outgoing CPU. If there are no CPUs left in the affinity set,
1584 * this function allows the kthread to execute on any CPU. 1385 * this function allows the kthread to execute on any CPU.
1585 */ 1386 */
1586static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) 1387static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1587{ 1388{
1389 struct task_struct *t = rnp->boost_kthread_task;
1390 unsigned long mask = rnp->qsmaskinit;
1588 cpumask_var_t cm; 1391 cpumask_var_t cm;
1589 int cpu; 1392 int cpu;
1590 unsigned long mask = rnp->qsmaskinit;
1591 1393
1592 if (rnp->node_kthread_task == NULL) 1394 if (!t)
1593 return; 1395 return;
1594 if (!alloc_cpumask_var(&cm, GFP_KERNEL)) 1396 if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
1595 return; 1397 return;
1596 cpumask_clear(cm);
1597 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) 1398 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
1598 if ((mask & 0x1) && cpu != outgoingcpu) 1399 if ((mask & 0x1) && cpu != outgoingcpu)
1599 cpumask_set_cpu(cpu, cm); 1400 cpumask_set_cpu(cpu, cm);
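
The affinity helper now targets the boost kthread directly and builds its mask with zalloc_cpumask_var(), which returns an already-cleared mask and so replaces the separate cpumask_clear() call. A small hypothetical helper using the same pattern (demo_restrict_affinity and its parameters are invented for illustration):

#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/sched.h>

/* Bind a task to CPUs [lo, hi], skipping one CPU (e.g. an outgoing one). */
static void demo_restrict_affinity(struct task_struct *t, int lo, int hi,
				   int skip)
{
	cpumask_var_t cm;
	int cpu;

	if (!zalloc_cpumask_var(&cm, GFP_KERNEL))	/* allocated zeroed */
		return;
	for (cpu = lo; cpu <= hi; cpu++)
		if (cpu != skip)
			cpumask_set_cpu(cpu, cm);
	set_cpus_allowed_ptr(t, cm);
	free_cpumask_var(cm);
}
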
@@ -1603,62 +1404,36 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1603 cpumask_clear_cpu(cpu, cm); 1404 cpumask_clear_cpu(cpu, cm);
1604 WARN_ON_ONCE(cpumask_weight(cm) == 0); 1405 WARN_ON_ONCE(cpumask_weight(cm) == 0);
1605 } 1406 }
1606 set_cpus_allowed_ptr(rnp->node_kthread_task, cm); 1407 set_cpus_allowed_ptr(t, cm);
1607 rcu_boost_kthread_setaffinity(rnp, cm);
1608 free_cpumask_var(cm); 1408 free_cpumask_var(cm);
1609} 1409}
1610 1410
1611/* 1411static struct smp_hotplug_thread rcu_cpu_thread_spec = {
1612 * Spawn a per-rcu_node kthread, setting priority and affinity. 1412 .store = &rcu_cpu_kthread_task,
1613 * Called during boot before online/offline can happen, or, if 1413 .thread_should_run = rcu_cpu_kthread_should_run,
1614 * during runtime, with the main CPU-hotplug locks held. So only 1414 .thread_fn = rcu_cpu_kthread,
1615 * one of these can be executing at a time. 1415 .thread_comm = "rcuc/%u",
1616 */ 1416 .setup = rcu_cpu_kthread_setup,
1617static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, 1417 .park = rcu_cpu_kthread_park,
1618 struct rcu_node *rnp) 1418};
1619{
1620 unsigned long flags;
1621 int rnp_index = rnp - &rsp->node[0];
1622 struct sched_param sp;
1623 struct task_struct *t;
1624
1625 if (!rcu_scheduler_fully_active ||
1626 rnp->qsmaskinit == 0)
1627 return 0;
1628 if (rnp->node_kthread_task == NULL) {
1629 t = kthread_create(rcu_node_kthread, (void *)rnp,
1630 "rcun/%d", rnp_index);
1631 if (IS_ERR(t))
1632 return PTR_ERR(t);
1633 raw_spin_lock_irqsave(&rnp->lock, flags);
1634 rnp->node_kthread_task = t;
1635 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1636 sp.sched_priority = 99;
1637 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1638 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1639 }
1640 return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
1641}
1642 1419
1643/* 1420/*
1644 * Spawn all kthreads -- called as soon as the scheduler is running. 1421 * Spawn all kthreads -- called as soon as the scheduler is running.
1645 */ 1422 */
1646static int __init rcu_spawn_kthreads(void) 1423static int __init rcu_spawn_kthreads(void)
1647{ 1424{
1648 int cpu;
1649 struct rcu_node *rnp; 1425 struct rcu_node *rnp;
1426 int cpu;
1650 1427
1651 rcu_scheduler_fully_active = 1; 1428 rcu_scheduler_fully_active = 1;
1652 for_each_possible_cpu(cpu) { 1429 for_each_possible_cpu(cpu)
1653 per_cpu(rcu_cpu_has_work, cpu) = 0; 1430 per_cpu(rcu_cpu_has_work, cpu) = 0;
1654 if (cpu_online(cpu)) 1431 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
1655 (void)rcu_spawn_one_cpu_kthread(cpu);
1656 }
1657 rnp = rcu_get_root(rcu_state); 1432 rnp = rcu_get_root(rcu_state);
1658 (void)rcu_spawn_one_node_kthread(rcu_state, rnp); 1433 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
1659 if (NUM_RCU_NODES > 1) { 1434 if (NUM_RCU_NODES > 1) {
1660 rcu_for_each_leaf_node(rcu_state, rnp) 1435 rcu_for_each_leaf_node(rcu_state, rnp)
1661 (void)rcu_spawn_one_node_kthread(rcu_state, rnp); 1436 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
1662 } 1437 }
1663 return 0; 1438 return 0;
1664} 1439}
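
rcu_spawn_kthreads() no longer creates the per-CPU kthreads by hand; the per-CPU worker is handed to the generic smpboot infrastructure via rcu_cpu_thread_spec, which creates, binds, parks and unparks one thread per CPU across hotplug. A self-contained sketch of that API in a hypothetical module; all demo_* names are made up, only the smpboot calls and the fields used above are real:

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/smpboot.h>

static DEFINE_PER_CPU(struct task_struct *, demo_task);
static DEFINE_PER_CPU(int, demo_has_work);

static int demo_should_run(unsigned int cpu)
{
	return __this_cpu_read(demo_has_work);	/* run only when flagged */
}

static void demo_thread_fn(unsigned int cpu)
{
	__this_cpu_write(demo_has_work, 0);	/* consume the work flag */
	pr_info("demo worker ran on CPU %u\n", cpu);
}

static struct smp_hotplug_thread demo_threads = {
	.store			= &demo_task,
	.thread_should_run	= demo_should_run,
	.thread_fn		= demo_thread_fn,
	.thread_comm		= "demo/%u",
};

static int __init demo_init(void)
{
	/* Creates a "demo/N" kthread per CPU and manages it across hotplug. */
	return smpboot_register_percpu_thread(&demo_threads);
}
module_init(demo_init);

static void __exit demo_exit(void)
{
	smpboot_unregister_percpu_thread(&demo_threads);
}
module_exit(demo_exit);

MODULE_LICENSE("GPL");
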
@@ -1670,11 +1445,8 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
1670 struct rcu_node *rnp = rdp->mynode; 1445 struct rcu_node *rnp = rdp->mynode;
1671 1446
1672 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ 1447 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
1673 if (rcu_scheduler_fully_active) { 1448 if (rcu_scheduler_fully_active)
1674 (void)rcu_spawn_one_cpu_kthread(cpu); 1449 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
1675 if (rnp->node_kthread_task == NULL)
1676 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1677 }
1678} 1450}
1679 1451
1680#else /* #ifdef CONFIG_RCU_BOOST */ 1452#else /* #ifdef CONFIG_RCU_BOOST */
@@ -1698,19 +1470,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1698{ 1470{
1699} 1471}
1700 1472
1701#ifdef CONFIG_HOTPLUG_CPU 1473static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1702
1703static void rcu_stop_cpu_kthread(int cpu)
1704{
1705}
1706
1707#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1708
1709static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1710{
1711}
1712
1713static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1714{ 1474{
1715} 1475}
1716 1476
@@ -1997,6 +1757,26 @@ static void rcu_prepare_for_idle(int cpu)
1997 if (!tne) 1757 if (!tne)
1998 return; 1758 return;
1999 1759
1760 /* Adaptive-tick mode, where usermode execution is idle to RCU. */
1761 if (!is_idle_task(current)) {
1762 rdtp->dyntick_holdoff = jiffies - 1;
1763 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
1764 trace_rcu_prep_idle("User dyntick with callbacks");
1765 rdtp->idle_gp_timer_expires =
1766 round_up(jiffies + RCU_IDLE_GP_DELAY,
1767 RCU_IDLE_GP_DELAY);
1768 } else if (rcu_cpu_has_callbacks(cpu)) {
1769 rdtp->idle_gp_timer_expires =
1770 round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
1771 trace_rcu_prep_idle("User dyntick with lazy callbacks");
1772 } else {
1773 return;
1774 }
1775 tp = &rdtp->idle_gp_timer;
1776 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1777 return;
1778 }
1779
2000 /* 1780 /*
2001 * If this is an idle re-entry, for example, due to use of 1781 * If this is an idle re-entry, for example, due to use of
2002 * RCU_NONIDLE() or the new idle-loop tracing API within the idle 1782 * RCU_NONIDLE() or the new idle-loop tracing API within the idle
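
The expiry rounding in the new adaptive-tick path is presumably about batching wakeups rather than precision: round_up(jiffies + RCU_IDLE_GP_DELAY, RCU_IDLE_GP_DELAY) aligns the expiry to a multiple of the delay, so CPUs arming the timer in the same window compute the same expiration and their wakeups can coalesce. For instance, with a 4-jiffy delay, CPUs arming at jiffies 1001 through 1004 all get round_up(jiffies + 4, 4) = 1008. round_jiffies() plays the same role for the lazy case by nudging the expiry onto a whole-second boundary.
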
@@ -2075,16 +1855,16 @@ static void rcu_prepare_for_idle(int cpu)
2075#ifdef CONFIG_TREE_PREEMPT_RCU 1855#ifdef CONFIG_TREE_PREEMPT_RCU
2076 if (per_cpu(rcu_preempt_data, cpu).nxtlist) { 1856 if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
2077 rcu_preempt_qs(cpu); 1857 rcu_preempt_qs(cpu);
2078 force_quiescent_state(&rcu_preempt_state, 0); 1858 force_quiescent_state(&rcu_preempt_state);
2079 } 1859 }
2080#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 1860#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
2081 if (per_cpu(rcu_sched_data, cpu).nxtlist) { 1861 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
2082 rcu_sched_qs(cpu); 1862 rcu_sched_qs(cpu);
2083 force_quiescent_state(&rcu_sched_state, 0); 1863 force_quiescent_state(&rcu_sched_state);
2084 } 1864 }
2085 if (per_cpu(rcu_bh_data, cpu).nxtlist) { 1865 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
2086 rcu_bh_qs(cpu); 1866 rcu_bh_qs(cpu);
2087 force_quiescent_state(&rcu_bh_state, 0); 1867 force_quiescent_state(&rcu_bh_state);
2088 } 1868 }
2089 1869
2090 /* 1870 /*
@@ -2112,6 +1892,88 @@ static void rcu_idle_count_callbacks_posted(void)
2112 __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); 1892 __this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
2113} 1893}
2114 1894
1895/*
1896 * Data for flushing lazy RCU callbacks at OOM time.
1897 */
1898static atomic_t oom_callback_count;
1899static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq);
1900
1901/*
1902 * RCU OOM callback -- decrement the outstanding count and deliver the
1903 * wake-up if we are the last one.
1904 */
1905static void rcu_oom_callback(struct rcu_head *rhp)
1906{
1907 if (atomic_dec_and_test(&oom_callback_count))
1908 wake_up(&oom_callback_wq);
1909}
1910
1911/*
1912 * Post an rcu_oom_notify callback on the current CPU if it has at
1913 * least one lazy callback. This will unnecessarily post callbacks
1914 * to CPUs that already have a non-lazy callback at the end of their
1915 * callback list, but this is an infrequent operation, so accept some
1916 * extra overhead to keep things simple.
1917 */
1918static void rcu_oom_notify_cpu(void *unused)
1919{
1920 struct rcu_state *rsp;
1921 struct rcu_data *rdp;
1922
1923 for_each_rcu_flavor(rsp) {
1924 rdp = __this_cpu_ptr(rsp->rda);
1925 if (rdp->qlen_lazy != 0) {
1926 atomic_inc(&oom_callback_count);
1927 rsp->call(&rdp->oom_head, rcu_oom_callback);
1928 }
1929 }
1930}
1931
1932/*
1933 * If low on memory, ensure that each CPU has a non-lazy callback.
1934 * This will wake up CPUs that have only lazy callbacks, in turn
1935 * ensuring that they free up the corresponding memory in a timely manner.
1936 * Because an uncertain amount of memory will be freed in some uncertain
1937 * timeframe, we do not claim to have freed anything.
1938 */
1939static int rcu_oom_notify(struct notifier_block *self,
1940 unsigned long notused, void *nfreed)
1941{
1942 int cpu;
1943
1944 /* Wait for callbacks from earlier instance to complete. */
1945 wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
1946
1947 /*
1948 * Prevent premature wakeup: ensure that all increments happen
1949 * before there is a chance of the counter reaching zero.
1950 */
1951 atomic_set(&oom_callback_count, 1);
1952
1953 get_online_cpus();
1954 for_each_online_cpu(cpu) {
1955 smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
1956 cond_resched();
1957 }
1958 put_online_cpus();
1959
1960 /* Unconditionally decrement: no need to wake ourselves up. */
1961 atomic_dec(&oom_callback_count);
1962
1963 return NOTIFY_OK;
1964}
1965
1966static struct notifier_block rcu_oom_nb = {
1967 .notifier_call = rcu_oom_notify
1968};
1969
1970static int __init rcu_register_oom_notifier(void)
1971{
1972 register_oom_notifier(&rcu_oom_nb);
1973 return 0;
1974}
1975early_initcall(rcu_register_oom_notifier);
1976
2115#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1977#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
2116 1978
2117#ifdef CONFIG_RCU_CPU_STALL_INFO 1979#ifdef CONFIG_RCU_CPU_STALL_INFO
@@ -2122,11 +1984,15 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2122{ 1984{
2123 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1985 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2124 struct timer_list *tltp = &rdtp->idle_gp_timer; 1986 struct timer_list *tltp = &rdtp->idle_gp_timer;
1987 char c;
2125 1988
2126 sprintf(cp, "drain=%d %c timer=%lu", 1989 c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.';
2127 rdtp->dyntick_drain, 1990 if (timer_pending(tltp))
2128 rdtp->dyntick_holdoff == jiffies ? 'H' : '.', 1991 sprintf(cp, "drain=%d %c timer=%lu",
2129 timer_pending(tltp) ? tltp->expires - jiffies : -1); 1992 rdtp->dyntick_drain, c, tltp->expires - jiffies);
1993 else
1994 sprintf(cp, "drain=%d %c timer not pending",
1995 rdtp->dyntick_drain, c);
2130} 1996}
2131 1997
2132#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 1998#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
@@ -2194,11 +2060,10 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp)
2194/* Increment ->ticks_this_gp for all flavors of RCU. */ 2060/* Increment ->ticks_this_gp for all flavors of RCU. */
2195static void increment_cpu_stall_ticks(void) 2061static void increment_cpu_stall_ticks(void)
2196{ 2062{
2197 __get_cpu_var(rcu_sched_data).ticks_this_gp++; 2063 struct rcu_state *rsp;
2198 __get_cpu_var(rcu_bh_data).ticks_this_gp++; 2064
2199#ifdef CONFIG_TREE_PREEMPT_RCU 2065 for_each_rcu_flavor(rsp)
2200 __get_cpu_var(rcu_preempt_data).ticks_this_gp++; 2066 __this_cpu_ptr(rsp->rda)->ticks_this_gp++;
2201#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
2202} 2067}
2203 2068
2204#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ 2069#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
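The OOM hook added above posts one extra non-lazy callback per CPU that has only lazy callbacks queued, then waits for all of them with a counter-plus-waitqueue handshake: the count is primed to 1 so it cannot cross zero while callbacks are still being posted, and the last callback's decrement delivers the wake-up. A minimal, self-contained sketch of that handshake (all example_* names are illustrative, not from the patch):

#include <linux/atomic.h>
#include <linux/rcupdate.h>
#include <linux/wait.h>

static atomic_t example_cb_count;
static DECLARE_WAIT_QUEUE_HEAD(example_cb_wq);

static void example_cb(struct rcu_head *rhp)
{
	/* Last callback to finish delivers the wake-up. */
	if (atomic_dec_and_test(&example_cb_count))
		wake_up(&example_cb_wq);
}

static void example_flush(struct rcu_head *heads, int n)
{
	int i;

	/* Hold-off value of 1 prevents a premature zero crossing. */
	atomic_set(&example_cb_count, 1);
	for (i = 0; i < n; i++) {
		atomic_inc(&example_cb_count);
		call_rcu(&heads[i], example_cb);
	}
	atomic_dec(&example_cb_count);	/* Drop the hold-off. */
	wait_event(example_cb_wq, atomic_read(&example_cb_count) == 0);
}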
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index abffb486e94e..693513bc50e6 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -51,8 +51,8 @@ static int show_rcubarrier(struct seq_file *m, void *unused)
51 struct rcu_state *rsp; 51 struct rcu_state *rsp;
52 52
53 for_each_rcu_flavor(rsp) 53 for_each_rcu_flavor(rsp)
54 seq_printf(m, "%s: %c bcc: %d nbd: %lu\n", 54 seq_printf(m, "%s: bcc: %d nbd: %lu\n",
55 rsp->name, rsp->rcu_barrier_in_progress ? 'B' : '.', 55 rsp->name,
56 atomic_read(&rsp->barrier_cpu_count), 56 atomic_read(&rsp->barrier_cpu_count),
57 rsp->n_barrier_done); 57 rsp->n_barrier_done);
58 return 0; 58 return 0;
@@ -86,12 +86,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
86{ 86{
87 if (!rdp->beenonline) 87 if (!rdp->beenonline)
88 return; 88 return;
89 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pgp=%lu qp=%d", 89 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d qp=%d",
90 rdp->cpu, 90 rdp->cpu,
91 cpu_is_offline(rdp->cpu) ? '!' : ' ', 91 cpu_is_offline(rdp->cpu) ? '!' : ' ',
92 rdp->completed, rdp->gpnum, 92 rdp->completed, rdp->gpnum,
93 rdp->passed_quiesce, rdp->passed_quiesce_gpnum, 93 rdp->passed_quiesce, rdp->qs_pending);
94 rdp->qs_pending);
95 seq_printf(m, " dt=%d/%llx/%d df=%lu", 94 seq_printf(m, " dt=%d/%llx/%d df=%lu",
96 atomic_read(&rdp->dynticks->dynticks), 95 atomic_read(&rdp->dynticks->dynticks),
97 rdp->dynticks->dynticks_nesting, 96 rdp->dynticks->dynticks_nesting,
@@ -108,11 +107,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
108 rdp->nxttail[RCU_WAIT_TAIL]], 107 rdp->nxttail[RCU_WAIT_TAIL]],
109 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); 108 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
110#ifdef CONFIG_RCU_BOOST 109#ifdef CONFIG_RCU_BOOST
111 seq_printf(m, " kt=%d/%c/%d ktl=%x", 110 seq_printf(m, " kt=%d/%c ktl=%x",
112 per_cpu(rcu_cpu_has_work, rdp->cpu), 111 per_cpu(rcu_cpu_has_work, rdp->cpu),
113 convert_kthread_status(per_cpu(rcu_cpu_kthread_status, 112 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
114 rdp->cpu)), 113 rdp->cpu)),
115 per_cpu(rcu_cpu_kthread_cpu, rdp->cpu),
116 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); 114 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
117#endif /* #ifdef CONFIG_RCU_BOOST */ 115#endif /* #ifdef CONFIG_RCU_BOOST */
118 seq_printf(m, " b=%ld", rdp->blimit); 116 seq_printf(m, " b=%ld", rdp->blimit);
@@ -150,12 +148,11 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
150{ 148{
151 if (!rdp->beenonline) 149 if (!rdp->beenonline)
152 return; 150 return;
153 seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d", 151 seq_printf(m, "%d,%s,%lu,%lu,%d,%d",
154 rdp->cpu, 152 rdp->cpu,
155 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", 153 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
156 rdp->completed, rdp->gpnum, 154 rdp->completed, rdp->gpnum,
157 rdp->passed_quiesce, rdp->passed_quiesce_gpnum, 155 rdp->passed_quiesce, rdp->qs_pending);
158 rdp->qs_pending);
159 seq_printf(m, ",%d,%llx,%d,%lu", 156 seq_printf(m, ",%d,%llx,%d,%lu",
160 atomic_read(&rdp->dynticks->dynticks), 157 atomic_read(&rdp->dynticks->dynticks),
161 rdp->dynticks->dynticks_nesting, 158 rdp->dynticks->dynticks_nesting,
@@ -186,7 +183,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
186 int cpu; 183 int cpu;
187 struct rcu_state *rsp; 184 struct rcu_state *rsp;
188 185
189 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); 186 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pq\",");
190 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 187 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
191 seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); 188 seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\"");
192#ifdef CONFIG_RCU_BOOST 189#ifdef CONFIG_RCU_BOOST
@@ -386,10 +383,9 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
386 rdp->n_rp_report_qs, 383 rdp->n_rp_report_qs,
387 rdp->n_rp_cb_ready, 384 rdp->n_rp_cb_ready,
388 rdp->n_rp_cpu_needs_gp); 385 rdp->n_rp_cpu_needs_gp);
389 seq_printf(m, "gpc=%ld gps=%ld nf=%ld nn=%ld\n", 386 seq_printf(m, "gpc=%ld gps=%ld nn=%ld\n",
390 rdp->n_rp_gp_completed, 387 rdp->n_rp_gp_completed,
391 rdp->n_rp_gp_started, 388 rdp->n_rp_gp_started,
392 rdp->n_rp_need_fqs,
393 rdp->n_rp_need_nothing); 389 rdp->n_rp_need_nothing);
394} 390}
395 391
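Both this file and the stall-info code above replace hand-unrolled rcu_sched/rcu_bh/rcu_preempt cases with the for_each_rcu_flavor() iterator. A short illustrative sketch of that pattern, using only fields that appear in this hunk (the function name is made up):

#include <linux/kernel.h>

static void example_dump_barrier_counts(void)
{
	struct rcu_state *rsp;

	/* One loop covers every built-in RCU flavor. */
	for_each_rcu_flavor(rsp)
		pr_info("%s: bcc=%d nbd=%lu\n",
			rsp->name,
			atomic_read(&rsp->barrier_cpu_count),
			rsp->n_barrier_done);
}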
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 649c9f876cb1..3c4dec0594d6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2081,6 +2081,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2081#endif 2081#endif
2082 2082
2083 /* Here we just switch the register state and the stack. */ 2083 /* Here we just switch the register state and the stack. */
2084 rcu_switch(prev, next);
2084 switch_to(prev, next, prev); 2085 switch_to(prev, next, prev);
2085 2086
2086 barrier(); 2087 barrier();
@@ -3468,6 +3469,21 @@ asmlinkage void __sched schedule(void)
3468} 3469}
3469EXPORT_SYMBOL(schedule); 3470EXPORT_SYMBOL(schedule);
3470 3471
3472#ifdef CONFIG_RCU_USER_QS
3473asmlinkage void __sched schedule_user(void)
3474{
3475 /*
3476 * If we come here after a random call to set_need_resched(),
3477 * or we have been woken up remotely but the IPI has not yet arrived,
3478 * we haven't yet exited the RCU idle mode. Do it here manually until
3479 * we find a better solution.
3480 */
3481 rcu_user_exit();
3482 schedule();
3483 rcu_user_enter();
3484}
3485#endif
3486
3471/** 3487/**
3472 * schedule_preempt_disabled - called with preemption disabled 3488 * schedule_preempt_disabled - called with preemption disabled
3473 * 3489 *
@@ -3569,6 +3585,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
3569 /* Catch callers which need to be fixed */ 3585 /* Catch callers which need to be fixed */
3570 BUG_ON(ti->preempt_count || !irqs_disabled()); 3586 BUG_ON(ti->preempt_count || !irqs_disabled());
3571 3587
3588 rcu_user_exit();
3572 do { 3589 do {
3573 add_preempt_count(PREEMPT_ACTIVE); 3590 add_preempt_count(PREEMPT_ACTIVE);
3574 local_irq_enable(); 3591 local_irq_enable();
@@ -5604,7 +5621,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5604 migrate_tasks(cpu); 5621 migrate_tasks(cpu);
5605 BUG_ON(rq->nr_running != 1); /* the migration thread */ 5622 BUG_ON(rq->nr_running != 1); /* the migration thread */
5606 raw_spin_unlock_irqrestore(&rq->lock, flags); 5623 raw_spin_unlock_irqrestore(&rq->lock, flags);
5624 break;
5607 5625
5626 case CPU_DEAD:
5608 calc_load_migrate(rq); 5627 calc_load_migrate(rq);
5609 break; 5628 break;
5610#endif 5629#endif
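schedule_user() and the rcu_user_exit() added to preempt_schedule_irq() follow one rule: scheduler work done on behalf of a task that RCU currently considers to be in its userspace extended quiescent state must be bracketed by rcu_user_exit() and rcu_user_enter(). An illustrative sketch of that bracketing around some other piece of kernel work (example_* is a placeholder, not an API from the patch):

#ifdef CONFIG_RCU_USER_QS
static void example_do_work_for_user_task(void)
{
	rcu_user_exit();	/* RCU may now see read-side critical sections. */
	/* ... kernel work that may contain rcu_read_lock()/unlock() ... */
	rcu_user_enter();	/* Back to the userspace extended QS. */
}
#endif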
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 98f60c5caa1b..d6c5fc054242 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -1,14 +1,22 @@
1/* 1/*
2 * Common SMP CPU bringup/teardown functions 2 * Common SMP CPU bringup/teardown functions
3 */ 3 */
4#include <linux/cpu.h>
4#include <linux/err.h> 5#include <linux/err.h>
5#include <linux/smp.h> 6#include <linux/smp.h>
6#include <linux/init.h> 7#include <linux/init.h>
8#include <linux/list.h>
9#include <linux/slab.h>
7#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/export.h>
8#include <linux/percpu.h> 12#include <linux/percpu.h>
13#include <linux/kthread.h>
14#include <linux/smpboot.h>
9 15
10#include "smpboot.h" 16#include "smpboot.h"
11 17
18#ifdef CONFIG_SMP
19
12#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD 20#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
13/* 21/*
14 * For the hotplug case we keep the task structs around and reuse 22 * For the hotplug case we keep the task structs around and reuse
@@ -65,3 +73,228 @@ void __init idle_threads_init(void)
65 } 73 }
66} 74}
67#endif 75#endif
76
77#endif /* #ifdef CONFIG_SMP */
78
79static LIST_HEAD(hotplug_threads);
80static DEFINE_MUTEX(smpboot_threads_lock);
81
82struct smpboot_thread_data {
83 unsigned int cpu;
84 unsigned int status;
85 struct smp_hotplug_thread *ht;
86};
87
88enum {
89 HP_THREAD_NONE = 0,
90 HP_THREAD_ACTIVE,
91 HP_THREAD_PARKED,
92};
93
94/**
95 * smpboot_thread_fn - percpu hotplug thread loop function
96 * @data: thread data pointer
97 *
98 * Checks for thread stop and park conditions. Calls the necessary
99 * setup, cleanup, park and unpark functions for the registered
100 * thread.
101 *
102 * Returns 1 when the thread should exit, 0 otherwise.
103 */
104static int smpboot_thread_fn(void *data)
105{
106 struct smpboot_thread_data *td = data;
107 struct smp_hotplug_thread *ht = td->ht;
108
109 while (1) {
110 set_current_state(TASK_INTERRUPTIBLE);
111 preempt_disable();
112 if (kthread_should_stop()) {
113 set_current_state(TASK_RUNNING);
114 preempt_enable();
115 if (ht->cleanup)
116 ht->cleanup(td->cpu, cpu_online(td->cpu));
117 kfree(td);
118 return 0;
119 }
120
121 if (kthread_should_park()) {
122 __set_current_state(TASK_RUNNING);
123 preempt_enable();
124 if (ht->park && td->status == HP_THREAD_ACTIVE) {
125 BUG_ON(td->cpu != smp_processor_id());
126 ht->park(td->cpu);
127 td->status = HP_THREAD_PARKED;
128 }
129 kthread_parkme();
130 /* We might have been woken for stop */
131 continue;
132 }
133
134 BUG_ON(td->cpu != smp_processor_id());
135
136 /* Check for state change setup */
137 switch (td->status) {
138 case HP_THREAD_NONE:
139 preempt_enable();
140 if (ht->setup)
141 ht->setup(td->cpu);
142 td->status = HP_THREAD_ACTIVE;
143 preempt_disable();
144 break;
145 case HP_THREAD_PARKED:
146 preempt_enable();
147 if (ht->unpark)
148 ht->unpark(td->cpu);
149 td->status = HP_THREAD_ACTIVE;
150 preempt_disable();
151 break;
152 }
153
154 if (!ht->thread_should_run(td->cpu)) {
155 preempt_enable();
156 schedule();
157 } else {
158 set_current_state(TASK_RUNNING);
159 preempt_enable();
160 ht->thread_fn(td->cpu);
161 }
162 }
163}
164
165static int
166__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
167{
168 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
169 struct smpboot_thread_data *td;
170
171 if (tsk)
172 return 0;
173
174 td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
175 if (!td)
176 return -ENOMEM;
177 td->cpu = cpu;
178 td->ht = ht;
179
180 tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu,
181 ht->thread_comm);
182 if (IS_ERR(tsk)) {
183 kfree(td);
184 return PTR_ERR(tsk);
185 }
186
187 get_task_struct(tsk);
188 *per_cpu_ptr(ht->store, cpu) = tsk;
189 return 0;
190}
191
192int smpboot_create_threads(unsigned int cpu)
193{
194 struct smp_hotplug_thread *cur;
195 int ret = 0;
196
197 mutex_lock(&smpboot_threads_lock);
198 list_for_each_entry(cur, &hotplug_threads, list) {
199 ret = __smpboot_create_thread(cur, cpu);
200 if (ret)
201 break;
202 }
203 mutex_unlock(&smpboot_threads_lock);
204 return ret;
205}
206
207static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
208{
209 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
210
211 kthread_unpark(tsk);
212}
213
214void smpboot_unpark_threads(unsigned int cpu)
215{
216 struct smp_hotplug_thread *cur;
217
218 mutex_lock(&smpboot_threads_lock);
219 list_for_each_entry(cur, &hotplug_threads, list)
220 smpboot_unpark_thread(cur, cpu);
221 mutex_unlock(&smpboot_threads_lock);
222}
223
224static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
225{
226 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
227
228 if (tsk)
229 kthread_park(tsk);
230}
231
232void smpboot_park_threads(unsigned int cpu)
233{
234 struct smp_hotplug_thread *cur;
235
236 mutex_lock(&smpboot_threads_lock);
237 list_for_each_entry_reverse(cur, &hotplug_threads, list)
238 smpboot_park_thread(cur, cpu);
239 mutex_unlock(&smpboot_threads_lock);
240}
241
242static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
243{
244 unsigned int cpu;
245
246 /* We need to destroy also the parked threads of offline cpus */
247 for_each_possible_cpu(cpu) {
248 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
249
250 if (tsk) {
251 kthread_stop(tsk);
252 put_task_struct(tsk);
253 *per_cpu_ptr(ht->store, cpu) = NULL;
254 }
255 }
256}
257
258/**
259 * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug
260 * @plug_thread: Hotplug thread descriptor
261 *
262 * Creates and starts the threads on all online cpus.
263 */
264int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
265{
266 unsigned int cpu;
267 int ret = 0;
268
269 mutex_lock(&smpboot_threads_lock);
270 for_each_online_cpu(cpu) {
271 ret = __smpboot_create_thread(plug_thread, cpu);
272 if (ret) {
273 smpboot_destroy_threads(plug_thread);
274 goto out;
275 }
276 smpboot_unpark_thread(plug_thread, cpu);
277 }
278 list_add(&plug_thread->list, &hotplug_threads);
279out:
280 mutex_unlock(&smpboot_threads_lock);
281 return ret;
282}
283EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
284
285/**
286 * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
287 * @plug_thread: Hotplug thread descriptor
288 *
289 * Stops all threads on all possible cpus.
290 */
291void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
292{
293 get_online_cpus();
294 mutex_lock(&smpboot_threads_lock);
295 list_del(&plug_thread->list);
296 smpboot_destroy_threads(plug_thread);
297 mutex_unlock(&smpboot_threads_lock);
298 put_online_cpus();
299}
300EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
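The two conversions later in this series (ksoftirqd and the softlockup watchdog) are clients of this new facility. A minimal client looks roughly like the sketch below; every example_* identifier is illustrative, while the struct fields and registration call are the ones defined above:

#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/smpboot.h>

static DEFINE_PER_CPU(struct task_struct *, example_thread);

static int example_should_run(unsigned int cpu)
{
	return 0;	/* Nonzero when this CPU has pending work. */
}

static void example_thread_fn(unsigned int cpu)
{
	/* Handle the pending work; invoked with preemption enabled. */
}

static struct smp_hotplug_thread example_threads = {
	.store			= &example_thread,
	.thread_should_run	= example_should_run,
	.thread_fn		= example_thread_fn,
	.thread_comm		= "example/%u",
};

static int __init example_threads_init(void)
{
	return smpboot_register_percpu_thread(&example_threads);
}
early_initcall(example_threads_init);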
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
index 6ef9433e1c70..72415a0eb955 100644
--- a/kernel/smpboot.h
+++ b/kernel/smpboot.h
@@ -13,4 +13,8 @@ static inline void idle_thread_set_boot_cpu(void) { }
13static inline void idle_threads_init(void) { } 13static inline void idle_threads_init(void) { }
14#endif 14#endif
15 15
16int smpboot_create_threads(unsigned int cpu);
17void smpboot_park_threads(unsigned int cpu);
18void smpboot_unpark_threads(unsigned int cpu);
19
16#endif 20#endif
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b73e681df09e..5c6a5bd8462f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -23,6 +23,7 @@
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/ftrace.h> 24#include <linux/ftrace.h>
25#include <linux/smp.h> 25#include <linux/smp.h>
26#include <linux/smpboot.h>
26#include <linux/tick.h> 27#include <linux/tick.h>
27 28
28#define CREATE_TRACE_POINTS 29#define CREATE_TRACE_POINTS
@@ -742,49 +743,22 @@ void __init softirq_init(void)
742 open_softirq(HI_SOFTIRQ, tasklet_hi_action); 743 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
743} 744}
744 745
745static int run_ksoftirqd(void * __bind_cpu) 746static int ksoftirqd_should_run(unsigned int cpu)
746{ 747{
747 set_current_state(TASK_INTERRUPTIBLE); 748 return local_softirq_pending();
748 749}
749 while (!kthread_should_stop()) {
750 preempt_disable();
751 if (!local_softirq_pending()) {
752 schedule_preempt_disabled();
753 }
754
755 __set_current_state(TASK_RUNNING);
756
757 while (local_softirq_pending()) {
758 /* Preempt disable stops cpu going offline.
759 If already offline, we'll be on wrong CPU:
760 don't process */
761 if (cpu_is_offline((long)__bind_cpu))
762 goto wait_to_die;
763 local_irq_disable();
764 if (local_softirq_pending())
765 __do_softirq();
766 local_irq_enable();
767 sched_preempt_enable_no_resched();
768 cond_resched();
769 preempt_disable();
770 rcu_note_context_switch((long)__bind_cpu);
771 }
772 preempt_enable();
773 set_current_state(TASK_INTERRUPTIBLE);
774 }
775 __set_current_state(TASK_RUNNING);
776 return 0;
777 750
778wait_to_die: 751static void run_ksoftirqd(unsigned int cpu)
779 preempt_enable(); 752{
780 /* Wait for kthread_stop */ 753 local_irq_disable();
781 set_current_state(TASK_INTERRUPTIBLE); 754 if (local_softirq_pending()) {
782 while (!kthread_should_stop()) { 755 __do_softirq();
783 schedule(); 756 rcu_note_context_switch(cpu);
784 set_current_state(TASK_INTERRUPTIBLE); 757 local_irq_enable();
758 cond_resched();
759 return;
785 } 760 }
786 __set_current_state(TASK_RUNNING); 761 local_irq_enable();
787 return 0;
788} 762}
789 763
790#ifdef CONFIG_HOTPLUG_CPU 764#ifdef CONFIG_HOTPLUG_CPU
@@ -850,50 +824,14 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
850 unsigned long action, 824 unsigned long action,
851 void *hcpu) 825 void *hcpu)
852{ 826{
853 int hotcpu = (unsigned long)hcpu;
854 struct task_struct *p;
855
856 switch (action) { 827 switch (action) {
857 case CPU_UP_PREPARE:
858 case CPU_UP_PREPARE_FROZEN:
859 p = kthread_create_on_node(run_ksoftirqd,
860 hcpu,
861 cpu_to_node(hotcpu),
862 "ksoftirqd/%d", hotcpu);
863 if (IS_ERR(p)) {
864 printk("ksoftirqd for %i failed\n", hotcpu);
865 return notifier_from_errno(PTR_ERR(p));
866 }
867 kthread_bind(p, hotcpu);
868 per_cpu(ksoftirqd, hotcpu) = p;
869 break;
870 case CPU_ONLINE:
871 case CPU_ONLINE_FROZEN:
872 wake_up_process(per_cpu(ksoftirqd, hotcpu));
873 break;
874#ifdef CONFIG_HOTPLUG_CPU 828#ifdef CONFIG_HOTPLUG_CPU
875 case CPU_UP_CANCELED:
876 case CPU_UP_CANCELED_FROZEN:
877 if (!per_cpu(ksoftirqd, hotcpu))
878 break;
879 /* Unbind so it can run. Fall thru. */
880 kthread_bind(per_cpu(ksoftirqd, hotcpu),
881 cpumask_any(cpu_online_mask));
882 case CPU_DEAD: 829 case CPU_DEAD:
883 case CPU_DEAD_FROZEN: { 830 case CPU_DEAD_FROZEN:
884 static const struct sched_param param = { 831 takeover_tasklets((unsigned long)hcpu);
885 .sched_priority = MAX_RT_PRIO-1
886 };
887
888 p = per_cpu(ksoftirqd, hotcpu);
889 per_cpu(ksoftirqd, hotcpu) = NULL;
890 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
891 kthread_stop(p);
892 takeover_tasklets(hotcpu);
893 break; 832 break;
894 }
895#endif /* CONFIG_HOTPLUG_CPU */ 833#endif /* CONFIG_HOTPLUG_CPU */
896 } 834 }
897 return NOTIFY_OK; 835 return NOTIFY_OK;
898} 836}
899 837
@@ -901,14 +839,19 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
901 .notifier_call = cpu_callback 839 .notifier_call = cpu_callback
902}; 840};
903 841
842static struct smp_hotplug_thread softirq_threads = {
843 .store = &ksoftirqd,
844 .thread_should_run = ksoftirqd_should_run,
845 .thread_fn = run_ksoftirqd,
846 .thread_comm = "ksoftirqd/%u",
847};
848
904static __init int spawn_ksoftirqd(void) 849static __init int spawn_ksoftirqd(void)
905{ 850{
906 void *cpu = (void *)(long)smp_processor_id();
907 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
908
909 BUG_ON(err != NOTIFY_OK);
910 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
911 register_cpu_notifier(&cpu_nfb); 851 register_cpu_notifier(&cpu_nfb);
852
853 BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
854
912 return 0; 855 return 0;
913} 856}
914early_initcall(spawn_ksoftirqd); 857early_initcall(spawn_ksoftirqd);
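The .store field above hands smpboot the per-CPU ksoftirqd task pointer that softirq.c already maintains, so existing wakeups keep working unchanged. A condensed, illustrative sketch of that lookup-and-wake use of the store (simplified; the real wakeup helper also checks the task state before waking):

static void example_wake_this_cpu_ksoftirqd(void)
{
	struct task_struct *tsk = __this_cpu_read(ksoftirqd);

	if (tsk)
		wake_up_process(tsk);
}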
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3a9e5d5c1091..cf5f6b262673 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -436,7 +436,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
436 if (unlikely(local_softirq_pending() && cpu_online(cpu))) { 436 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
437 static int ratelimit; 437 static int ratelimit;
438 438
439 if (ratelimit < 10) { 439 if (ratelimit < 10 &&
440 (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
440 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", 441 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
441 (unsigned int) local_softirq_pending()); 442 (unsigned int) local_softirq_pending());
442 ratelimit++; 443 ratelimit++;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 4b1dfba70f7c..9d4c8d5a1f53 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -22,6 +22,7 @@
22#include <linux/notifier.h> 22#include <linux/notifier.h>
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/sysctl.h> 24#include <linux/sysctl.h>
25#include <linux/smpboot.h>
25 26
26#include <asm/irq_regs.h> 27#include <asm/irq_regs.h>
27#include <linux/kvm_para.h> 28#include <linux/kvm_para.h>
@@ -29,16 +30,18 @@
29 30
30int watchdog_enabled = 1; 31int watchdog_enabled = 1;
31int __read_mostly watchdog_thresh = 10; 32int __read_mostly watchdog_thresh = 10;
33static int __read_mostly watchdog_disabled;
32 34
33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 35static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
34static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); 36static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
35static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); 37static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
36static DEFINE_PER_CPU(bool, softlockup_touch_sync); 38static DEFINE_PER_CPU(bool, softlockup_touch_sync);
37static DEFINE_PER_CPU(bool, soft_watchdog_warn); 39static DEFINE_PER_CPU(bool, soft_watchdog_warn);
40static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
41static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
38#ifdef CONFIG_HARDLOCKUP_DETECTOR 42#ifdef CONFIG_HARDLOCKUP_DETECTOR
39static DEFINE_PER_CPU(bool, hard_watchdog_warn); 43static DEFINE_PER_CPU(bool, hard_watchdog_warn);
40static DEFINE_PER_CPU(bool, watchdog_nmi_touch); 44static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
41static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
42static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); 45static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 46static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
44#endif 47#endif
@@ -248,13 +251,15 @@ static void watchdog_overflow_callback(struct perf_event *event,
248 __this_cpu_write(hard_watchdog_warn, false); 251 __this_cpu_write(hard_watchdog_warn, false);
249 return; 252 return;
250} 253}
254#endif /* CONFIG_HARDLOCKUP_DETECTOR */
255
251static void watchdog_interrupt_count(void) 256static void watchdog_interrupt_count(void)
252{ 257{
253 __this_cpu_inc(hrtimer_interrupts); 258 __this_cpu_inc(hrtimer_interrupts);
254} 259}
255#else 260
256static inline void watchdog_interrupt_count(void) { return; } 261static int watchdog_nmi_enable(unsigned int cpu);
257#endif /* CONFIG_HARDLOCKUP_DETECTOR */ 262static void watchdog_nmi_disable(unsigned int cpu);
258 263
259/* watchdog kicker functions */ 264/* watchdog kicker functions */
260static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) 265static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
@@ -327,49 +332,68 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
327 return HRTIMER_RESTART; 332 return HRTIMER_RESTART;
328} 333}
329 334
335static void watchdog_set_prio(unsigned int policy, unsigned int prio)
336{
337 struct sched_param param = { .sched_priority = prio };
330 338
331/* 339 sched_setscheduler(current, policy, &param);
332 * The watchdog thread - touches the timestamp. 340}
333 */ 341
334static int watchdog(void *unused) 342static void watchdog_enable(unsigned int cpu)
335{ 343{
336 struct sched_param param = { .sched_priority = 0 };
337 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 344 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
338 345
339 /* initialize timestamp */ 346 if (!watchdog_enabled) {
340 __touch_watchdog(); 347 kthread_park(current);
348 return;
349 }
350
351 /* Enable the perf event */
352 watchdog_nmi_enable(cpu);
341 353
342 /* kick off the timer for the hardlockup detector */ 354 /* kick off the timer for the hardlockup detector */
355 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
356 hrtimer->function = watchdog_timer_fn;
357
343 /* done here because hrtimer_start can only pin to smp_processor_id() */ 358 /* done here because hrtimer_start can only pin to smp_processor_id() */
344 hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), 359 hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
345 HRTIMER_MODE_REL_PINNED); 360 HRTIMER_MODE_REL_PINNED);
346 361
347 set_current_state(TASK_INTERRUPTIBLE); 362 /* initialize timestamp */
348 /* 363 watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
349 * Run briefly (kicked by the hrtimer callback function) once every 364 __touch_watchdog();
350 * get_sample_period() seconds (4 seconds by default) to reset the 365}
351 * softlockup timestamp. If this gets delayed for more than
352 * 2*watchdog_thresh seconds then the debug-printout triggers in
353 * watchdog_timer_fn().
354 */
355 while (!kthread_should_stop()) {
356 __touch_watchdog();
357 schedule();
358 366
359 if (kthread_should_stop()) 367static void watchdog_disable(unsigned int cpu)
360 break; 368{
369 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
361 370
362 set_current_state(TASK_INTERRUPTIBLE); 371 watchdog_set_prio(SCHED_NORMAL, 0);
363 } 372 hrtimer_cancel(hrtimer);
364 /* 373 /* disable the perf event */
365 * Drop the policy/priority elevation during thread exit to avoid a 374 watchdog_nmi_disable(cpu);
366 * scheduling latency spike.
367 */
368 __set_current_state(TASK_RUNNING);
369 sched_setscheduler(current, SCHED_NORMAL, &param);
370 return 0;
371} 375}
372 376
377static int watchdog_should_run(unsigned int cpu)
378{
379 return __this_cpu_read(hrtimer_interrupts) !=
380 __this_cpu_read(soft_lockup_hrtimer_cnt);
381}
382
383/*
384 * The watchdog thread function - touches the timestamp.
385 *
386 * It only runs once every get_sample_period() seconds (4 seconds by
387 * default) to reset the softlockup timestamp. If this gets delayed
388 * for more than 2*watchdog_thresh seconds then the debug-printout
389 * triggers in watchdog_timer_fn().
390 */
391static void watchdog(unsigned int cpu)
392{
393 __this_cpu_write(soft_lockup_hrtimer_cnt,
394 __this_cpu_read(hrtimer_interrupts));
395 __touch_watchdog();
396}
373 397
374#ifdef CONFIG_HARDLOCKUP_DETECTOR 398#ifdef CONFIG_HARDLOCKUP_DETECTOR
375/* 399/*
@@ -379,7 +403,7 @@ static int watchdog(void *unused)
379 */ 403 */
380static unsigned long cpu0_err; 404static unsigned long cpu0_err;
381 405
382static int watchdog_nmi_enable(int cpu) 406static int watchdog_nmi_enable(unsigned int cpu)
383{ 407{
384 struct perf_event_attr *wd_attr; 408 struct perf_event_attr *wd_attr;
385 struct perf_event *event = per_cpu(watchdog_ev, cpu); 409 struct perf_event *event = per_cpu(watchdog_ev, cpu);
@@ -433,7 +457,7 @@ out:
433 return 0; 457 return 0;
434} 458}
435 459
436static void watchdog_nmi_disable(int cpu) 460static void watchdog_nmi_disable(unsigned int cpu)
437{ 461{
438 struct perf_event *event = per_cpu(watchdog_ev, cpu); 462 struct perf_event *event = per_cpu(watchdog_ev, cpu);
439 463
@@ -447,107 +471,35 @@ static void watchdog_nmi_disable(int cpu)
447 return; 471 return;
448} 472}
449#else 473#else
450static int watchdog_nmi_enable(int cpu) { return 0; } 474static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
451static void watchdog_nmi_disable(int cpu) { return; } 475static void watchdog_nmi_disable(unsigned int cpu) { return; }
452#endif /* CONFIG_HARDLOCKUP_DETECTOR */ 476#endif /* CONFIG_HARDLOCKUP_DETECTOR */
453 477
454/* prepare/enable/disable routines */ 478/* prepare/enable/disable routines */
455static void watchdog_prepare_cpu(int cpu)
456{
457 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
458
459 WARN_ON(per_cpu(softlockup_watchdog, cpu));
460 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
461 hrtimer->function = watchdog_timer_fn;
462}
463
464static int watchdog_enable(int cpu)
465{
466 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
467 int err = 0;
468
469 /* enable the perf event */
470 err = watchdog_nmi_enable(cpu);
471
472 /* Regardless of err above, fall through and start softlockup */
473
474 /* create the watchdog thread */
475 if (!p) {
476 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
477 p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
478 if (IS_ERR(p)) {
479 pr_err("softlockup watchdog for %i failed\n", cpu);
480 if (!err) {
481 /* if hardlockup hasn't already set this */
482 err = PTR_ERR(p);
483 /* and disable the perf event */
484 watchdog_nmi_disable(cpu);
485 }
486 goto out;
487 }
488 sched_setscheduler(p, SCHED_FIFO, &param);
489 kthread_bind(p, cpu);
490 per_cpu(watchdog_touch_ts, cpu) = 0;
491 per_cpu(softlockup_watchdog, cpu) = p;
492 wake_up_process(p);
493 }
494
495out:
496 return err;
497}
498
499static void watchdog_disable(int cpu)
500{
501 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
502 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
503
504 /*
505 * cancel the timer first to stop incrementing the stats
506 * and waking up the kthread
507 */
508 hrtimer_cancel(hrtimer);
509
510 /* disable the perf event */
511 watchdog_nmi_disable(cpu);
512
513 /* stop the watchdog thread */
514 if (p) {
515 per_cpu(softlockup_watchdog, cpu) = NULL;
516 kthread_stop(p);
517 }
518}
519
520/* sysctl functions */ 479/* sysctl functions */
521#ifdef CONFIG_SYSCTL 480#ifdef CONFIG_SYSCTL
522static void watchdog_enable_all_cpus(void) 481static void watchdog_enable_all_cpus(void)
523{ 482{
524 int cpu; 483 unsigned int cpu;
525
526 watchdog_enabled = 0;
527
528 for_each_online_cpu(cpu)
529 if (!watchdog_enable(cpu))
530 /* if any cpu succeeds, watchdog is considered
531 enabled for the system */
532 watchdog_enabled = 1;
533
534 if (!watchdog_enabled)
535 pr_err("failed to be enabled on some cpus\n");
536 484
485 if (watchdog_disabled) {
486 watchdog_disabled = 0;
487 for_each_online_cpu(cpu)
488 kthread_unpark(per_cpu(softlockup_watchdog, cpu));
489 }
537} 490}
538 491
539static void watchdog_disable_all_cpus(void) 492static void watchdog_disable_all_cpus(void)
540{ 493{
541 int cpu; 494 unsigned int cpu;
542
543 for_each_online_cpu(cpu)
544 watchdog_disable(cpu);
545 495
546 /* if all watchdogs are disabled, then they are disabled for the system */ 496 if (!watchdog_disabled) {
547 watchdog_enabled = 0; 497 watchdog_disabled = 1;
498 for_each_online_cpu(cpu)
499 kthread_park(per_cpu(softlockup_watchdog, cpu));
500 }
548} 501}
549 502
550
551/* 503/*
552 * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh 504 * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
553 */ 505 */
@@ -557,73 +509,36 @@ int proc_dowatchdog(struct ctl_table *table, int write,
557{ 509{
558 int ret; 510 int ret;
559 511
512 if (watchdog_disabled < 0)
513 return -ENODEV;
514
560 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 515 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
561 if (ret || !write) 516 if (ret || !write)
562 goto out; 517 return ret;
563 518
564 if (watchdog_enabled && watchdog_thresh) 519 if (watchdog_enabled && watchdog_thresh)
565 watchdog_enable_all_cpus(); 520 watchdog_enable_all_cpus();
566 else 521 else
567 watchdog_disable_all_cpus(); 522 watchdog_disable_all_cpus();
568 523
569out:
570 return ret; 524 return ret;
571} 525}
572#endif /* CONFIG_SYSCTL */ 526#endif /* CONFIG_SYSCTL */
573 527
574 528static struct smp_hotplug_thread watchdog_threads = {
575/* 529 .store = &softlockup_watchdog,
576 * Create/destroy watchdog threads as CPUs come and go: 530 .thread_should_run = watchdog_should_run,
577 */ 531 .thread_fn = watchdog,
578static int __cpuinit 532 .thread_comm = "watchdog/%u",
579cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 533 .setup = watchdog_enable,
580{ 534 .park = watchdog_disable,
581 int hotcpu = (unsigned long)hcpu; 535 .unpark = watchdog_enable,
582
583 switch (action) {
584 case CPU_UP_PREPARE:
585 case CPU_UP_PREPARE_FROZEN:
586 watchdog_prepare_cpu(hotcpu);
587 break;
588 case CPU_ONLINE:
589 case CPU_ONLINE_FROZEN:
590 if (watchdog_enabled)
591 watchdog_enable(hotcpu);
592 break;
593#ifdef CONFIG_HOTPLUG_CPU
594 case CPU_UP_CANCELED:
595 case CPU_UP_CANCELED_FROZEN:
596 watchdog_disable(hotcpu);
597 break;
598 case CPU_DEAD:
599 case CPU_DEAD_FROZEN:
600 watchdog_disable(hotcpu);
601 break;
602#endif /* CONFIG_HOTPLUG_CPU */
603 }
604
605 /*
606 * hardlockup and softlockup are not important enough
607 * to block cpu bring up. Just always succeed and
608 * rely on printk output to flag problems.
609 */
610 return NOTIFY_OK;
611}
612
613static struct notifier_block __cpuinitdata cpu_nfb = {
614 .notifier_call = cpu_callback
615}; 536};
616 537
617void __init lockup_detector_init(void) 538void __init lockup_detector_init(void)
618{ 539{
619 void *cpu = (void *)(long)smp_processor_id(); 540 if (smpboot_register_percpu_thread(&watchdog_threads)) {
620 int err; 541 pr_err("Failed to create watchdog threads, disabled\n");
621 542 watchdog_disabled = -ENODEV;
622 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 543 }
623 WARN_ON(notifier_to_errno(err));
624
625 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
626 register_cpu_notifier(&cpu_nfb);
627
628 return;
629} 544}
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 2403a63b5da5..dacbbe4d7a80 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -629,6 +629,20 @@ config PROVE_RCU_REPEATEDLY
629 629
630 Say N if you are unsure. 630 Say N if you are unsure.
631 631
632config PROVE_RCU_DELAY
633 bool "RCU debugging: preemptible RCU race provocation"
634 depends on DEBUG_KERNEL && PREEMPT_RCU
635 default n
636 help
637 There is a class of races that involve an unlikely preemption
638 of __rcu_read_unlock() just after ->rcu_read_lock_nesting has
639 been set to INT_MIN. This feature inserts a delay at that
640 point to increase the probability of these races.
641
642 Say Y to increase probability of preemption of __rcu_read_unlock().
643
644 Say N if you are unsure.
645
632config SPARSE_RCU_POINTER 646config SPARSE_RCU_POINTER
633 bool "RCU debugging: sparse-based checks for pointer usage" 647 bool "RCU debugging: sparse-based checks for pointer usage"
634 default n 648 default n
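The option itself is consumed elsewhere in the series, in the preemptible-RCU read-unlock path, and none of that code appears in this hunk. The general shape, sketched only from the help text above (the surrounding function and the 10-microsecond value are assumptions):

#include <linux/delay.h>
#include <linux/kernel.h>
#include <linux/sched.h>

static void example_unlock_race_window(struct task_struct *t)
{
	t->rcu_read_lock_nesting = INT_MIN;	/* The window the help text describes. */
#ifdef CONFIG_PROVE_RCU_DELAY
	udelay(10);	/* Assumed delay: make preemption in this window more likely. */
#endif
	barrier();
	/* ... deferred unlock processing, then restore the nesting count ... */
}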
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 45eb6217bf38..0de83b4541e9 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1483,13 +1483,11 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1483{ 1483{
1484 struct kmemleak_object *prev_obj = v; 1484 struct kmemleak_object *prev_obj = v;
1485 struct kmemleak_object *next_obj = NULL; 1485 struct kmemleak_object *next_obj = NULL;
1486 struct list_head *n = &prev_obj->object_list; 1486 struct kmemleak_object *obj = prev_obj;
1487 1487
1488 ++(*pos); 1488 ++(*pos);
1489 1489
1490 list_for_each_continue_rcu(n, &object_list) { 1490 list_for_each_entry_continue_rcu(obj, &object_list, object_list) {
1491 struct kmemleak_object *obj =
1492 list_entry(n, struct kmemleak_object, object_list);
1493 if (get_object(obj)) { 1491 if (get_object(obj)) {
1494 next_obj = obj; 1492 next_obj = obj;
1495 break; 1493 break;
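The kmemleak hunk switches its seq_file cursor from the position-based list_for_each_continue_rcu() plus list_entry() to the entry-based list_for_each_entry_continue_rcu(). For reference, the basic read-side pattern these entry-based RCU list iterators assume, as a self-contained sketch (example_* names are illustrative):

#include <linux/rculist.h>
#include <linux/rcupdate.h>

struct example_node {
	struct list_head list;
	int value;
};

static LIST_HEAD(example_list);

static int example_sum(void)
{
	struct example_node *n;
	int sum = 0;

	rcu_read_lock();	/* Readers never block writers. */
	list_for_each_entry_rcu(n, &example_list, list)
		sum += n->value;
	rcu_read_unlock();
	return sum;
}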