-rw-r--r--   Documentation/RCU/RTFP.txt             77
-rw-r--r--   Documentation/RCU/UP.txt               34
-rw-r--r--   Documentation/RCU/checklist.txt        20
-rw-r--r--   Documentation/RCU/rcu.txt              10
-rw-r--r--   Documentation/RCU/rcubarrier.txt        7
-rw-r--r--   Documentation/RCU/torture.txt          23
-rw-r--r--   Documentation/RCU/trace.txt             7
-rw-r--r--   Documentation/RCU/whatisRCU.txt        22
-rw-r--r--   arch/ia64/xen/time.c                    3
-rw-r--r--   include/linux/cpu.h                    17
-rw-r--r--   include/linux/hardirq.h                 4
-rw-r--r--   include/linux/init_task.h              11
-rw-r--r--   include/linux/pagemap.h                 4
-rw-r--r--   include/linux/rcuclassic.h            178
-rw-r--r--   include/linux/rcupdate.h               98
-rw-r--r--   include/linux/rcupreempt.h            127
-rw-r--r--   include/linux/rcupreempt_trace.h       97
-rw-r--r--   include/linux/rcutree.h               262
-rw-r--r--   include/linux/sched.h                  32
-rw-r--r--   init/Kconfig                           46
-rw-r--r--   kernel/Makefile                         4
-rw-r--r--   kernel/exit.c                           1
-rw-r--r--   kernel/fork.c                           5
-rw-r--r--   kernel/rcuclassic.c                   807
-rw-r--r--   kernel/rcupdate.c                      44
-rw-r--r--   kernel/rcupreempt.c                  1539
-rw-r--r--   kernel/rcupreempt_trace.c             334
-rw-r--r--   kernel/rcutorture.c                   202
-rw-r--r--   kernel/rcutree.c                      273
-rw-r--r--   kernel/rcutree.h                      253
-rw-r--r--   kernel/rcutree_plugin.h               532
-rw-r--r--   kernel/rcutree_trace.c                 88
-rw-r--r--   kernel/sched.c                        131
-rw-r--r--   kernel/softirq.c                        4
-rw-r--r--   kernel/timer.c                          3
-rw-r--r--   lib/Kconfig.debug                       2
36 files changed, 1635 insertions, 3666 deletions
diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt
index 9f711d2df91b..d2b85237c76e 100644
--- a/Documentation/RCU/RTFP.txt
+++ b/Documentation/RCU/RTFP.txt
@@ -743,3 +743,80 @@ Revised:
 	RCU, realtime RCU, sleepable RCU, performance.
 "
 }
+
+@article{PaulEMcKenney2008RCUOSR
+,author="Paul E. McKenney and Jonathan Walpole"
+,title="Introducing technology into the {Linux} kernel: a case study"
+,Year="2008"
+,journal="SIGOPS Oper. Syst. Rev."
+,volume="42"
+,number="5"
+,pages="4--17"
+,issn="0163-5980"
+,doi={http://doi.acm.org/10.1145/1400097.1400099}
+,publisher="ACM"
+,address="New York, NY, USA"
+,annotation={
+	Linux changed RCU to a far greater degree than RCU has changed Linux.
+}
+}
+
+@unpublished{PaulEMcKenney2008HierarchicalRCU
+,Author="Paul E. McKenney"
+,Title="Hierarchical {RCU}"
+,month="November"
+,day="3"
+,year="2008"
+,note="Available:
+\url{http://lwn.net/Articles/305782/}
+[Viewed November 6, 2008]"
+,annotation="
+	RCU with combining-tree-based grace-period detection,
+	permitting it to handle thousands of CPUs.
+"
+}
+
+@conference{PaulEMcKenney2009MaliciousURCU
+,Author="Paul E. McKenney"
+,Title="Using a Malicious User-Level {RCU} to Torture {RCU}-Based Algorithms"
+,Booktitle="linux.conf.au 2009"
+,month="January"
+,year="2009"
+,address="Hobart, Australia"
+,note="Available:
+\url{http://www.rdrop.com/users/paulmck/RCU/urcutorture.2009.01.22a.pdf}
+[Viewed February 2, 2009]"
+,annotation="
+	Realtime RCU and torture-testing RCU uses.
+"
+}
+
+@unpublished{MathieuDesnoyers2009URCU
+,Author="Mathieu Desnoyers"
+,Title="[{RFC} git tree] Userspace {RCU} (urcu) for {Linux}"
+,month="February"
+,day="5"
+,year="2009"
+,note="Available:
+\url{http://lkml.org/lkml/2009/2/5/572}
+\url{git://lttng.org/userspace-rcu.git}
+[Viewed February 20, 2009]"
+,annotation="
+	Mathieu Desnoyers's user-space RCU implementation.
+	git://lttng.org/userspace-rcu.git
+"
+}
+
+@unpublished{PaulEMcKenney2009BloatWatchRCU
+,Author="Paul E. McKenney"
+,Title="{RCU}: The {Bloatwatch} Edition"
+,month="March"
+,day="17"
+,year="2009"
+,note="Available:
+\url{http://lwn.net/Articles/323929/}
+[Viewed March 20, 2009]"
+,annotation="
+	Uniprocessor assumptions allow simplified RCU implementation.
+"
+}
diff --git a/Documentation/RCU/UP.txt b/Documentation/RCU/UP.txt
index aab4a9ec3931..90ec5341ee98 100644
--- a/Documentation/RCU/UP.txt
+++ b/Documentation/RCU/UP.txt
@@ -2,14 +2,13 @@ RCU on Uniprocessor Systems
 
 
 A common misconception is that, on UP systems, the call_rcu() primitive
-may immediately invoke its function, and that the synchronize_rcu()
-primitive may return immediately.  The basis of this misconception
+may immediately invoke its function.  The basis of this misconception
 is that since there is only one CPU, it should not be necessary to
 wait for anything else to get done, since there are no other CPUs for
 anything else to be happening on.  Although this approach will -sort- -of-
 work a surprising amount of the time, it is a very bad idea in general.
-This document presents three examples that demonstrate exactly how bad an
-idea this is.
+This document presents three examples that demonstrate exactly how bad
+an idea this is.
 
 
 Example 1: softirq Suicide
@@ -82,11 +81,18 @@ Quick Quiz #2: What locking restriction must RCU callbacks respect?
 
 Summary
 
-Permitting call_rcu() to immediately invoke its arguments or permitting
-synchronize_rcu() to immediately return breaks RCU, even on a UP system.
-So do not do it!  Even on a UP system, the RCU infrastructure -must-
-respect grace periods, and -must- invoke callbacks from a known environment
-in which no locks are held.
+Permitting call_rcu() to immediately invoke its arguments breaks RCU,
+even on a UP system.  So do not do it!  Even on a UP system, the RCU
+infrastructure -must- respect grace periods, and -must- invoke callbacks
+from a known environment in which no locks are held.
+
+It -is- safe for synchronize_sched() and synchronize_rcu_bh() to return
+immediately on an UP system.  It is also safe for synchronize_rcu()
+to return immediately on UP systems, except when running preemptable
+RCU.
+
+Quick Quiz #3: Why can't synchronize_rcu() return immediately on
+	UP systems running preemptable RCU?
 
 
 Answer to Quick Quiz #1:
@@ -117,3 +123,13 @@ Answer to Quick Quiz #2:
 	callbacks acquire locks directly.  However, a great many RCU
 	callbacks do acquire locks -indirectly-, for example, via
 	the kfree() primitive.
+
+Answer to Quick Quiz #3:
+	Why can't synchronize_rcu() return immediately on UP systems
+	running preemptable RCU?
+
+	Because some other task might have been preempted in the middle
+	of an RCU read-side critical section.  If synchronize_rcu()
+	simply immediately returned, it would prematurely signal the
+	end of the grace period, which would come as a nasty shock to
+	that other thread when it started running again.
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index accfe2f5247d..51525a30e8b4 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -11,7 +11,10 @@ over a rather long period of time, but improvements are always welcome!
 	structure is updated more than about 10% of the time, then
 	you should strongly consider some other approach, unless
 	detailed performance measurements show that RCU is nonetheless
-	the right tool for the job.
+	the right tool for the job.  Yes, you might think of RCU
+	as simply cutting overhead off of the readers and imposing it
+	on the writers.  That is exactly why normal uses of RCU will
+	do much more reading than updating.
 
 	Another exception is where performance is not an issue, and RCU
 	provides a simpler implementation.  An example of this situation
@@ -240,10 +243,11 @@ over a rather long period of time, but improvements are always welcome!
 	instead need to use synchronize_irq() or synchronize_sched().
 
 12.	Any lock acquired by an RCU callback must be acquired elsewhere
-	with irq disabled, e.g., via spin_lock_irqsave().  Failing to
-	disable irq on a given acquisition of that lock will result in
-	deadlock as soon as the RCU callback happens to interrupt that
-	acquisition's critical section.
+	with softirq disabled, e.g., via spin_lock_irqsave(),
+	spin_lock_bh(), etc.  Failing to disable irq on a given
+	acquisition of that lock will result in deadlock as soon as the
+	RCU callback happens to interrupt that acquisition's critical
+	section.
 
 13.	RCU callbacks can be and are executed in parallel.  In many cases,
 	the callback code simply wrappers around kfree(), so that this
@@ -310,3 +314,9 @@ over a rather long period of time, but improvements are always welcome!
 	Because these primitives only wait for pre-existing readers,
 	it is the caller's responsibility to guarantee safety to
 	any subsequent readers.
+
+16.	The various RCU read-side primitives do -not- contain memory
+	barriers.  The CPU (and in some cases, the compiler) is free
+	to reorder code into and out of RCU read-side critical sections.
+	It is the responsibility of the RCU update-side primitives to
+	deal with this.
diff --git a/Documentation/RCU/rcu.txt b/Documentation/RCU/rcu.txt
index 7aa2002ade77..2a23523ce471 100644
--- a/Documentation/RCU/rcu.txt
+++ b/Documentation/RCU/rcu.txt
@@ -36,7 +36,7 @@ o How can the updater tell when a grace period has completed
 	executed in user mode, or executed in the idle loop, we can
 	safely free up that item.
 
-	Preemptible variants of RCU (CONFIG_PREEMPT_RCU) get the
+	Preemptible variants of RCU (CONFIG_TREE_PREEMPT_RCU) get the
 	same effect, but require that the readers manipulate CPU-local
 	counters.  These counters allow limited types of blocking
 	within RCU read-side critical sections.  SRCU also uses
@@ -79,10 +79,10 @@ o I hear that RCU is patented? What is with that?
 o	I hear that RCU needs work in order to support realtime kernels?
 
 	This work is largely completed.  Realtime-friendly RCU can be
-	enabled via the CONFIG_PREEMPT_RCU kernel configuration parameter.
-	However, work is in progress for enabling priority boosting of
-	preempted RCU read-side critical sections.  This is needed if you
-	have CPU-bound realtime threads.
+	enabled via the CONFIG_TREE_PREEMPT_RCU kernel configuration
+	parameter.  However, work is in progress for enabling priority
+	boosting of preempted RCU read-side critical sections.  This is
+	needed if you have CPU-bound realtime threads.
 
 o	Where can I find more information on RCU?
 
diff --git a/Documentation/RCU/rcubarrier.txt b/Documentation/RCU/rcubarrier.txt
index 909602d409bb..e439a0edee22 100644
--- a/Documentation/RCU/rcubarrier.txt
+++ b/Documentation/RCU/rcubarrier.txt
@@ -170,6 +170,13 @@ module invokes call_rcu() from timers, you will need to first cancel all
 the timers, and only then invoke rcu_barrier() to wait for any remaining
 RCU callbacks to complete.
 
+Of course, if you module uses call_rcu_bh(), you will need to invoke
+rcu_barrier_bh() before unloading.  Similarly, if your module uses
+call_rcu_sched(), you will need to invoke rcu_barrier_sched() before
+unloading.  If your module uses call_rcu(), call_rcu_bh(), -and-
+call_rcu_sched(), then you will need to invoke each of rcu_barrier(),
+rcu_barrier_bh(), and rcu_barrier_sched().
+
 
 Implementing rcu_barrier()
 
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt
index a342b6e1cc10..9dba3bb90e60 100644
--- a/Documentation/RCU/torture.txt
+++ b/Documentation/RCU/torture.txt
@@ -76,8 +76,10 @@ torture_type The type of RCU to test: "rcu" for the rcu_read_lock() API,
76 "rcu_sync" for rcu_read_lock() with synchronous reclamation, 76 "rcu_sync" for rcu_read_lock() with synchronous reclamation,
77 "rcu_bh" for the rcu_read_lock_bh() API, "rcu_bh_sync" for 77 "rcu_bh" for the rcu_read_lock_bh() API, "rcu_bh_sync" for
78 rcu_read_lock_bh() with synchronous reclamation, "srcu" for 78 rcu_read_lock_bh() with synchronous reclamation, "srcu" for
79 the "srcu_read_lock()" API, and "sched" for the use of 79 the "srcu_read_lock()" API, "sched" for the use of
80 preempt_disable() together with synchronize_sched(). 80 preempt_disable() together with synchronize_sched(),
81 and "sched_expedited" for the use of preempt_disable()
82 with synchronize_sched_expedited().
81 83
82verbose Enable debug printk()s. Default is disabled. 84verbose Enable debug printk()s. Default is disabled.
83 85
@@ -162,6 +164,23 @@ of the "old" and "current" counters for the corresponding CPU. The
162"idx" value maps the "old" and "current" values to the underlying array, 164"idx" value maps the "old" and "current" values to the underlying array,
163and is useful for debugging. 165and is useful for debugging.
164 166
167Similarly, sched_expedited RCU provides the following:
168
169 sched_expedited-torture: rtc: d0000000016c1880 ver: 1090796 tfle: 0 rta: 1090796 rtaf: 0 rtf: 1090787 rtmbe: 0 nt: 27713319
170 sched_expedited-torture: Reader Pipe: 12660320201 95875 0 0 0 0 0 0 0 0 0
171 sched_expedited-torture: Reader Batch: 12660424885 0 0 0 0 0 0 0 0 0 0
172 sched_expedited-torture: Free-Block Circulation: 1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0
173 state: -1 / 0:0 3:0 4:0
174
175As before, the first four lines are similar to those for RCU.
176The last line shows the task-migration state. The first number is
177-1 if synchronize_sched_expedited() is idle, -2 if in the process of
178posting wakeups to the migration kthreads, and N when waiting on CPU N.
179Each of the colon-separated fields following the "/" is a CPU:state pair.
180Valid states are "0" for idle, "1" for waiting for quiescent state,
181"2" for passed through quiescent state, and "3" when a race with a
182CPU-hotplug event forces use of the synchronize_sched() primitive.
183
165 184
166USAGE 185USAGE
167 186
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index 02cced183b2d..187bbf10c923 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -191,8 +191,7 @@ rcu/rcuhier (which displays the struct rcu_node hierarchy).
 
 The output of "cat rcu/rcudata" looks as follows:
 
-rcu:
-rcu:
+rcu_sched:
   0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1 dn=0 df=1101 of=0 ri=36 ql=0 b=10
   1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1 dn=0 df=1015 of=0 ri=0 ql=0 b=10
   2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1 dn=0 df=1839 of=0 ri=0 ql=0 b=10
@@ -306,7 +305,7 @@ comma-separated-variable spreadsheet format.
 
 The output of "cat rcu/rcugp" looks as follows:
 
-rcu: completed=33062 gpnum=33063
+rcu_sched: completed=33062 gpnum=33063
 rcu_bh: completed=464 gpnum=464
 
 Again, this output is for both "rcu" and "rcu_bh".  The fields are
@@ -413,7 +412,7 @@ o Each element of the form "1/1 0:127 ^0" represents one struct
 
 The output of "cat rcu/rcu_pending" looks as follows:
 
-rcu:
+rcu_sched:
   0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741
   1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792
   2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 96170824a717..e41a7fecf0d3 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -136,10 +136,10 @@ rcu_read_lock()
 	Used by a reader to inform the reclaimer that the reader is
 	entering an RCU read-side critical section.  It is illegal
 	to block while in an RCU read-side critical section, though
-	kernels built with CONFIG_PREEMPT_RCU can preempt RCU read-side
-	critical sections.  Any RCU-protected data structure accessed
-	during an RCU read-side critical section is guaranteed to remain
-	unreclaimed for the full duration of that critical section.
+	kernels built with CONFIG_TREE_PREEMPT_RCU can preempt RCU
+	read-side critical sections.  Any RCU-protected data structure
+	accessed during an RCU read-side critical section is guaranteed to
+	remain unreclaimed for the full duration of that critical section.
 	Reference counts may be used in conjunction with RCU to maintain
 	longer-term references to data structures.
 
@@ -785,6 +785,7 @@ RCU pointer/list traversal:
 	rcu_dereference
 	list_for_each_entry_rcu
 	hlist_for_each_entry_rcu
+	hlist_nulls_for_each_entry_rcu
 
 	list_for_each_continue_rcu	(to be deprecated in favor of new
 					 list_for_each_entry_continue_rcu)
@@ -807,19 +808,23 @@ RCU: Critical sections Grace period Barrier
 
 	rcu_read_lock		synchronize_net		rcu_barrier
 	rcu_read_unlock		synchronize_rcu
+				synchronize_rcu_expedited
 	call_rcu
 
 
 bh:	Critical sections	Grace period		Barrier
 
 	rcu_read_lock_bh	call_rcu_bh		rcu_barrier_bh
-	rcu_read_unlock_bh
+	rcu_read_unlock_bh	synchronize_rcu_bh
+				synchronize_rcu_bh_expedited
 
 
 sched:	Critical sections	Grace period		Barrier
 
-	[preempt_disable]	synchronize_sched	rcu_barrier_sched
-	[and friends]		call_rcu_sched
+	rcu_read_lock_sched	synchronize_sched	rcu_barrier_sched
+	rcu_read_unlock_sched	call_rcu_sched
+	[preempt_disable]	synchronize_sched_expedited
+	[and friends]
 
 
 SRCU:	Critical sections	Grace period		Barrier
@@ -827,6 +832,9 @@ SRCU: Critical sections Grace period Barrier
 	srcu_read_lock		synchronize_srcu	N/A
 	srcu_read_unlock
 
+SRCU:	Initialization/cleanup
+	init_srcu_struct
+	cleanup_srcu_struct
 
 See the comment headers in the source code (or the docbook generated
 from them) for more information.
diff --git a/arch/ia64/xen/time.c b/arch/ia64/xen/time.c
index fb8332690179..dbeadb9c8e20 100644
--- a/arch/ia64/xen/time.c
+++ b/arch/ia64/xen/time.c
@@ -133,8 +133,7 @@ consider_steal_time(unsigned long new_itm)
 		account_idle_ticks(blocked);
 		run_local_timers();
 
-		if (rcu_pending(cpu))
-			rcu_check_callbacks(cpu, user_mode(get_irq_regs()));
+		rcu_check_callbacks(cpu, user_mode(get_irq_regs()));
 
 		scheduler_tick();
 		run_posix_cpu_timers(p);
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 4d668e05d458..47536197ffdd 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -48,6 +48,15 @@ struct notifier_block;
 
 #ifdef CONFIG_SMP
 /* Need to know about CPUs going up/down? */
+#if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE)
+#define cpu_notifier(fn, pri) {					\
+	static struct notifier_block fn##_nb __cpuinitdata =	\
+		{ .notifier_call = fn, .priority = pri };	\
+	register_cpu_notifier(&fn##_nb);			\
+}
+#else /* #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */
+#define cpu_notifier(fn, pri)	do { (void)(fn); } while (0)
+#endif /* #else #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */
 #ifdef CONFIG_HOTPLUG_CPU
 extern int register_cpu_notifier(struct notifier_block *nb);
 extern void unregister_cpu_notifier(struct notifier_block *nb);
@@ -74,6 +83,8 @@ extern void cpu_maps_update_done(void);
 
 #else	/* CONFIG_SMP */
 
+#define cpu_notifier(fn, pri)	do { (void)(fn); } while (0)
+
 static inline int register_cpu_notifier(struct notifier_block *nb)
 {
 	return 0;
@@ -99,11 +110,7 @@ extern struct sysdev_class cpu_sysdev_class;
 
 extern void get_online_cpus(void);
 extern void put_online_cpus(void);
-#define hotcpu_notifier(fn, pri) {				\
-	static struct notifier_block fn##_nb __cpuinitdata =	\
-		{ .notifier_call = fn, .priority = pri };	\
-	register_cpu_notifier(&fn##_nb);			\
-}
+#define hotcpu_notifier(fn, pri)	cpu_notifier(fn, pri)
 #define register_hotcpu_notifier(nb)	register_cpu_notifier(nb)
 #define unregister_hotcpu_notifier(nb)	unregister_cpu_notifier(nb)
 int cpu_down(unsigned int cpu);
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 8246c697863d..330cb31bb496 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -132,7 +132,7 @@ static inline void account_system_vtime(struct task_struct *tsk)
 }
 #endif
 
-#if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU)
+#if defined(CONFIG_NO_HZ)
 extern void rcu_irq_enter(void);
 extern void rcu_irq_exit(void);
 extern void rcu_nmi_enter(void);
@@ -142,7 +142,7 @@ extern void rcu_nmi_exit(void);
 # define rcu_irq_exit() do { } while (0)
 # define rcu_nmi_enter() do { } while (0)
 # define rcu_nmi_exit() do { } while (0)
-#endif /* #if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU) */
+#endif /* #if defined(CONFIG_NO_HZ) */
 
 /*
  * It is safe to do non-atomic ops on ->hardirq_context,
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 7fc01b13be43..9e7f2e8fc66e 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -94,6 +94,16 @@ extern struct group_info init_groups;
 # define CAP_INIT_BSET	CAP_INIT_EFF_SET
 #endif
 
+#ifdef CONFIG_TREE_PREEMPT_RCU
+#define INIT_TASK_RCU_PREEMPT(tsk)					\
+	.rcu_read_lock_nesting = 0,					\
+	.rcu_read_unlock_special = 0,					\
+	.rcu_blocked_node = NULL,					\
+	.rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry),
+#else
+#define INIT_TASK_RCU_PREEMPT(tsk)
+#endif
+
 extern struct cred init_cred;
 
 #ifdef CONFIG_PERF_COUNTERS
@@ -173,6 +183,7 @@ extern struct cred init_cred;
 	INIT_LOCKDEP							\
 	INIT_FTRACE_GRAPH						\
 	INIT_TRACE_RECURSION						\
+	INIT_TASK_RCU_PREEMPT(tsk)					\
 }
 
 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index aec3252afcf5..ed5d7501e181 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -132,7 +132,7 @@ static inline int page_cache_get_speculative(struct page *page)
 {
 	VM_BUG_ON(in_interrupt());
 
-#if !defined(CONFIG_SMP) && defined(CONFIG_CLASSIC_RCU)
+#if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU)
 # ifdef CONFIG_PREEMPT
 	VM_BUG_ON(!in_atomic());
 # endif
@@ -170,7 +170,7 @@ static inline int page_cache_add_speculative(struct page *page, int count)
 {
 	VM_BUG_ON(in_interrupt());
 
-#if !defined(CONFIG_SMP) && defined(CONFIG_CLASSIC_RCU)
+#if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU)
 # ifdef CONFIG_PREEMPT
 	VM_BUG_ON(!in_atomic());
 # endif
diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
deleted file mode 100644
index bfd92e1e5d2c..000000000000
--- a/include/linux/rcuclassic.h
+++ /dev/null
@@ -1,178 +0,0 @@
-/*
- * Read-Copy Update mechanism for mutual exclusion (classic version)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright IBM Corporation, 2001
- *
- * Author: Dipankar Sarma <dipankar@in.ibm.com>
- *
- * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
- * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
- * Papers:
- * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
- * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
- *
- * For detailed explanation of Read-Copy Update mechanism see -
- *		Documentation/RCU
- *
- */
-
-#ifndef __LINUX_RCUCLASSIC_H
-#define __LINUX_RCUCLASSIC_H
-
-#include <linux/cache.h>
-#include <linux/spinlock.h>
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/seqlock.h>
-
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-#define RCU_SECONDS_TILL_STALL_CHECK	(10 * HZ) /* for rcp->jiffies_stall */
-#define RCU_SECONDS_TILL_STALL_RECHECK	(30 * HZ) /* for rcp->jiffies_stall */
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-
-/* Global control variables for rcupdate callback mechanism. */
-struct rcu_ctrlblk {
-	long	cur;		/* Current batch number. */
-	long	completed;	/* Number of the last completed batch */
-	long	pending;	/* Number of the last pending batch */
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-	unsigned long gp_start;	/* Time at which GP started in jiffies. */
-	unsigned long jiffies_stall;
-				/* Time at which to check for CPU stalls. */
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-
-	int	signaled;
-
-	spinlock_t	lock	____cacheline_internodealigned_in_smp;
-	DECLARE_BITMAP(cpumask, NR_CPUS); /* CPUs that need to switch for */
-					  /* current batch to proceed. */
-} ____cacheline_internodealigned_in_smp;
-
-/* Is batch a before batch b ? */
-static inline int rcu_batch_before(long a, long b)
-{
-	return (a - b) < 0;
-}
-
-/* Is batch a after batch b ? */
-static inline int rcu_batch_after(long a, long b)
-{
-	return (a - b) > 0;
-}
-
-/* Per-CPU data for Read-Copy UPdate. */
-struct rcu_data {
-	/* 1) quiescent state handling : */
-	long		quiescbatch;	/* Batch # for grace period */
-	int		passed_quiesc;	/* User-mode/idle loop etc. */
-	int		qs_pending;	/* core waits for quiesc state */
-
-	/* 2) batch handling */
-	/*
-	 * if nxtlist is not NULL, then:
-	 * batch:
-	 *	The batch # for the last entry of nxtlist
-	 * [*nxttail[1], NULL = *nxttail[2]):
-	 *	Entries that batch # <= batch
-	 * [*nxttail[0], *nxttail[1]):
-	 *	Entries that batch # <= batch - 1
-	 * [nxtlist, *nxttail[0]):
-	 *	Entries that batch # <= batch - 2
-	 *	The grace period for these entries has completed, and
-	 *	the other grace-period-completed entries may be moved
-	 *	here temporarily in rcu_process_callbacks().
-	 */
-	long		batch;
-	struct rcu_head *nxtlist;
-	struct rcu_head **nxttail[3];
-	long		qlen;		/* # of queued callbacks */
-	struct rcu_head *donelist;
-	struct rcu_head **donetail;
-	long		blimit;		/* Upper limit on a processed batch */
-	int cpu;
-	struct rcu_head barrier;
-};
-
-/*
- * Increment the quiescent state counter.
- * The counter is a bit degenerated: We do not need to know
- * how many quiescent states passed, just if there was at least
- * one since the start of the grace period. Thus just a flag.
- */
-extern void rcu_qsctr_inc(int cpu);
-extern void rcu_bh_qsctr_inc(int cpu);
-
-extern int rcu_pending(int cpu);
-extern int rcu_needs_cpu(int cpu);
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-extern struct lockdep_map rcu_lock_map;
-# define rcu_read_acquire() \
-		lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_)
-# define rcu_read_release()	lock_release(&rcu_lock_map, 1, _THIS_IP_)
-#else
-# define rcu_read_acquire()	do { } while (0)
-# define rcu_read_release()	do { } while (0)
-#endif
-
-#define __rcu_read_lock() \
-	do { \
-		preempt_disable(); \
-		__acquire(RCU); \
-		rcu_read_acquire(); \
-	} while (0)
-#define __rcu_read_unlock() \
-	do { \
-		rcu_read_release(); \
-		__release(RCU); \
-		preempt_enable(); \
-	} while (0)
-#define __rcu_read_lock_bh() \
-	do { \
-		local_bh_disable(); \
-		__acquire(RCU_BH); \
-		rcu_read_acquire(); \
-	} while (0)
-#define __rcu_read_unlock_bh() \
-	do { \
-		rcu_read_release(); \
-		__release(RCU_BH); \
-		local_bh_enable(); \
-	} while (0)
-
-#define __synchronize_sched() synchronize_rcu()
-
-#define call_rcu_sched(head, func) call_rcu(head, func)
-
-extern void __rcu_init(void);
-#define rcu_init_sched()	do { } while (0)
-extern void rcu_check_callbacks(int cpu, int user);
-extern void rcu_restart_cpu(int cpu);
-
-extern long rcu_batches_completed(void);
-extern long rcu_batches_completed_bh(void);
-
-#define rcu_enter_nohz()	do { } while (0)
-#define rcu_exit_nohz()		do { } while (0)
-
-/* A context switch is a grace period for rcuclassic. */
-static inline int rcu_blocking_is_gp(void)
-{
-	return num_online_cpus() == 1;
-}
-
-#endif /* __LINUX_RCUCLASSIC_H */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 15fbb3ca634d..95e0615f4d75 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -51,18 +51,26 @@ struct rcu_head {
 	void (*func)(struct rcu_head *head);
 };
 
-/* Internal to kernel, but needed by rcupreempt.h. */
+/* Exported common interfaces */
+extern void synchronize_rcu(void);
+extern void synchronize_rcu_bh(void);
+extern void rcu_barrier(void);
+extern void rcu_barrier_bh(void);
+extern void rcu_barrier_sched(void);
+extern void synchronize_sched_expedited(void);
+extern int sched_expedited_torture_stats(char *page);
+
+/* Internal to kernel */
+extern void rcu_init(void);
+extern void rcu_scheduler_starting(void);
+extern int rcu_needs_cpu(int cpu);
 extern int rcu_scheduler_active;
 
-#if defined(CONFIG_CLASSIC_RCU)
-#include <linux/rcuclassic.h>
-#elif defined(CONFIG_TREE_RCU)
+#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
 #include <linux/rcutree.h>
-#elif defined(CONFIG_PREEMPT_RCU)
-#include <linux/rcupreempt.h>
 #else
 #error "Unknown RCU implementation specified to kernel configuration"
-#endif /* #else #if defined(CONFIG_CLASSIC_RCU) */
+#endif
 
 #define RCU_HEAD_INIT	{ .next = NULL, .func = NULL }
 #define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT
@@ -70,6 +78,16 @@ extern int rcu_scheduler_active;
 	(ptr)->next = NULL; (ptr)->func = NULL; \
 } while (0)
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+extern struct lockdep_map rcu_lock_map;
+# define rcu_read_acquire() \
+		lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_)
+# define rcu_read_release()	lock_release(&rcu_lock_map, 1, _THIS_IP_)
+#else
+# define rcu_read_acquire()	do { } while (0)
+# define rcu_read_release()	do { } while (0)
+#endif
+
 /**
  * rcu_read_lock - mark the beginning of an RCU read-side critical section.
  *
@@ -99,7 +117,12 @@ extern int rcu_scheduler_active;
  *
  * It is illegal to block while in an RCU read-side critical section.
  */
-#define rcu_read_lock() __rcu_read_lock()
+static inline void rcu_read_lock(void)
+{
+	__rcu_read_lock();
+	__acquire(RCU);
+	rcu_read_acquire();
+}
 
 /**
  * rcu_read_unlock - marks the end of an RCU read-side critical section.
@@ -116,7 +139,12 @@ extern int rcu_scheduler_active;
  * used as well.  RCU does not care how the writers keep out of each
 * others' way, as long as they do so.
  */
-#define rcu_read_unlock() __rcu_read_unlock()
+static inline void rcu_read_unlock(void)
+{
+	rcu_read_release();
+	__release(RCU);
+	__rcu_read_unlock();
+}
 
 /**
 * rcu_read_lock_bh - mark the beginning of a softirq-only RCU critical section
@@ -129,14 +157,24 @@ extern int rcu_scheduler_active;
 * can use just rcu_read_lock().
 *
 */
-#define rcu_read_lock_bh() __rcu_read_lock_bh()
+static inline void rcu_read_lock_bh(void)
+{
+	__rcu_read_lock_bh();
+	__acquire(RCU_BH);
+	rcu_read_acquire();
+}
 
 /*
 * rcu_read_unlock_bh - marks the end of a softirq-only RCU critical section
 *
 * See rcu_read_lock_bh() for more information.
 */
-#define rcu_read_unlock_bh() __rcu_read_unlock_bh()
+static inline void rcu_read_unlock_bh(void)
+{
+	rcu_read_release();
+	__release(RCU_BH);
+	__rcu_read_unlock_bh();
+}
 
 /**
 * rcu_read_lock_sched - mark the beginning of a RCU-classic critical section
@@ -147,17 +185,34 @@ extern int rcu_scheduler_active;
 * - call_rcu_sched() and rcu_barrier_sched()
 *  on the write-side to insure proper synchronization.
 */
-#define rcu_read_lock_sched() preempt_disable()
-#define rcu_read_lock_sched_notrace() preempt_disable_notrace()
+static inline void rcu_read_lock_sched(void)
+{
+	preempt_disable();
+	__acquire(RCU_SCHED);
+	rcu_read_acquire();
+}
+static inline notrace void rcu_read_lock_sched_notrace(void)
+{
+	preempt_disable_notrace();
+	__acquire(RCU_SCHED);
+}
 
 /*
 * rcu_read_unlock_sched - marks the end of a RCU-classic critical section
 *
 * See rcu_read_lock_sched for more information.
 */
-#define rcu_read_unlock_sched() preempt_enable()
-#define rcu_read_unlock_sched_notrace() preempt_enable_notrace()
-
+static inline void rcu_read_unlock_sched(void)
+{
+	rcu_read_release();
+	__release(RCU_SCHED);
+	preempt_enable();
+}
+static inline notrace void rcu_read_unlock_sched_notrace(void)
+{
+	__release(RCU_SCHED);
+	preempt_enable_notrace();
+}
 
 
 /**
@@ -259,15 +314,4 @@ extern void call_rcu(struct rcu_head *head,
 extern void call_rcu_bh(struct rcu_head *head,
 			void (*func)(struct rcu_head *head));
 
-/* Exported common interfaces */
-extern void synchronize_rcu(void);
-extern void rcu_barrier(void);
-extern void rcu_barrier_bh(void);
-extern void rcu_barrier_sched(void);
-
-/* Internal to kernel */
-extern void rcu_init(void);
-extern void rcu_scheduler_starting(void);
-extern int rcu_needs_cpu(int cpu);
-
 #endif /* __LINUX_RCUPDATE_H */
diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h
deleted file mode 100644
index fce522782ffa..000000000000
--- a/include/linux/rcupreempt.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Read-Copy Update mechanism for mutual exclusion (RT implementation)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright (C) IBM Corporation, 2006
- *
- * Author:  Paul McKenney <paulmck@us.ibm.com>
- *
- * Based on the original work by Paul McKenney <paul.mckenney@us.ibm.com>
- * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
- * Papers:
- * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
- * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
- *
- * For detailed explanation of Read-Copy Update mechanism see -
- *		Documentation/RCU
- *
- */
-
-#ifndef __LINUX_RCUPREEMPT_H
-#define __LINUX_RCUPREEMPT_H
-
-#include <linux/cache.h>
-#include <linux/spinlock.h>
-#include <linux/threads.h>
-#include <linux/smp.h>
-#include <linux/cpumask.h>
-#include <linux/seqlock.h>
-
-extern void rcu_qsctr_inc(int cpu);
-static inline void rcu_bh_qsctr_inc(int cpu) { }
-
-/*
- * Someone might want to pass call_rcu_bh as a function pointer.
- * So this needs to just be a rename and not a macro function.
- *  (no parentheses)
- */
-#define call_rcu_bh		call_rcu
-
-/**
- * call_rcu_sched - Queue RCU callback for invocation after sched grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual update function to be invoked after the grace period
- *
- * The update function will be invoked some time after a full
- * synchronize_sched()-style grace period elapses, in other words after
- * all currently executing preempt-disabled sections of code (including
- * hardirq handlers, NMI handlers, and local_irq_save() blocks) have
- * completed.
- */
-extern void call_rcu_sched(struct rcu_head *head,
-			   void (*func)(struct rcu_head *head));
-
-extern void __rcu_read_lock(void)	__acquires(RCU);
-extern void __rcu_read_unlock(void)	__releases(RCU);
-extern int rcu_pending(int cpu);
-extern int rcu_needs_cpu(int cpu);
-
-#define __rcu_read_lock_bh()	{ rcu_read_lock(); local_bh_disable(); }
-#define __rcu_read_unlock_bh()	{ local_bh_enable(); rcu_read_unlock(); }
-
-extern void __synchronize_sched(void);
-
-extern void __rcu_init(void);
-extern void rcu_init_sched(void);
-extern void rcu_check_callbacks(int cpu, int user);
-extern void rcu_restart_cpu(int cpu);
-extern long rcu_batches_completed(void);
-
-/*
- * Return the number of RCU batches processed thus far. Useful for debug
- * and statistic. The _bh variant is identifcal to straight RCU
- */
-static inline long rcu_batches_completed_bh(void)
-{
-	return rcu_batches_completed();
-}
-
-#ifdef CONFIG_RCU_TRACE
-struct rcupreempt_trace;
-extern long *rcupreempt_flipctr(int cpu);
-extern long rcupreempt_data_completed(void);
-extern int rcupreempt_flip_flag(int cpu);
-extern int rcupreempt_mb_flag(int cpu);
-extern char *rcupreempt_try_flip_state_name(void);
-extern struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu);
-#endif
-
-struct softirq_action;
-
-#ifdef CONFIG_NO_HZ
-extern void rcu_enter_nohz(void);
-extern void rcu_exit_nohz(void);
-#else
-# define rcu_enter_nohz()	do { } while (0)
-# define rcu_exit_nohz()	do { } while (0)
-#endif
-
-/*
- * A context switch is a grace period for rcupreempt synchronize_rcu()
- * only during early boot, before the scheduler has been initialized.
- * So, how the heck do we get a context switch?  Well, if the caller
- * invokes synchronize_rcu(), they are willing to accept a context
- * switch, so we simply pretend that one happened.
- *
- * After boot, there might be a blocked or preempted task in an RCU
- * read-side critical section, so we cannot then take the fastpath.
- */
-static inline int rcu_blocking_is_gp(void)
-{
-	return num_online_cpus() == 1 && !rcu_scheduler_active;
-}
-
-#endif /* __LINUX_RCUPREEMPT_H */
diff --git a/include/linux/rcupreempt_trace.h b/include/linux/rcupreempt_trace.h
deleted file mode 100644
index b99ae073192a..000000000000
--- a/include/linux/rcupreempt_trace.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Read-Copy Update mechanism for mutual exclusion (RT implementation)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright (C) IBM Corporation, 2006
- *
- * Author:  Paul McKenney <paulmck@us.ibm.com>
- *
- * Based on the original work by Paul McKenney <paul.mckenney@us.ibm.com>
- * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
- * Papers:
- * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
- * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
- *
- * For detailed explanation of the Preemptible Read-Copy Update mechanism see -
- *		http://lwn.net/Articles/253651/
- */
-
-#ifndef __LINUX_RCUPREEMPT_TRACE_H
-#define __LINUX_RCUPREEMPT_TRACE_H
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-
-#include <asm/atomic.h>
-
-/*
- * PREEMPT_RCU data structures.
- */
-
-struct rcupreempt_trace {
-	long		next_length;
-	long		next_add;
-	long		wait_length;
-	long		wait_add;
-	long		done_length;
-	long		done_add;
-	long		done_remove;
-	atomic_t	done_invoked;
-	long		rcu_check_callbacks;
-	atomic_t	rcu_try_flip_1;
-	atomic_t	rcu_try_flip_e1;
-	long		rcu_try_flip_i1;
-	long		rcu_try_flip_ie1;
-	long		rcu_try_flip_g1;
-	long		rcu_try_flip_a1;
-	long		rcu_try_flip_ae1;
-	long		rcu_try_flip_a2;
-	long		rcu_try_flip_z1;
-	long		rcu_try_flip_ze1;
-	long		rcu_try_flip_z2;
-	long		rcu_try_flip_m1;
-	long		rcu_try_flip_me1;
-	long		rcu_try_flip_m2;
-};
-
-#ifdef CONFIG_RCU_TRACE
-#define RCU_TRACE(fn, arg)	fn(arg);
-#else
-#define RCU_TRACE(fn, arg)
-#endif
-
-extern void rcupreempt_trace_move2done(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_invoke(struct rcupreempt_trace *trace);
-extern void rcupreempt_trace_next_add(struct rcupreempt_trace *trace);
-
-#endif /* __LINUX_RCUPREEMPT_TRACE_H */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 5a5153806c42..a89307717825 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -30,264 +30,57 @@
30#ifndef __LINUX_RCUTREE_H 30#ifndef __LINUX_RCUTREE_H
31#define __LINUX_RCUTREE_H 31#define __LINUX_RCUTREE_H
32 32
33#include <linux/cache.h> 33extern void rcu_sched_qs(int cpu);
34#include <linux/spinlock.h> 34extern void rcu_bh_qs(int cpu);
35#include <linux/threads.h>
36#include <linux/cpumask.h>
37#include <linux/seqlock.h>
38 35
39/* 36extern int rcu_needs_cpu(int cpu);
40 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
41 * In theory, it should be possible to add more levels straightforwardly.
42 * In practice, this has not been tested, so there is probably some
43 * bug somewhere.
44 */
45#define MAX_RCU_LVLS 3
46#define RCU_FANOUT (CONFIG_RCU_FANOUT)
47#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
48#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
49
50#if NR_CPUS <= RCU_FANOUT
51# define NUM_RCU_LVLS 1
52# define NUM_RCU_LVL_0 1
53# define NUM_RCU_LVL_1 (NR_CPUS)
54# define NUM_RCU_LVL_2 0
55# define NUM_RCU_LVL_3 0
56#elif NR_CPUS <= RCU_FANOUT_SQ
57# define NUM_RCU_LVLS 2
58# define NUM_RCU_LVL_0 1
59# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
60# define NUM_RCU_LVL_2 (NR_CPUS)
61# define NUM_RCU_LVL_3 0
62#elif NR_CPUS <= RCU_FANOUT_CUBE
63# define NUM_RCU_LVLS 3
64# define NUM_RCU_LVL_0 1
65# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
66# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
67# define NUM_RCU_LVL_3 NR_CPUS
68#else
69# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
70#endif /* #if (NR_CPUS) <= RCU_FANOUT */
71
72#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
73#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
74
75/*
76 * Dynticks per-CPU state.
77 */
78struct rcu_dynticks {
79 int dynticks_nesting; /* Track nesting level, sort of. */
80 int dynticks; /* Even value for dynticks-idle, else odd. */
81 int dynticks_nmi; /* Even value for either dynticks-idle or */
82 /* not in nmi handler, else odd. So this */
83 /* remains even for nmi from irq handler. */
84};
85
86/*
87 * Definition for node within the RCU grace-period-detection hierarchy.
88 */
89struct rcu_node {
90 spinlock_t lock;
91 unsigned long qsmask; /* CPUs or groups that need to switch in */
92 /* order for current grace period to proceed.*/
93 unsigned long qsmaskinit;
94 /* Per-GP initialization for qsmask. */
95 unsigned long grpmask; /* Mask to apply to parent qsmask. */
96 int grplo; /* lowest-numbered CPU or group here. */
97 int grphi; /* highest-numbered CPU or group here. */
98 u8 grpnum; /* CPU/group number for next level up. */
99 u8 level; /* root is at level 0. */
100 struct rcu_node *parent;
101} ____cacheline_internodealigned_in_smp;
102
103/* Index values for nxttail array in struct rcu_data. */
104#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
105#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
106#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */
107#define RCU_NEXT_TAIL 3
108#define RCU_NEXT_SIZE 4
109
110/* Per-CPU data for read-copy update. */
111struct rcu_data {
112 /* 1) quiescent-state and grace-period handling : */
113 long completed; /* Track rsp->completed gp number */
114 /* in order to detect GP end. */
115 long gpnum; /* Highest gp number that this CPU */
116 /* is aware of having started. */
117 long passed_quiesc_completed;
118 /* Value of completed at time of qs. */
119 bool passed_quiesc; /* User-mode/idle loop etc. */
120 bool qs_pending; /* Core waits for quiesc state. */
121 bool beenonline; /* CPU online at least once. */
122 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
123 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
124
125 /* 2) batch handling */
126 /*
127 * If nxtlist is not NULL, it is partitioned as follows.
128 * Any of the partitions might be empty, in which case the
129 * pointer to that partition will be equal to the pointer for
130 * the following partition. When the list is empty, all of
131 * the nxttail elements point to nxtlist, which is NULL.
132 *
133 * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
134 * Entries that might have arrived after current GP ended
135 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
136 * Entries known to have arrived before current GP ended
137 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
138 * Entries that batch # <= ->completed - 1: waiting for current GP
139 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
140 * Entries that batch # <= ->completed
141 * The grace period for these entries has completed, and
142 * the other grace-period-completed entries may be moved
143 * here temporarily in rcu_process_callbacks().
144 */
145 struct rcu_head *nxtlist;
146 struct rcu_head **nxttail[RCU_NEXT_SIZE];
147 long qlen; /* # of queued callbacks */
148 long blimit; /* Upper limit on a processed batch */
149
150#ifdef CONFIG_NO_HZ
151 /* 3) dynticks interface. */
152 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
153 int dynticks_snap; /* Per-GP tracking for dynticks. */
154 int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */
155#endif /* #ifdef CONFIG_NO_HZ */
156
157 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
158#ifdef CONFIG_NO_HZ
159 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
160#endif /* #ifdef CONFIG_NO_HZ */
161 unsigned long offline_fqs; /* Kicked due to being offline. */
162 unsigned long resched_ipi; /* Sent a resched IPI. */
163
164 /* 5) __rcu_pending() statistics. */
165 long n_rcu_pending; /* rcu_pending() calls since boot. */
166 long n_rp_qs_pending;
167 long n_rp_cb_ready;
168 long n_rp_cpu_needs_gp;
169 long n_rp_gp_completed;
170 long n_rp_gp_started;
171 long n_rp_need_fqs;
172 long n_rp_need_nothing;
173
174 int cpu;
175};
176
177/* Values for signaled field in struct rcu_state. */
178#define RCU_GP_INIT 0 /* Grace period being initialized. */
179#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */
180#define RCU_FORCE_QS 2 /* Need to force quiescent state. */
181#ifdef CONFIG_NO_HZ
182#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
183#else /* #ifdef CONFIG_NO_HZ */
184#define RCU_SIGNAL_INIT RCU_FORCE_QS
185#endif /* #else #ifdef CONFIG_NO_HZ */
186
187#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
188#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
189#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */
190#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */
191#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
192 /* to take at least one */
193 /* scheduling clock irq */
194 /* before ratting on them. */
195
196#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
197
198/*
199 * RCU global state, including node hierarchy. This hierarchy is
200 * represented in "heap" form in a dense array. The root (first level)
201 * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
202 * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
203 * and the third level in ->node[m+1] and following (->node[m+1] referenced
204 * by ->level[2]). The number of levels is determined by the number of
205 * CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy"
206 * consisting of a single rcu_node.
207 */
208struct rcu_state {
209 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
210 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
211 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
212 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
213 struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */
214
215 /* The following fields are guarded by the root rcu_node's lock. */
216
217 u8 signaled ____cacheline_internodealigned_in_smp;
218 /* Force QS state. */
219 long gpnum; /* Current gp number. */
220 long completed; /* # of last completed gp. */
221 spinlock_t onofflock; /* exclude on/offline and */
222 /* starting new GP. */
223 spinlock_t fqslock; /* Only one task forcing */
224 /* quiescent states. */
225 unsigned long jiffies_force_qs; /* Time at which to invoke */
226 /* force_quiescent_state(). */
227 unsigned long n_force_qs; /* Number of calls to */
228 /* force_quiescent_state(). */
229 unsigned long n_force_qs_lh; /* ~Number of calls leaving */
230 /* due to lock unavailable. */
231 unsigned long n_force_qs_ngp; /* Number of calls leaving */
232 /* due to no GP active. */
233#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
234 unsigned long gp_start; /* Time at which GP started, */
235 /* but in jiffies. */
236 unsigned long jiffies_stall; /* Time at which to check */
237 /* for CPU stalls. */
238#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
239#ifdef CONFIG_NO_HZ
240 long dynticks_completed; /* Value of completed @ snap. */
241#endif /* #ifdef CONFIG_NO_HZ */
242};
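As a rough illustration of the dense "heap" layout described above, the following stand-alone sketch computes how many nodes each level of such a tree needs for a given CPU count and fanout; the values and names here are assumptions for the example, not the kernel's build-time constants:

/* Illustrative sizing of a combining tree; assumes fanout >= 2. */
#include <stdio.h>

int main(void)
{
	int cpus = 4096, fanout = 64;	/* example values only */
	int levelcnt[32], levels = 0, n, start = 0;

	/* Walk up from the CPUs: each level groups 'fanout' children. */
	for (n = cpus; ; n = (n + fanout - 1) / fanout) {
		levelcnt[levels++] = (n + fanout - 1) / fanout;
		if (levelcnt[levels - 1] == 1)
			break;		/* reached the single root node */
	}

	/* Report the levels from the root down, with their node[] offsets. */
	for (int i = levels - 1; i >= 0; i--) {
		printf("level %d: %d node(s), starting at node[%d]\n",
		       levels - 1 - i, levelcnt[i], start);
		start += levelcnt[i];
	}
	return 0;
}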
243 37
244extern void rcu_qsctr_inc(int cpu); 38#ifdef CONFIG_TREE_PREEMPT_RCU
245extern void rcu_bh_qsctr_inc(int cpu);
246 39
247extern int rcu_pending(int cpu); 40extern void __rcu_read_lock(void);
248extern int rcu_needs_cpu(int cpu); 41extern void __rcu_read_unlock(void);
42extern void exit_rcu(void);
249 43
250#ifdef CONFIG_DEBUG_LOCK_ALLOC 44#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
251extern struct lockdep_map rcu_lock_map;
252# define rcu_read_acquire() \
253 lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_)
254# define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_)
255#else
256# define rcu_read_acquire() do { } while (0)
257# define rcu_read_release() do { } while (0)
258#endif
259 45
260static inline void __rcu_read_lock(void) 46static inline void __rcu_read_lock(void)
261{ 47{
262 preempt_disable(); 48 preempt_disable();
263 __acquire(RCU);
264 rcu_read_acquire();
265} 49}
50
266static inline void __rcu_read_unlock(void) 51static inline void __rcu_read_unlock(void)
267{ 52{
268 rcu_read_release();
269 __release(RCU);
270 preempt_enable(); 53 preempt_enable();
271} 54}
55
56static inline void exit_rcu(void)
57{
58}
59
60#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
61
272static inline void __rcu_read_lock_bh(void) 62static inline void __rcu_read_lock_bh(void)
273{ 63{
274 local_bh_disable(); 64 local_bh_disable();
275 __acquire(RCU_BH);
276 rcu_read_acquire();
277} 65}
278static inline void __rcu_read_unlock_bh(void) 66static inline void __rcu_read_unlock_bh(void)
279{ 67{
280 rcu_read_release();
281 __release(RCU_BH);
282 local_bh_enable(); 68 local_bh_enable();
283} 69}
284 70
285#define __synchronize_sched() synchronize_rcu() 71#define __synchronize_sched() synchronize_rcu()
286 72
287#define call_rcu_sched(head, func) call_rcu(head, func) 73extern void call_rcu_sched(struct rcu_head *head,
74 void (*func)(struct rcu_head *rcu));
288 75
289static inline void rcu_init_sched(void) 76static inline void synchronize_rcu_expedited(void)
290{ 77{
78 synchronize_sched_expedited();
79}
80
81static inline void synchronize_rcu_bh_expedited(void)
82{
83 synchronize_sched_expedited();
291} 84}
292 85
293extern void __rcu_init(void); 86extern void __rcu_init(void);
@@ -296,6 +89,11 @@ extern void rcu_restart_cpu(int cpu);
296 89
297extern long rcu_batches_completed(void); 90extern long rcu_batches_completed(void);
298extern long rcu_batches_completed_bh(void); 91extern long rcu_batches_completed_bh(void);
92extern long rcu_batches_completed_sched(void);
93
94static inline void rcu_init_sched(void)
95{
96}
299 97
300#ifdef CONFIG_NO_HZ 98#ifdef CONFIG_NO_HZ
301void rcu_enter_nohz(void); 99void rcu_enter_nohz(void);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0f1ea4a66957..bd2675efd68d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1163,6 +1163,8 @@ struct sched_rt_entity {
1163#endif 1163#endif
1164}; 1164};
1165 1165
1166struct rcu_node;
1167
1166struct task_struct { 1168struct task_struct {
1167 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ 1169 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
1168 void *stack; 1170 void *stack;
@@ -1206,10 +1208,12 @@ struct task_struct {
1206 unsigned int policy; 1208 unsigned int policy;
1207 cpumask_t cpus_allowed; 1209 cpumask_t cpus_allowed;
1208 1210
1209#ifdef CONFIG_PREEMPT_RCU 1211#ifdef CONFIG_TREE_PREEMPT_RCU
1210 int rcu_read_lock_nesting; 1212 int rcu_read_lock_nesting;
1211 int rcu_flipctr_idx; 1213 char rcu_read_unlock_special;
1212#endif /* #ifdef CONFIG_PREEMPT_RCU */ 1214 struct rcu_node *rcu_blocked_node;
1215 struct list_head rcu_node_entry;
1216#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1213 1217
1214#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1218#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1215 struct sched_info sched_info; 1219 struct sched_info sched_info;
@@ -1724,6 +1728,28 @@ extern cputime_t task_gtime(struct task_struct *p);
1724#define tsk_used_math(p) ((p)->flags & PF_USED_MATH) 1728#define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
1725#define used_math() tsk_used_math(current) 1729#define used_math() tsk_used_math(current)
1726 1730
1731#ifdef CONFIG_TREE_PREEMPT_RCU
1732
1733#define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
1734#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
1735#define RCU_READ_UNLOCK_GOT_QS (1 << 2) /* CPU has responded to RCU core. */
1736
1737static inline void rcu_copy_process(struct task_struct *p)
1738{
1739 p->rcu_read_lock_nesting = 0;
1740 p->rcu_read_unlock_special = 0;
1741 p->rcu_blocked_node = NULL;
1742 INIT_LIST_HEAD(&p->rcu_node_entry);
1743}
1744
1745#else
1746
1747static inline void rcu_copy_process(struct task_struct *p)
1748{
1749}
1750
1751#endif
1752
1727#ifdef CONFIG_SMP 1753#ifdef CONFIG_SMP
1728extern int set_cpus_allowed_ptr(struct task_struct *p, 1754extern int set_cpus_allowed_ptr(struct task_struct *p,
1729 const struct cpumask *new_mask); 1755 const struct cpumask *new_mask);
diff --git a/init/Kconfig b/init/Kconfig
index 3f7e60995c80..8e8b76d8a272 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -316,38 +316,28 @@ choice
316 prompt "RCU Implementation" 316 prompt "RCU Implementation"
317 default TREE_RCU 317 default TREE_RCU
318 318
319config CLASSIC_RCU
320 bool "Classic RCU"
321 help
322 This option selects the classic RCU implementation that is
323 designed for best read-side performance on non-realtime
324 systems.
325
326 Select this option if you are unsure.
327
328config TREE_RCU 319config TREE_RCU
329 bool "Tree-based hierarchical RCU" 320 bool "Tree-based hierarchical RCU"
330 help 321 help
331 This option selects the RCU implementation that is 322 This option selects the RCU implementation that is
332 designed for very large SMP systems with hundreds or 323 designed for very large SMP systems with hundreds or
333 thousands of CPUs. 324 thousands of CPUs. It also scales down nicely to
325 smaller systems.
334 326
335config PREEMPT_RCU 327config TREE_PREEMPT_RCU
336 bool "Preemptible RCU" 328 bool "Preemptable tree-based hierarchical RCU"
337 depends on PREEMPT 329 depends on PREEMPT
338 help 330 help
339 This option reduces the latency of the kernel by making certain 331 This option selects the RCU implementation that is
340 RCU sections preemptible. Normally RCU code is non-preemptible, if 332 designed for very large SMP systems with hundreds or
341 this option is selected then read-only RCU sections become 333 thousands of CPUs, but for which real-time response
342 preemptible. This helps latency, but may expose bugs due to 334 is also required.
343 now-naive assumptions about each RCU read-side critical section
344 remaining on a given CPU through its execution.
345 335
346endchoice 336endchoice
347 337
348config RCU_TRACE 338config RCU_TRACE
349 bool "Enable tracing for RCU" 339 bool "Enable tracing for RCU"
350 depends on TREE_RCU || PREEMPT_RCU 340 depends on TREE_RCU || TREE_PREEMPT_RCU
351 help 341 help
352 This option provides tracing in RCU which presents stats 342 This option provides tracing in RCU which presents stats
353 in debugfs for debugging RCU implementation. 343 in debugfs for debugging RCU implementation.
@@ -359,7 +349,7 @@ config RCU_FANOUT
359 int "Tree-based hierarchical RCU fanout value" 349 int "Tree-based hierarchical RCU fanout value"
360 range 2 64 if 64BIT 350 range 2 64 if 64BIT
361 range 2 32 if !64BIT 351 range 2 32 if !64BIT
362 depends on TREE_RCU 352 depends on TREE_RCU || TREE_PREEMPT_RCU
363 default 64 if 64BIT 353 default 64 if 64BIT
364 default 32 if !64BIT 354 default 32 if !64BIT
365 help 355 help
@@ -374,7 +364,7 @@ config RCU_FANOUT
374 364
375config RCU_FANOUT_EXACT 365config RCU_FANOUT_EXACT
376 bool "Disable tree-based hierarchical RCU auto-balancing" 366 bool "Disable tree-based hierarchical RCU auto-balancing"
377 depends on TREE_RCU 367 depends on TREE_RCU || TREE_PREEMPT_RCU
378 default n 368 default n
379 help 369 help
380 This option forces use of the exact RCU_FANOUT value specified, 370 This option forces use of the exact RCU_FANOUT value specified,
@@ -387,18 +377,12 @@ config RCU_FANOUT_EXACT
387 Say N if unsure. 377 Say N if unsure.
388 378
389config TREE_RCU_TRACE 379config TREE_RCU_TRACE
390 def_bool RCU_TRACE && TREE_RCU 380 def_bool RCU_TRACE && ( TREE_RCU || TREE_PREEMPT_RCU )
391 select DEBUG_FS
392 help
393 This option provides tracing for the TREE_RCU implementation,
394 permitting Makefile to trivially select kernel/rcutree_trace.c.
395
396config PREEMPT_RCU_TRACE
397 def_bool RCU_TRACE && PREEMPT_RCU
398 select DEBUG_FS 381 select DEBUG_FS
399 help 382 help
400 This option provides tracing for the PREEMPT_RCU implementation, 383 This option provides tracing for the TREE_RCU and
401 permitting Makefile to trivially select kernel/rcupreempt_trace.c. 384 TREE_PREEMPT_RCU implementations, permitting Makefile to
385 trivially select kernel/rcutree_trace.c.
402 386
403endmenu # "RCU Subsystem" 387endmenu # "RCU Subsystem"
404 388
diff --git a/kernel/Makefile b/kernel/Makefile
index 2093a691f1c2..b833bd5cc127 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -80,11 +80,9 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
80obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 80obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
81obj-$(CONFIG_SECCOMP) += seccomp.o 81obj-$(CONFIG_SECCOMP) += seccomp.o
82obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 82obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
83obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
84obj-$(CONFIG_TREE_RCU) += rcutree.o 83obj-$(CONFIG_TREE_RCU) += rcutree.o
85obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o 84obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
86obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o 85obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
87obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
88obj-$(CONFIG_RELAY) += relay.o 86obj-$(CONFIG_RELAY) += relay.o
89obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 87obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
90obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 88obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/exit.c b/kernel/exit.c
index 869dc221733e..263f95ed7201 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1010,6 +1010,7 @@ NORET_TYPE void do_exit(long code)
1010 __free_pipe_info(tsk->splice_pipe); 1010 __free_pipe_info(tsk->splice_pipe);
1011 1011
1012 preempt_disable(); 1012 preempt_disable();
1013 exit_rcu();
1013 /* causes final put_task_struct in finish_task_switch(). */ 1014 /* causes final put_task_struct in finish_task_switch(). */
1014 tsk->state = TASK_DEAD; 1015 tsk->state = TASK_DEAD;
1015 schedule(); 1016 schedule();
diff --git a/kernel/fork.c b/kernel/fork.c
index e6c04d462ab2..637520ca0386 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1008,10 +1008,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1008 copy_flags(clone_flags, p); 1008 copy_flags(clone_flags, p);
1009 INIT_LIST_HEAD(&p->children); 1009 INIT_LIST_HEAD(&p->children);
1010 INIT_LIST_HEAD(&p->sibling); 1010 INIT_LIST_HEAD(&p->sibling);
1011#ifdef CONFIG_PREEMPT_RCU 1011 rcu_copy_process(p);
1012 p->rcu_read_lock_nesting = 0;
1013 p->rcu_flipctr_idx = 0;
1014#endif /* #ifdef CONFIG_PREEMPT_RCU */
1015 p->vfork_done = NULL; 1012 p->vfork_done = NULL;
1016 spin_lock_init(&p->alloc_lock); 1013 spin_lock_init(&p->alloc_lock);
1017 1014
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
deleted file mode 100644
index 0f2b0b311304..000000000000
--- a/kernel/rcuclassic.c
+++ /dev/null
@@ -1,807 +0,0 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2001
19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com>
22 *
23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
25 * Papers:
26 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
27 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
28 *
29 * For detailed explanation of Read-Copy Update mechanism see -
30 * Documentation/RCU
31 *
32 */
33#include <linux/types.h>
34#include <linux/kernel.h>
35#include <linux/init.h>
36#include <linux/spinlock.h>
37#include <linux/smp.h>
38#include <linux/rcupdate.h>
39#include <linux/interrupt.h>
40#include <linux/sched.h>
41#include <asm/atomic.h>
42#include <linux/bitops.h>
43#include <linux/module.h>
44#include <linux/completion.h>
45#include <linux/moduleparam.h>
46#include <linux/percpu.h>
47#include <linux/notifier.h>
48#include <linux/cpu.h>
49#include <linux/mutex.h>
50#include <linux/time.h>
51
52#ifdef CONFIG_DEBUG_LOCK_ALLOC
53static struct lock_class_key rcu_lock_key;
54struct lockdep_map rcu_lock_map =
55 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
56EXPORT_SYMBOL_GPL(rcu_lock_map);
57#endif
58
59
60/* Definition for rcupdate control block. */
61static struct rcu_ctrlblk rcu_ctrlblk = {
62 .cur = -300,
63 .completed = -300,
64 .pending = -300,
65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
66 .cpumask = CPU_BITS_NONE,
67};
68
69static struct rcu_ctrlblk rcu_bh_ctrlblk = {
70 .cur = -300,
71 .completed = -300,
72 .pending = -300,
73 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
74 .cpumask = CPU_BITS_NONE,
75};
76
77static DEFINE_PER_CPU(struct rcu_data, rcu_data);
78static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
79
80/*
81 * Increment the quiescent state counter.
82 * The counter is a bit degenerated: We do not need to know
83 * how many quiescent states passed, just if there was at least
84 * one since the start of the grace period. Thus just a flag.
85 */
86void rcu_qsctr_inc(int cpu)
87{
88 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
89 rdp->passed_quiesc = 1;
90}
91
92void rcu_bh_qsctr_inc(int cpu)
93{
94 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
95 rdp->passed_quiesc = 1;
96}
97
98static int blimit = 10;
99static int qhimark = 10000;
100static int qlowmark = 100;
101
102#ifdef CONFIG_SMP
103static void force_quiescent_state(struct rcu_data *rdp,
104 struct rcu_ctrlblk *rcp)
105{
106 int cpu;
107 unsigned long flags;
108
109 set_need_resched();
110 spin_lock_irqsave(&rcp->lock, flags);
111 if (unlikely(!rcp->signaled)) {
112 rcp->signaled = 1;
113 /*
114 * Don't send IPI to itself. With irqs disabled,
115 * rdp->cpu is the current cpu.
116 *
117 * cpu_online_mask is updated by the _cpu_down()
118 * using __stop_machine(). Since we're in an irqs-disabled
119 * section, __stop_machine() is not executing, hence
120 * the cpu_online_mask is stable.
121 *
122 * However, a cpu might have been offlined _just_ before
123 * we disabled irqs while entering here.
124 * And rcu subsystem might not yet have handled the CPU_DEAD
125 * notification, leading to the offlined cpu's bit
126 * being set in the rcp->cpumask.
127 *
128 * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent
129 * sending smp_send_reschedule() to an offlined CPU.
130 */
131 for_each_cpu_and(cpu,
132 to_cpumask(rcp->cpumask), cpu_online_mask) {
133 if (cpu != rdp->cpu)
134 smp_send_reschedule(cpu);
135 }
136 }
137 spin_unlock_irqrestore(&rcp->lock, flags);
138}
139#else
140static inline void force_quiescent_state(struct rcu_data *rdp,
141 struct rcu_ctrlblk *rcp)
142{
143 set_need_resched();
144}
145#endif
146
147static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
148 struct rcu_data *rdp)
149{
150 long batch;
151
152 head->next = NULL;
153 smp_mb(); /* Read of rcu->cur must happen after any change by caller. */
154
155 /*
156 * Determine the batch number of this callback.
157 *
158 * Using ACCESS_ONCE to avoid the following error when gcc eliminates
159 * the local variable "batch" and emits code like this:
160 * 1) rdp->batch = rcp->cur + 1 # gets old value
161 * ......
162 * 2) rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value
163 * then [*nxttail[0], *nxttail[1]) may contain callbacks
164 * whose batch# = rdp->batch; see the comment of struct rcu_data.
165 */
166 batch = ACCESS_ONCE(rcp->cur) + 1;
167
168 if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) {
169 /* process callbacks */
170 rdp->nxttail[0] = rdp->nxttail[1];
171 rdp->nxttail[1] = rdp->nxttail[2];
172 if (rcu_batch_after(batch - 1, rdp->batch))
173 rdp->nxttail[0] = rdp->nxttail[2];
174 }
175
176 rdp->batch = batch;
177 *rdp->nxttail[2] = head;
178 rdp->nxttail[2] = &head->next;
179
180 if (unlikely(++rdp->qlen > qhimark)) {
181 rdp->blimit = INT_MAX;
182 force_quiescent_state(rdp, &rcu_ctrlblk);
183 }
184}
185
186#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
187
188static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
189{
190 rcp->gp_start = jiffies;
191 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
192}
193
194static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
195{
196 int cpu;
197 long delta;
198 unsigned long flags;
199
200 /* Only let one CPU complain about others per time interval. */
201
202 spin_lock_irqsave(&rcp->lock, flags);
203 delta = jiffies - rcp->jiffies_stall;
204 if (delta < 2 || rcp->cur != rcp->completed) {
205 spin_unlock_irqrestore(&rcp->lock, flags);
206 return;
207 }
208 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
209 spin_unlock_irqrestore(&rcp->lock, flags);
210
211 /* OK, time to rat on our buddy... */
212
213 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
214 for_each_possible_cpu(cpu) {
215 if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask)))
216 printk(" %d", cpu);
217 }
218 printk(" (detected by %d, t=%ld jiffies)\n",
219 smp_processor_id(), (long)(jiffies - rcp->gp_start));
220}
221
222static void print_cpu_stall(struct rcu_ctrlblk *rcp)
223{
224 unsigned long flags;
225
226 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
227 smp_processor_id(), jiffies,
228 jiffies - rcp->gp_start);
229 dump_stack();
230 spin_lock_irqsave(&rcp->lock, flags);
231 if ((long)(jiffies - rcp->jiffies_stall) >= 0)
232 rcp->jiffies_stall =
233 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
234 spin_unlock_irqrestore(&rcp->lock, flags);
235 set_need_resched(); /* kick ourselves to get things going. */
236}
237
238static void check_cpu_stall(struct rcu_ctrlblk *rcp)
239{
240 long delta;
241
242 delta = jiffies - rcp->jiffies_stall;
243 if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) &&
244 delta >= 0) {
245
246 /* We haven't checked in, so go dump stack. */
247 print_cpu_stall(rcp);
248
249 } else if (rcp->cur != rcp->completed && delta >= 2) {
250
251 /* They had two seconds to dump stack, so complain. */
252 print_other_cpu_stall(rcp);
253 }
254}
255
256#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
257
258static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
259{
260}
261
262static inline void check_cpu_stall(struct rcu_ctrlblk *rcp)
263{
264}
265
266#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
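The stall-detector deadlines above are compared against jiffies using signed subtraction (for example, (long)(jiffies - rcp->jiffies_stall) >= 0), which stays correct even when the counter wraps. A minimal stand-alone sketch of that idiom (names are illustrative, not kernel code):

/* Wraparound-safe "has the deadline passed?" check, as in the kernel idiom. */
#include <stdio.h>
#include <limits.h>

static int deadline_passed(unsigned long now, unsigned long deadline)
{
	return (long)(now - deadline) >= 0;	/* signed difference handles wrap */
}

int main(void)
{
	unsigned long deadline = ULONG_MAX - 10; /* set just before the counter wraps */
	unsigned long now = 5;			 /* counter has since wrapped past 0 */

	printf("passed? %d\n", deadline_passed(now, deadline));	/* prints 1 */
	printf("passed? %d\n", deadline_passed(deadline, now));	/* prints 0 */
	return 0;
}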
267
268/**
269 * call_rcu - Queue an RCU callback for invocation after a grace period.
270 * @head: structure to be used for queueing the RCU updates.
271 * @func: actual update function to be invoked after the grace period
272 *
273 * The update function will be invoked some time after a full grace
274 * period elapses, in other words after all currently executing RCU
275 * read-side critical sections have completed. RCU read-side critical
276 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
277 * and may be nested.
278 */
279void call_rcu(struct rcu_head *head,
280 void (*func)(struct rcu_head *rcu))
281{
282 unsigned long flags;
283
284 head->func = func;
285 local_irq_save(flags);
286 __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
287 local_irq_restore(flags);
288}
289EXPORT_SYMBOL_GPL(call_rcu);
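For reference, a typical caller of the interface documented above embeds the rcu_head in the structure being protected and recovers the enclosing object with container_of() in the callback. The sketch below uses illustrative names (struct foo, foo_release), not symbols from this patch:

/* Typical call_rcu() usage sketch; 'struct foo' is illustrative only. */
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int data;
	struct rcu_head rcu;	/* reclamation hook for call_rcu() */
};

static void foo_release(struct rcu_head *head)
{
	/* Runs after all pre-existing RCU readers have finished. */
	kfree(container_of(head, struct foo, rcu));
}

static void foo_retire(struct foo *fp)
{
	/* Readers may still hold references; defer the free. */
	call_rcu(&fp->rcu, foo_release);
}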
290
291/**
292 * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
293 * @head: structure to be used for queueing the RCU updates.
294 * @func: actual update function to be invoked after the grace period
295 *
296 * The update function will be invoked some time after a full grace
297 * period elapses, in other words after all currently executing RCU
298 * read-side critical sections have completed. call_rcu_bh() assumes
299 * that the read-side critical sections end on completion of a softirq
300 * handler. This means that read-side critical sections in process
301 * context must not be interrupted by softirqs. This interface is to be
302 * used when most of the read-side critical sections are in softirq context.
303 * RCU read-side critical sections are delimited by rcu_read_lock() and
304 * rcu_read_unlock() if in interrupt context, or rcu_read_lock_bh()
305 * and rcu_read_unlock_bh(), if in process context. These may be nested.
306 */
307void call_rcu_bh(struct rcu_head *head,
308 void (*func)(struct rcu_head *rcu))
309{
310 unsigned long flags;
311
312 head->func = func;
313 local_irq_save(flags);
314 __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
315 local_irq_restore(flags);
316}
317EXPORT_SYMBOL_GPL(call_rcu_bh);
318
319/*
320 * Return the number of RCU batches processed thus far. Useful
321 * for debug and statistics.
322 */
323long rcu_batches_completed(void)
324{
325 return rcu_ctrlblk.completed;
326}
327EXPORT_SYMBOL_GPL(rcu_batches_completed);
328
329/*
330 * Return the number of RCU batches processed thus far. Useful
331 * for debug and statistics.
332 */
333long rcu_batches_completed_bh(void)
334{
335 return rcu_bh_ctrlblk.completed;
336}
337EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
338
339/* Raises the softirq for processing rcu_callbacks. */
340static inline void raise_rcu_softirq(void)
341{
342 raise_softirq(RCU_SOFTIRQ);
343}
344
345/*
346 * Invoke the completed RCU callbacks. They are expected to be in
347 * a per-cpu list.
348 */
349static void rcu_do_batch(struct rcu_data *rdp)
350{
351 unsigned long flags;
352 struct rcu_head *next, *list;
353 int count = 0;
354
355 list = rdp->donelist;
356 while (list) {
357 next = list->next;
358 prefetch(next);
359 list->func(list);
360 list = next;
361 if (++count >= rdp->blimit)
362 break;
363 }
364 rdp->donelist = list;
365
366 local_irq_save(flags);
367 rdp->qlen -= count;
368 local_irq_restore(flags);
369 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
370 rdp->blimit = blimit;
371
372 if (!rdp->donelist)
373 rdp->donetail = &rdp->donelist;
374 else
375 raise_rcu_softirq();
376}
377
378/*
379 * Grace period handling:
380 * The grace period handling consists of two steps:
381 * - A new grace period is started.
382 * This is done by rcu_start_batch. The start is not broadcast to
383 * all cpus; they must pick this up by comparing rcp->cur with
384 * rdp->quiescbatch. All cpus are recorded in the
385 * rcu_ctrlblk.cpumask bitmap.
386 * - All cpus must go through a quiescent state.
387 * Since the start of the grace period is not broadcast, at least two
388 * calls to rcu_check_quiescent_state are required:
389 * The first call just notices that a new grace period is running. The
390 * following calls check if there was a quiescent state since the beginning
391 * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
392 * the bitmap is empty, then the grace period is completed.
393 * rcu_check_quiescent_state calls rcu_start_batch() to start the next grace
394 * period (if necessary).
395 */
396
397/*
398 * Register a new batch of callbacks, and start it up if there is currently no
399 * active batch and the batch to be registered has not already occurred.
400 * Caller must hold rcu_ctrlblk.lock.
401 */
402static void rcu_start_batch(struct rcu_ctrlblk *rcp)
403{
404 if (rcp->cur != rcp->pending &&
405 rcp->completed == rcp->cur) {
406 rcp->cur++;
407 record_gp_stall_check_time(rcp);
408
409 /*
410 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
411 * Barrier Otherwise it can cause tickless idle CPUs to be
412 * included in rcp->cpumask, which will extend graceperiods
413 * unnecessarily.
414 */
415 smp_mb();
416 cpumask_andnot(to_cpumask(rcp->cpumask),
417 cpu_online_mask, nohz_cpu_mask);
418
419 rcp->signaled = 0;
420 }
421}
422
423/*
424 * cpu went through a quiescent state since the beginning of the grace period.
425 * Clear it from the cpu mask and complete the grace period if it was the last
426 * cpu. Start another grace period if someone has further entries pending
427 */
428static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
429{
430 cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask));
431 if (cpumask_empty(to_cpumask(rcp->cpumask))) {
432 /* batch completed ! */
433 rcp->completed = rcp->cur;
434 rcu_start_batch(rcp);
435 }
436}
437
438/*
439 * Check if the cpu has gone through a quiescent state (say context
440 * switch). If so, and if it hasn't already done so in this RCU
441 * quiescent cycle, then indicate that it has done so.
442 */
443static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
444 struct rcu_data *rdp)
445{
446 unsigned long flags;
447
448 if (rdp->quiescbatch != rcp->cur) {
449 /* start new grace period: */
450 rdp->qs_pending = 1;
451 rdp->passed_quiesc = 0;
452 rdp->quiescbatch = rcp->cur;
453 return;
454 }
455
456 /* Grace period already completed for this cpu?
457 * qs_pending is checked instead of the actual bitmap to avoid
458 * cacheline thrashing.
459 */
460 if (!rdp->qs_pending)
461 return;
462
463 /*
464 * Was there a quiescent state since the beginning of the grace
465 * period? If no, then exit and wait for the next call.
466 */
467 if (!rdp->passed_quiesc)
468 return;
469 rdp->qs_pending = 0;
470
471 spin_lock_irqsave(&rcp->lock, flags);
472 /*
473 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
474 * during cpu startup. Ignore the quiescent state.
475 */
476 if (likely(rdp->quiescbatch == rcp->cur))
477 cpu_quiet(rdp->cpu, rcp);
478
479 spin_unlock_irqrestore(&rcp->lock, flags);
480}
481
482
483#ifdef CONFIG_HOTPLUG_CPU
484
485/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
486 * the locking requirements; the list it's pulling from has to belong to a cpu
487 * which is dead and hence not processing interrupts.
488 */
489static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
490 struct rcu_head **tail, long batch)
491{
492 unsigned long flags;
493
494 if (list) {
495 local_irq_save(flags);
496 this_rdp->batch = batch;
497 *this_rdp->nxttail[2] = list;
498 this_rdp->nxttail[2] = tail;
499 local_irq_restore(flags);
500 }
501}
502
503static void __rcu_offline_cpu(struct rcu_data *this_rdp,
504 struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
505{
506 unsigned long flags;
507
508 /*
509 * if the cpu going offline owns the grace period
510 * we can block indefinitely waiting for it, so flush
511 * it here
512 */
513 spin_lock_irqsave(&rcp->lock, flags);
514 if (rcp->cur != rcp->completed)
515 cpu_quiet(rdp->cpu, rcp);
516 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
517 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
518 spin_unlock(&rcp->lock);
519
520 this_rdp->qlen += rdp->qlen;
521 local_irq_restore(flags);
522}
523
524static void rcu_offline_cpu(int cpu)
525{
526 struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
527 struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
528
529 __rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
530 &per_cpu(rcu_data, cpu));
531 __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
532 &per_cpu(rcu_bh_data, cpu));
533 put_cpu_var(rcu_data);
534 put_cpu_var(rcu_bh_data);
535}
536
537#else
538
539static void rcu_offline_cpu(int cpu)
540{
541}
542
543#endif
544
545/*
546 * This does the RCU processing work from softirq context.
547 */
548static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
549 struct rcu_data *rdp)
550{
551 unsigned long flags;
552 long completed_snap;
553
554 if (rdp->nxtlist) {
555 local_irq_save(flags);
556 completed_snap = ACCESS_ONCE(rcp->completed);
557
558 /*
559 * move the other grace-period-completed entries to
560 * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
561 */
562 if (!rcu_batch_before(completed_snap, rdp->batch))
563 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
564 else if (!rcu_batch_before(completed_snap, rdp->batch - 1))
565 rdp->nxttail[0] = rdp->nxttail[1];
566
567 /*
568 * the grace period for entries in
569 * [rdp->nxtlist, *rdp->nxttail[0]) has completed, so
570 * move these entries to the donelist
571 */
572 if (rdp->nxttail[0] != &rdp->nxtlist) {
573 *rdp->donetail = rdp->nxtlist;
574 rdp->donetail = rdp->nxttail[0];
575 rdp->nxtlist = *rdp->nxttail[0];
576 *rdp->donetail = NULL;
577
578 if (rdp->nxttail[1] == rdp->nxttail[0])
579 rdp->nxttail[1] = &rdp->nxtlist;
580 if (rdp->nxttail[2] == rdp->nxttail[0])
581 rdp->nxttail[2] = &rdp->nxtlist;
582 rdp->nxttail[0] = &rdp->nxtlist;
583 }
584
585 local_irq_restore(flags);
586
587 if (rcu_batch_after(rdp->batch, rcp->pending)) {
588 unsigned long flags2;
589
590 /* and start it/schedule start if it's a new batch */
591 spin_lock_irqsave(&rcp->lock, flags2);
592 if (rcu_batch_after(rdp->batch, rcp->pending)) {
593 rcp->pending = rdp->batch;
594 rcu_start_batch(rcp);
595 }
596 spin_unlock_irqrestore(&rcp->lock, flags2);
597 }
598 }
599
600 rcu_check_quiescent_state(rcp, rdp);
601 if (rdp->donelist)
602 rcu_do_batch(rdp);
603}
604
605static void rcu_process_callbacks(struct softirq_action *unused)
606{
607 /*
608 * Memory references from any prior RCU read-side critical sections
609 * executed by the interrupted code must be seen before any RCU
610 * grace-period manipulations below.
611 */
612
613 smp_mb(); /* See above block comment. */
614
615 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
616 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
617
618 /*
619 * Memory references from any later RCU read-side critical sections
620 * executed by the interrupted code must be seen after any RCU
621 * grace-period manipulations above.
622 */
623
624 smp_mb(); /* See above block comment. */
625}
626
627static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
628{
629 /* Check for CPU stalls, if enabled. */
630 check_cpu_stall(rcp);
631
632 if (rdp->nxtlist) {
633 long completed_snap = ACCESS_ONCE(rcp->completed);
634
635 /*
636 * This cpu has pending rcu entries and the grace period
637 * for them has completed.
638 */
639 if (!rcu_batch_before(completed_snap, rdp->batch))
640 return 1;
641 if (!rcu_batch_before(completed_snap, rdp->batch - 1) &&
642 rdp->nxttail[0] != rdp->nxttail[1])
643 return 1;
644 if (rdp->nxttail[0] != &rdp->nxtlist)
645 return 1;
646
647 /*
648 * This cpu has pending rcu entries and the new batch
649 * for them hasn't been started nor scheduled to start
650 */
651 if (rcu_batch_after(rdp->batch, rcp->pending))
652 return 1;
653 }
654
655 /* This cpu has finished callbacks to invoke */
656 if (rdp->donelist)
657 return 1;
658
659 /* The rcu core waits for a quiescent state from the cpu */
660 if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
661 return 1;
662
663 /* nothing to do */
664 return 0;
665}
666
667/*
668 * Check to see if there is any immediate RCU-related work to be done
669 * by the current CPU, returning 1 if so. This function is part of the
670 * RCU implementation; it is -not- an exported member of the RCU API.
671 */
672int rcu_pending(int cpu)
673{
674 return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
675 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
676}
677
678/*
679 * Check to see if any future RCU-related work will need to be done
680 * by the current CPU, even if none need be done immediately, returning
681 * 1 if so. This function is part of the RCU implementation; it is -not-
682 * an exported member of the RCU API.
683 */
684int rcu_needs_cpu(int cpu)
685{
686 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
687 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
688
689 return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
690}
691
692/*
693 * Top-level function driving RCU grace-period detection, normally
694 * invoked from the scheduler-clock interrupt. This function simply
695 * increments counters that are read only from softirq by this same
696 * CPU, so there are no memory barriers required.
697 */
698void rcu_check_callbacks(int cpu, int user)
699{
700 if (user ||
701 (idle_cpu(cpu) && rcu_scheduler_active &&
702 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
703
704 /*
705 * Get here if this CPU took its interrupt from user
706 * mode or from the idle loop, and if this is not a
707 * nested interrupt. In this case, the CPU is in
708 * a quiescent state, so count it.
709 *
710 * Also do a memory barrier. This is needed to handle
711 * the case where writes from a preempt-disable section
712 * of code get reordered into schedule() by this CPU's
713 * write buffer. The memory barrier makes sure that
714 * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are seen
715 * by other CPUs to happen after any such write.
716 */
717
718 smp_mb(); /* See above block comment. */
719 rcu_qsctr_inc(cpu);
720 rcu_bh_qsctr_inc(cpu);
721
722 } else if (!in_softirq()) {
723
724 /*
725 * Get here if this CPU did not take its interrupt from
726 * softirq, in other words, if it is not interrupting
727 * a rcu_bh read-side critical section. This is an _bh
728 * critical section, so count it. The memory barrier
729 * is needed for the same reason as is the above one.
730 */
731
732 smp_mb(); /* See above block comment. */
733 rcu_bh_qsctr_inc(cpu);
734 }
735 raise_rcu_softirq();
736}
737
738static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
739 struct rcu_data *rdp)
740{
741 unsigned long flags;
742
743 spin_lock_irqsave(&rcp->lock, flags);
744 memset(rdp, 0, sizeof(*rdp));
745 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
746 rdp->donetail = &rdp->donelist;
747 rdp->quiescbatch = rcp->completed;
748 rdp->qs_pending = 0;
749 rdp->cpu = cpu;
750 rdp->blimit = blimit;
751 spin_unlock_irqrestore(&rcp->lock, flags);
752}
753
754static void __cpuinit rcu_online_cpu(int cpu)
755{
756 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
757 struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
758
759 rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
760 rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
761 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
762}
763
764static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
765 unsigned long action, void *hcpu)
766{
767 long cpu = (long)hcpu;
768
769 switch (action) {
770 case CPU_UP_PREPARE:
771 case CPU_UP_PREPARE_FROZEN:
772 rcu_online_cpu(cpu);
773 break;
774 case CPU_DEAD:
775 case CPU_DEAD_FROZEN:
776 rcu_offline_cpu(cpu);
777 break;
778 default:
779 break;
780 }
781 return NOTIFY_OK;
782}
783
784static struct notifier_block __cpuinitdata rcu_nb = {
785 .notifier_call = rcu_cpu_notify,
786};
787
788/*
789 * Initializes the rcu mechanism. Assumed to be called early,
790 * that is, before the local timer (SMP) or jiffies timer (uniproc) is set up.
791 * Note that rcu_qsctr and friends are implicitly
792 * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
793 */
794void __init __rcu_init(void)
795{
796#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
797 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
798#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
799 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
800 (void *)(long)smp_processor_id());
801 /* Register notifier for non-boot CPUs */
802 register_cpu_notifier(&rcu_nb);
803}
804
805module_param(blimit, int, 0);
806module_param(qhimark, int, 0);
807module_param(qlowmark, int, 0);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a967c9feb90a..bd5d5c8e5140 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -98,6 +98,30 @@ void synchronize_rcu(void)
98} 98}
99EXPORT_SYMBOL_GPL(synchronize_rcu); 99EXPORT_SYMBOL_GPL(synchronize_rcu);
100 100
101/**
102 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
103 *
104 * Control will return to the caller some time after a full rcu_bh grace
105 * period has elapsed, in other words after all currently executing rcu_bh
106 * read-side critical sections have completed. RCU read-side critical
107 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
108 * and may be nested.
109 */
110void synchronize_rcu_bh(void)
111{
112 struct rcu_synchronize rcu;
113
114 if (rcu_blocking_is_gp())
115 return;
116
117 init_completion(&rcu.completion);
118 /* Will wake me after RCU finished. */
119 call_rcu_bh(&rcu.head, wakeme_after_rcu);
120 /* Wait for it. */
121 wait_for_completion(&rcu.completion);
122}
123EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
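A hedged sketch of how an updater typically uses this synchronous primitive: unpublish the element, wait out all rcu_bh read-side critical sections, then free it. The structure and list names below are illustrative only, not part of this patch:

/* Illustrative synchronous update; 'struct bar' is a made-up example type. */
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct bar {
	struct list_head node;
	int data;
};

static void bar_remove(struct bar *bp)
{
	/* Caller holds the update-side lock protecting the list. */
	list_del_rcu(&bp->node);	/* readers can no longer find it */
	synchronize_rcu_bh();		/* wait out rcu_read_lock_bh() sections */
	kfree(bp);			/* now safe to reclaim */
}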
124
101static void rcu_barrier_callback(struct rcu_head *notused) 125static void rcu_barrier_callback(struct rcu_head *notused)
102{ 126{
103 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 127 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -129,6 +153,7 @@ static void rcu_barrier_func(void *type)
129static inline void wait_migrated_callbacks(void) 153static inline void wait_migrated_callbacks(void)
130{ 154{
131 wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); 155 wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
156 smp_mb(); /* In case we didn't sleep. */
132} 157}
133 158
134/* 159/*
@@ -192,9 +217,13 @@ static void rcu_migrate_callback(struct rcu_head *notused)
192 wake_up(&rcu_migrate_wq); 217 wake_up(&rcu_migrate_wq);
193} 218}
194 219
220extern int rcu_cpu_notify(struct notifier_block *self,
221 unsigned long action, void *hcpu);
222
195static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, 223static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
196 unsigned long action, void *hcpu) 224 unsigned long action, void *hcpu)
197{ 225{
226 rcu_cpu_notify(self, action, hcpu);
198 if (action == CPU_DYING) { 227 if (action == CPU_DYING) {
199 /* 228 /*
200 * preempt_disable() in on_each_cpu() prevents stop_machine(), 229 * preempt_disable() in on_each_cpu() prevents stop_machine(),
@@ -209,7 +238,8 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
209 call_rcu_bh(rcu_migrate_head, rcu_migrate_callback); 238 call_rcu_bh(rcu_migrate_head, rcu_migrate_callback);
210 call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback); 239 call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback);
211 call_rcu(rcu_migrate_head + 2, rcu_migrate_callback); 240 call_rcu(rcu_migrate_head + 2, rcu_migrate_callback);
212 } else if (action == CPU_POST_DEAD) { 241 } else if (action == CPU_DOWN_PREPARE) {
242 /* Don't need to wait until next removal operation. */
213 /* rcu_migrate_head is protected by cpu_add_remove_lock */ 243 /* rcu_migrate_head is protected by cpu_add_remove_lock */
214 wait_migrated_callbacks(); 244 wait_migrated_callbacks();
215 } 245 }
@@ -219,8 +249,18 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
219 249
220void __init rcu_init(void) 250void __init rcu_init(void)
221{ 251{
252 int i;
253
222 __rcu_init(); 254 __rcu_init();
223 hotcpu_notifier(rcu_barrier_cpu_hotplug, 0); 255 cpu_notifier(rcu_barrier_cpu_hotplug, 0);
256
257 /*
258 * We don't need protection against CPU-hotplug here because
259 * this is called early in boot, before either interrupts
260 * or the scheduler are operational.
261 */
262 for_each_online_cpu(i)
263 rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i);
224} 264}
225 265
226void rcu_scheduler_starting(void) 266void rcu_scheduler_starting(void)
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
deleted file mode 100644
index beb0e659adcc..000000000000
--- a/kernel/rcupreempt.c
+++ /dev/null
@@ -1,1539 +0,0 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion, realtime implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2006
19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
22 * for pushing me away from locks and towards counters, and
23 * to Suparna Bhattacharya for pushing me completely away
24 * from atomic instructions on the read side.
25 *
26 * - Added handling of Dynamic Ticks
27 * Copyright 2007 - Paul E. Mckenney <paulmck@us.ibm.com>
28 * - Steven Rostedt <srostedt@redhat.com>
29 *
30 * Papers: http://www.rdrop.com/users/paulmck/RCU
31 *
32 * Design Document: http://lwn.net/Articles/253651/
33 *
34 * For detailed explanation of Read-Copy Update mechanism see -
35 * Documentation/RCU/ *.txt
36 *
37 */
38#include <linux/types.h>
39#include <linux/kernel.h>
40#include <linux/init.h>
41#include <linux/spinlock.h>
42#include <linux/smp.h>
43#include <linux/rcupdate.h>
44#include <linux/interrupt.h>
45#include <linux/sched.h>
46#include <asm/atomic.h>
47#include <linux/bitops.h>
48#include <linux/module.h>
49#include <linux/kthread.h>
50#include <linux/completion.h>
51#include <linux/moduleparam.h>
52#include <linux/percpu.h>
53#include <linux/notifier.h>
54#include <linux/cpu.h>
55#include <linux/random.h>
56#include <linux/delay.h>
57#include <linux/cpumask.h>
58#include <linux/rcupreempt_trace.h>
59#include <asm/byteorder.h>
60
61/*
62 * PREEMPT_RCU data structures.
63 */
64
65/*
66 * GP_STAGES specifies the number of times the state machine has
67 * to go through all the rcu_try_flip_states (see below)
68 * in a single Grace Period.
69 *
70 * GP in GP_STAGES stands for Grace Period ;)
71 */
72#define GP_STAGES 2
73struct rcu_data {
74 spinlock_t lock; /* Protect rcu_data fields. */
75 long completed; /* Number of last completed batch. */
76 int waitlistcount;
77 struct rcu_head *nextlist;
78 struct rcu_head **nexttail;
79 struct rcu_head *waitlist[GP_STAGES];
80 struct rcu_head **waittail[GP_STAGES];
81 struct rcu_head *donelist; /* from waitlist & waitschedlist */
82 struct rcu_head **donetail;
83 long rcu_flipctr[2];
84 struct rcu_head *nextschedlist;
85 struct rcu_head **nextschedtail;
86 struct rcu_head *waitschedlist;
87 struct rcu_head **waitschedtail;
88 int rcu_sched_sleeping;
89#ifdef CONFIG_RCU_TRACE
90 struct rcupreempt_trace trace;
91#endif /* #ifdef CONFIG_RCU_TRACE */
92};
93
94/*
95 * States for rcu_try_flip() and friends.
96 */
97
98enum rcu_try_flip_states {
99
100 /*
101 * Stay here if nothing is happening. Flip the counter if something
102 * starts happening. Denoted by "I".
103 */
104 rcu_try_flip_idle_state,
105
106 /*
107 * Wait here for all CPUs to notice that the counter has flipped. This
108 * prevents the old set of counters from ever being incremented once
109 * we leave this state, which in turn is necessary because we cannot
110 * test any individual counter for zero -- we can only check the sum.
111 * Denoted by "A".
112 */
113 rcu_try_flip_waitack_state,
114
115 /*
116 * Wait here for the sum of the old per-CPU counters to reach zero.
117 * Denoted by "Z".
118 */
119 rcu_try_flip_waitzero_state,
120
121 /*
122 * Wait here for each of the other CPUs to execute a memory barrier.
123 * This is necessary to ensure that these other CPUs really have
124 * completed executing their RCU read-side critical sections, despite
125 * their CPUs wildly reordering memory. Denoted by "M".
126 */
127 rcu_try_flip_waitmb_state,
128};
129
130/*
131 * States for rcu_ctrlblk.rcu_sched_sleep.
132 */
133
134enum rcu_sched_sleep_states {
135 rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP. */
136 rcu_sched_sleep_prep, /* Thinking of sleeping, rechecking. */
137 rcu_sched_sleeping, /* Sleeping, awaken if GP needed. */
138};
139
140struct rcu_ctrlblk {
141 spinlock_t fliplock; /* Protect state-machine transitions. */
142 long completed; /* Number of last completed batch. */
143 enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
144 the rcu state machine */
145 spinlock_t schedlock; /* Protect rcu_sched sleep state. */
146 enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */
147 wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */
148};
149
150struct rcu_dyntick_sched {
151 int dynticks;
152 int dynticks_snap;
153 int sched_qs;
154 int sched_qs_snap;
155 int sched_dynticks_snap;
156};
157
158static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
159 .dynticks = 1,
160};
161
162void rcu_qsctr_inc(int cpu)
163{
164 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
165
166 rdssp->sched_qs++;
167}
168
169#ifdef CONFIG_NO_HZ
170
171void rcu_enter_nohz(void)
172{
173 static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
174
175 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
176 __get_cpu_var(rcu_dyntick_sched).dynticks++;
177 WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs);
178}
179
180void rcu_exit_nohz(void)
181{
182 static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
183
184 __get_cpu_var(rcu_dyntick_sched).dynticks++;
185 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
186 WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1),
187 &rs);
188}
189
190#endif /* CONFIG_NO_HZ */
191
192
193static DEFINE_PER_CPU(struct rcu_data, rcu_data);
194
195static struct rcu_ctrlblk rcu_ctrlblk = {
196 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
197 .completed = 0,
198 .rcu_try_flip_state = rcu_try_flip_idle_state,
199 .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock),
200 .sched_sleep = rcu_sched_not_sleeping,
201 .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq),
202};
203
204static struct task_struct *rcu_sched_grace_period_task;
205
206#ifdef CONFIG_RCU_TRACE
207static char *rcu_try_flip_state_names[] =
208 { "idle", "waitack", "waitzero", "waitmb" };
209#endif /* #ifdef CONFIG_RCU_TRACE */
210
211static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly
212 = CPU_BITS_NONE;
213
214/*
215 * Enum and per-CPU flag to determine when each CPU has seen
216 * the most recent counter flip.
217 */
218
219enum rcu_flip_flag_values {
220 rcu_flip_seen, /* Steady/initial state, last flip seen. */
221 /* Only GP detector can update. */
222 rcu_flipped /* Flip just completed, need confirmation. */
223 /* Only corresponding CPU can update. */
224};
225static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
226 = rcu_flip_seen;
227
228/*
229 * Enum and per-CPU flag to determine when each CPU has executed the
230 * needed memory barrier to fence in memory references from its last RCU
231 * read-side critical section in the just-completed grace period.
232 */
233
234enum rcu_mb_flag_values {
235 rcu_mb_done, /* Steady/initial state, no mb()s required. */
236 /* Only GP detector can update. */
237 rcu_mb_needed /* Flip just completed, need an mb(). */
238 /* Only corresponding CPU can update. */
239};
240static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
241 = rcu_mb_done;
242
243/*
244 * RCU_DATA_ME: find the current CPU's rcu_data structure.
245 * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
246 */
247#define RCU_DATA_ME() (&__get_cpu_var(rcu_data))
248#define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu))
249
250/*
251 * Helper macro for tracing when the appropriate rcu_data is not
252 * cached in a local variable, but where the CPU number is so cached.
253 */
254#define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace));
255
256/*
257 * Helper macro for tracing when the appropriate rcu_data is not
258 * cached in a local variable.
259 */
260#define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace));
261
262/*
263 * Helper macro for tracing when the appropriate rcu_data is pointed
264 * to by a local variable.
265 */
266#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
267
268#define RCU_SCHED_BATCH_TIME (HZ / 50)
269
270/*
271 * Return the number of RCU batches processed thus far. Useful
272 * for debug and statistics.
273 */
274long rcu_batches_completed(void)
275{
276 return rcu_ctrlblk.completed;
277}
278EXPORT_SYMBOL_GPL(rcu_batches_completed);
279
280void __rcu_read_lock(void)
281{
282 int idx;
283 struct task_struct *t = current;
284 int nesting;
285
286 nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
287 if (nesting != 0) {
288
289 /* An earlier rcu_read_lock() covers us, just count it. */
290
291 t->rcu_read_lock_nesting = nesting + 1;
292
293 } else {
294 unsigned long flags;
295
296 /*
297 * We disable interrupts for the following reasons:
298 * - If we get scheduling clock interrupt here, and we
299 * end up acking the counter flip, it's like a promise
300 * that we will never increment the old counter again.
301 * Thus we will break that promise if that
302 * scheduling clock interrupt happens between the time
303 * we pick the .completed field and the time that we
304 * increment our counter.
305 *
306 * - We don't want to be preempted out here.
307 *
308 * NMIs can still occur, of course, and might themselves
309 * contain rcu_read_lock().
310 */
311
312 local_irq_save(flags);
313
314 /*
315 * Outermost nesting of rcu_read_lock(), so increment
316 * the current counter for the current CPU. Use volatile
317 * casts to prevent the compiler from reordering.
318 */
319
320 idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
321 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
322
323 /*
324 * Now that the per-CPU counter has been incremented, we
325 * are protected from races with rcu_read_lock() invoked
326 * from NMI handlers on this CPU. We can therefore safely
327 * increment the nesting counter, relieving further NMIs
328 * of the need to increment the per-CPU counter.
329 */
330
331 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
332
333 /*
334 * Now that we have prevented any NMIs from storing
335 * to the ->rcu_flipctr_idx, we can safely use it to
336 * remember which counter to decrement in the matching
337 * rcu_read_unlock().
338 */
339
340 ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
341 local_irq_restore(flags);
342 }
343}
344EXPORT_SYMBOL_GPL(__rcu_read_lock);
345
346void __rcu_read_unlock(void)
347{
348 int idx;
349 struct task_struct *t = current;
350 int nesting;
351
352 nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
353 if (nesting > 1) {
354
355 /*
356 * We are still protected by the enclosing rcu_read_lock(),
357 * so simply decrement the counter.
358 */
359
360 t->rcu_read_lock_nesting = nesting - 1;
361
362 } else {
363 unsigned long flags;
364
365 /*
366 * Disable local interrupts to prevent the grace-period
367 * detection state machine from seeing us half-done.
368 * NMIs can still occur, of course, and might themselves
369 * contain rcu_read_lock() and rcu_read_unlock().
370 */
371
372 local_irq_save(flags);
373
374 /*
375 * Outermost nesting of rcu_read_unlock(), so we must
376 * decrement the current counter for the current CPU.
377 * This must be done carefully, because NMIs can
378 * occur at any point in this code, and any rcu_read_lock()
379 * and rcu_read_unlock() pairs in the NMI handlers
380 * must interact non-destructively with this code.
381 * Lots of volatile casts, and -very- careful ordering.
382 *
383 * Changes to this code, including this one, must be
384 * inspected, validated, and tested extremely carefully!!!
385 */
386
387 /*
388 * First, pick up the index.
389 */
390
391 idx = ACCESS_ONCE(t->rcu_flipctr_idx);
392
393 /*
394 * Now that we have fetched the counter index, it is
395 * safe to decrement the per-task RCU nesting counter.
396 * After this, any interrupts or NMIs will increment and
397 * decrement the per-CPU counters.
398 */
399 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
400
401 /*
402 * It is now safe to decrement this task's nesting count.
403 * NMIs that occur after this statement will route their
404 * rcu_read_lock() calls through this "else" clause, and
405 * will thus start incrementing the per-CPU counter on
406 * their own. They will also clobber ->rcu_flipctr_idx,
407 * but that is OK, since we have already fetched it.
408 */
409
410 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
411 local_irq_restore(flags);
412 }
413}
414EXPORT_SYMBOL_GPL(__rcu_read_unlock);
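The read-side primitives above rely on two banks of per-CPU counters selected by the low bit of rcu_ctrlblk.completed. The following sequential, user-space sketch shows only that bookkeeping; per-CPU distribution, interrupt masking, and memory ordering are deliberately omitted, and all names are illustrative:

/* Sequential sketch of the two-bank "flip counter" idea; not kernel code. */
#include <assert.h>
#include <stdio.h>

static long completed;		/* grace-period ("flip") counter */
static long flipctr[2];		/* outstanding readers per bank */

static int reader_lock(void)
{
	int idx = completed & 0x1;

	flipctr[idx]++;		/* announce this reader in the current bank */
	return idx;		/* caller remembers which bank it used */
}

static void reader_unlock(int idx)
{
	flipctr[idx]--;		/* may be the other bank after a flip */
}

int main(void)
{
	int a = reader_lock();	/* reader enters under bank 0 */

	completed++;		/* grace-period machinery flips the banks */
	int b = reader_lock();	/* a new reader lands in bank 1 */

	reader_unlock(a);
	assert(flipctr[0] == 0);	/* old bank drained: pre-flip readers are done */
	printf("bank0=%ld bank1=%ld\n", flipctr[0], flipctr[1]);
	reader_unlock(b);
	return 0;
}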
415
416/*
417 * If a global counter flip has occurred since the last time that we
418 * advanced callbacks, advance them. Hardware interrupts must be
419 * disabled when calling this function.
420 */
421static void __rcu_advance_callbacks(struct rcu_data *rdp)
422{
423 int cpu;
424 int i;
425 int wlc = 0;
426
427 if (rdp->completed != rcu_ctrlblk.completed) {
428 if (rdp->waitlist[GP_STAGES - 1] != NULL) {
429 *rdp->donetail = rdp->waitlist[GP_STAGES - 1];
430 rdp->donetail = rdp->waittail[GP_STAGES - 1];
431 RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
432 }
433 for (i = GP_STAGES - 2; i >= 0; i--) {
434 if (rdp->waitlist[i] != NULL) {
435 rdp->waitlist[i + 1] = rdp->waitlist[i];
436 rdp->waittail[i + 1] = rdp->waittail[i];
437 wlc++;
438 } else {
439 rdp->waitlist[i + 1] = NULL;
440 rdp->waittail[i + 1] =
441 &rdp->waitlist[i + 1];
442 }
443 }
444 if (rdp->nextlist != NULL) {
445 rdp->waitlist[0] = rdp->nextlist;
446 rdp->waittail[0] = rdp->nexttail;
447 wlc++;
448 rdp->nextlist = NULL;
449 rdp->nexttail = &rdp->nextlist;
450 RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
451 } else {
452 rdp->waitlist[0] = NULL;
453 rdp->waittail[0] = &rdp->waitlist[0];
454 }
455 rdp->waitlistcount = wlc;
456 rdp->completed = rcu_ctrlblk.completed;
457 }
458
459 /*
460 * Check to see if this CPU needs to report that it has seen
461 * the most recent counter flip, thereby declaring that all
462 * subsequent rcu_read_lock() invocations will respect this flip.
463 */
464
465 cpu = raw_smp_processor_id();
466 if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
467 smp_mb(); /* Subsequent counter accesses must see new value */
468 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
469 smp_mb(); /* Subsequent RCU read-side critical sections */
470 /* seen -after- acknowledgement. */
471 }
472}
473
474#ifdef CONFIG_NO_HZ
475static DEFINE_PER_CPU(int, rcu_update_flag);
476
477/**
478 * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
479 *
480 * If the CPU was idle with dynamic ticks active, this updates the
481 * rcu_dyntick_sched.dynticks to let the RCU handling know that the
482 * CPU is active.
483 */
484void rcu_irq_enter(void)
485{
486 int cpu = smp_processor_id();
487 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
488
489 if (per_cpu(rcu_update_flag, cpu))
490 per_cpu(rcu_update_flag, cpu)++;
491
492 /*
493 * Only update if we are coming from a stopped ticks mode
494 * (rcu_dyntick_sched.dynticks is even).
495 */
496 if (!in_interrupt() &&
497 (rdssp->dynticks & 0x1) == 0) {
498 /*
499 * The following might seem like we could have a race
500 * with NMI/SMIs. But this really isn't a problem.
501 * Here we do a read/modify/write, and the race happens
502 * when an NMI/SMI comes in after the read and before
503 * the write. But NMI/SMIs will increment this counter
504 * twice before returning, so the zero bit will not
505 * be corrupted by the NMI/SMI, which is the most important
506 * part.
507 *
508 * The only concern is that we would bring the counter
509 * back to a position it was in during the NMI/SMI.
510 * But the zero bit would be set, so the rest of the
511 * counter would again be ignored.
512 *
513 * On return from the IRQ, the counter's zero bit may be
514 * 0 and its value the same as it was on return from
515 * the NMI/SMI. If the state machine is so unlucky as to
516 * see that, it still doesn't matter, since all
517 * RCU read-side critical sections on this CPU would
518 * have already completed.
519 */
520 rdssp->dynticks++;
521 /*
522 * The following memory barrier ensures that any
523 * rcu_read_lock() primitives in the irq handler
524 * are seen by other CPUs to follow the above
525 * increment to rcu_dyntick_sched.dynticks. This is
526 * required in order for other CPUs to correctly
527 * determine when it is safe to advance the RCU
528 * grace-period state machine.
529 */
530 smp_mb(); /* see above block comment. */
531 /*
532 * Since we can't determine the dynamic tick mode from
533 * the rcu_dyntick_sched.dynticks after this routine,
534 * we use a second flag to acknowledge that we came
535 * from an idle state with ticks stopped.
536 */
537 per_cpu(rcu_update_flag, cpu)++;
538 /*
539 * If we take an NMI/SMI now, they will also increment
540 * the rcu_update_flag, and will not update the
541 * rcu_dyntick_sched.dynticks on exit. That is for
542 * this IRQ to do.
543 */
544 }
545}
546
547/**
548 * rcu_irq_exit - Called from exiting Hard irq context.
549 *
550 * If the CPU was idle with dynamic ticks active, update the
551 * rcu_dyntick_sched.dynticks to let the RCU handling be
552 * aware that the CPU is going back to idle with no ticks.
553 */
554void rcu_irq_exit(void)
555{
556 int cpu = smp_processor_id();
557 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
558
559 /*
560 * rcu_update_flag is set if we interrupted the CPU
561 * when it was idle with ticks stopped.
562 * Once this occurs, we keep track of interrupt nesting
563 * because an NMI/SMI could also come in, and we still
564 * only want the IRQ that started the increment of the
565 * rcu_dyntick_sched.dynticks to be the one that modifies
566 * it on exit.
567 */
568 if (per_cpu(rcu_update_flag, cpu)) {
569 if (--per_cpu(rcu_update_flag, cpu))
570 return;
571
572 /* This must match the interrupt nesting */
573 WARN_ON(in_interrupt());
574
575 /*
576 * If an NMI/SMI happens now we are still
577 * protected by the rcu_dyntick_sched.dynticks being odd.
578 */
579
580 /*
581 * The following memory barrier ensures that any
582 * rcu_read_unlock() primitives in the irq handler
583 * are seen by other CPUs to precede the following
584 * increment to rcu_dyntick_sched.dynticks. This
585 * is required in order for other CPUs to determine
586 * when it is safe to advance the RCU grace-period
587 * state machine.
588 */
589 smp_mb(); /* see above block comment. */
590 rdssp->dynticks++;
591 WARN_ON(rdssp->dynticks & 0x1);
592 }
593}
594
595void rcu_nmi_enter(void)
596{
597 rcu_irq_enter();
598}
599
600void rcu_nmi_exit(void)
601{
602 rcu_irq_exit();
603}
604
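rcu_irq_enter() and rcu_irq_exit() above encode the CPU's state in the parity of one counter: even means dynticks-idle with no handlers running, odd means inside a handler, and only the nesting level that made the counter odd makes it even again. A minimal user-space sketch of that parity protocol (dynticks, update_flag and the helper names are invented):

#include <stdio.h>
#include <assert.h>

static long dynticks;        /* even: idle, ticks stopped; odd: in handler */
static int update_flag;      /* nonzero: this nesting level must re-even it */

static void irq_enter_sketch(void)
{
        if (update_flag)
                update_flag++;            /* nested entry, nothing to do */
        if ((dynticks & 0x1) == 0) {      /* first entry from dyntick idle */
                dynticks++;               /* now odd: readers may run */
                update_flag++;
        }
}

static void irq_exit_sketch(void)
{
        if (update_flag && --update_flag == 0) {
                dynticks++;               /* back to even: idle again */
                assert((dynticks & 0x1) == 0);
        }
}

int main(void)
{
        irq_enter_sketch();               /* interrupt from idle */
        irq_enter_sketch();               /* nested NMI */
        irq_exit_sketch();                /* NMI return: counter untouched */
        irq_exit_sketch();                /* irq return: counter goes even */
        printf("dynticks=%ld (even again: %s)\n", dynticks,
               (dynticks & 0x1) ? "no" : "yes");
        return 0;
}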
605static void dyntick_save_progress_counter(int cpu)
606{
607 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
608
609 rdssp->dynticks_snap = rdssp->dynticks;
610}
611
612static inline int
613rcu_try_flip_waitack_needed(int cpu)
614{
615 long curr;
616 long snap;
617 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
618
619 curr = rdssp->dynticks;
620 snap = rdssp->dynticks_snap;
621 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
622
623 /*
624 * If the CPU remained in dynticks mode for the entire time
625 * and didn't take any interrupts, NMIs, SMIs, or whatever,
626 * then it cannot be in the middle of an rcu_read_lock(), so
627 * the next rcu_read_lock() it executes must use the new value
628 * of the counter. So we can safely pretend that this CPU
629 * already acknowledged the counter.
630 */
631
632 if ((curr == snap) && ((curr & 0x1) == 0))
633 return 0;
634
635 /*
636 * If the CPU passed through or entered a dynticks idle phase with
637 * no active irq handlers, then, as above, we can safely pretend
638 * that this CPU already acknowledged the counter.
639 */
640
641 if ((curr - snap) > 2 || (curr & 0x1) == 0)
642 return 0;
643
644 /* We need this CPU to explicitly acknowledge the counter flip. */
645
646 return 1;
647}
648
649static inline int
650rcu_try_flip_waitmb_needed(int cpu)
651{
652 long curr;
653 long snap;
654 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
655
656 curr = rdssp->dynticks;
657 snap = rdssp->dynticks_snap;
658 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
659
660 /*
661 * If the CPU remained in dynticks mode for the entire time
662 * and didn't take any interrupts, NMIs, SMIs, or whatever,
663 * then it cannot have executed an RCU read-side critical section
664 * during that time, so there is no need for it to execute a
665 * memory barrier.
666 */
667
668 if ((curr == snap) && ((curr & 0x1) == 0))
669 return 0;
670
671 /*
672 * If the CPU either entered or exited an outermost interrupt,
673 * SMI, NMI, or whatever handler, then we know that it executed
674 * a memory barrier when doing so. So we don't need another one.
675 */
676 if (curr != snap)
677 return 0;
678
679 /* We need the CPU to execute a memory barrier. */
680
681 return 1;
682}
683
684static void dyntick_save_progress_counter_sched(int cpu)
685{
686 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
687
688 rdssp->sched_dynticks_snap = rdssp->dynticks;
689}
690
691static int rcu_qsctr_inc_needed_dyntick(int cpu)
692{
693 long curr;
694 long snap;
695 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
696
697 curr = rdssp->dynticks;
698 snap = rdssp->sched_dynticks_snap;
699 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
700
701 /*
702 * If the CPU remained in dynticks mode for the entire time
703 * and didn't take any interrupts, NMIs, SMIs, or whatever,
704 * then it cannot be in the middle of an rcu_read_lock(), so
705 * the next rcu_read_lock() it executes must use the new value
706 * of the counter. Therefore, this CPU has been in a quiescent
707 * state the entire time, and we don't need to wait for it.
708 */
709
710 if ((curr == snap) && ((curr & 0x1) == 0))
711 return 0;
712
713 /*
714 * If the CPU passed through or entered a dynticks idle phase with
715 * no active irq handlers, then, as above, this CPU has already
716 * passed through a quiescent state.
717 */
718
719 if ((curr - snap) > 2 || (snap & 0x1) == 0)
720 return 0;
721
722 /* We need this CPU to go through a quiescent state. */
723
724 return 1;
725}
726
727#else /* !CONFIG_NO_HZ */
728
729# define dyntick_save_progress_counter(cpu) do { } while (0)
730# define rcu_try_flip_waitack_needed(cpu) (1)
731# define rcu_try_flip_waitmb_needed(cpu) (1)
732
733# define dyntick_save_progress_counter_sched(cpu) do { } while (0)
734# define rcu_qsctr_inc_needed_dyntick(cpu) (1)
735
736#endif /* CONFIG_NO_HZ */
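The *_needed() helpers above all follow one snapshot-and-recheck pattern: record the dynticks counter when a grace-period stage begins, then later infer a quiescent state if the counter is still even and unchanged, or has advanced far enough to guarantee an idle passage. A compact sketch of that inference, modeled on rcu_qsctr_inc_needed_dyntick() (cpu_was_quiescent is an invented name):

#include <stdio.h>

/*
 * Return 1 if, between the time "snap" was taken and now ("curr"),
 * the CPU must have passed through a state with no handlers running.
 */
static int cpu_was_quiescent(long snap, long curr)
{
        if (curr == snap && (curr & 0x1) == 0)
                return 1;       /* idle the whole time, nothing ran */
        if (curr - snap > 2 || (snap & 0x1) == 0)
                return 1;       /* passed through or started in idle */
        return 0;               /* still in (or never left) a handler */
}

int main(void)
{
        printf("%d\n", cpu_was_quiescent(4, 4));  /* idle throughout -> 1 */
        printf("%d\n", cpu_was_quiescent(5, 5));  /* stuck in handler -> 0 */
        printf("%d\n", cpu_was_quiescent(5, 8));  /* several transitions -> 1 */
        return 0;
}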
737
738static void save_qsctr_sched(int cpu)
739{
740 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
741
742 rdssp->sched_qs_snap = rdssp->sched_qs;
743}
744
745static inline int rcu_qsctr_inc_needed(int cpu)
746{
747 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
748
749 /*
750 * If there has been a quiescent state, no more need to wait
751 * on this CPU.
752 */
753
754 if (rdssp->sched_qs != rdssp->sched_qs_snap) {
755 smp_mb(); /* force ordering with cpu entering schedule(). */
756 return 0;
757 }
758
759 /* We need this CPU to go through a quiescent state. */
760
761 return 1;
762}
763
764/*
765 * Get here when RCU is idle. Decide whether we need to
766 * move out of idle state, and return non-zero if so.
767 * "Straightforward" approach for the moment, might later
768 * use callback-list lengths, grace-period duration, or
769 * some such to determine when to exit idle state.
770 * Might also need a pre-idle test that does not acquire
771 * the lock, but let's get the simple case working first...
772 */
773
774static int
775rcu_try_flip_idle(void)
776{
777 int cpu;
778
779 RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
780 if (!rcu_pending(smp_processor_id())) {
781 RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
782 return 0;
783 }
784
785 /*
786 * Do the flip.
787 */
788
789 RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
790 rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */
791
792 /*
793 * Need a memory barrier so that other CPUs see the new
794 * counter value before they see the subsequent change of all
795 * the rcu_flip_flag instances to rcu_flipped.
796 */
797
798 smp_mb(); /* see above block comment. */
799
800 /* Now ask each CPU for acknowledgement of the flip. */
801
802 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
803 per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
804 dyntick_save_progress_counter(cpu);
805 }
806
807 return 1;
808}
809
810/*
811 * Wait for CPUs to acknowledge the flip.
812 */
813
814static int
815rcu_try_flip_waitack(void)
816{
817 int cpu;
818
819 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
820 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
821 if (rcu_try_flip_waitack_needed(cpu) &&
822 per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
823 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
824 return 0;
825 }
826
827 /*
828 * Make sure our checks above don't bleed into subsequent
829 * waiting for the sum of the counters to reach zero.
830 */
831
832 smp_mb(); /* see above block comment. */
833 RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
834 return 1;
835}
836
837/*
838 * Wait for collective ``last'' counter to reach zero,
839 * then tell all CPUs to do an end-of-grace-period memory barrier.
840 */
841
842static int
843rcu_try_flip_waitzero(void)
844{
845 int cpu;
846 int lastidx = !(rcu_ctrlblk.completed & 0x1);
847 int sum = 0;
848
849 /* Check to see if the sum of the "last" counters is zero. */
850
851 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
852 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
853 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
854 if (sum != 0) {
855 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
856 return 0;
857 }
858
859 /*
860 * This ensures that the other CPUs see the call for
861 * memory barriers -after- the sum to zero has been
862 * detected here
863 */
864 smp_mb(); /* ^^^^^^^^^^^^ */
865
866 /* Call for a memory barrier from each CPU. */
867 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
868 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
869 dyntick_save_progress_counter(cpu);
870 }
871
872 RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
873 return 1;
874}
875
876/*
877 * Wait for all CPUs to do their end-of-grace-period memory barrier.
878 * Return 1 once all CPUs have done so, and 0 until then.
879 */
880
881static int
882rcu_try_flip_waitmb(void)
883{
884 int cpu;
885
886 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
887 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
888 if (rcu_try_flip_waitmb_needed(cpu) &&
889 per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
890 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
891 return 0;
892 }
893
894 smp_mb(); /* Ensure that the above checks precede any following flip. */
895 RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
896 return 1;
897}
898
899/*
900 * Attempt a single flip of the counters. Remember, a single flip does
901 * -not- constitute a grace period. Instead, the interval between
902 * at least GP_STAGES consecutive flips is a grace period.
903 *
904 * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
905 * on a large SMP, they might want to use a hierarchical organization of
906 * the per-CPU-counter pairs.
907 */
908static void rcu_try_flip(void)
909{
910 unsigned long flags;
911
912 RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
913 if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
914 RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
915 return;
916 }
917
918 /*
919 * Take the next transition(s) through the RCU grace-period
920 * flip-counter state machine.
921 */
922
923 switch (rcu_ctrlblk.rcu_try_flip_state) {
924 case rcu_try_flip_idle_state:
925 if (rcu_try_flip_idle())
926 rcu_ctrlblk.rcu_try_flip_state =
927 rcu_try_flip_waitack_state;
928 break;
929 case rcu_try_flip_waitack_state:
930 if (rcu_try_flip_waitack())
931 rcu_ctrlblk.rcu_try_flip_state =
932 rcu_try_flip_waitzero_state;
933 break;
934 case rcu_try_flip_waitzero_state:
935 if (rcu_try_flip_waitzero())
936 rcu_ctrlblk.rcu_try_flip_state =
937 rcu_try_flip_waitmb_state;
938 break;
939 case rcu_try_flip_waitmb_state:
940 if (rcu_try_flip_waitmb())
941 rcu_ctrlblk.rcu_try_flip_state =
942 rcu_try_flip_idle_state;
943 }
944 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
945}
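rcu_try_flip() above is a four-state machine that advances by at most one state per call: idle (do the flip), wait for acknowledgements, wait for the old counters to sum to zero, then wait for the end-of-grace-period memory barriers. A toy model of that state machine with the waiting steps stubbed out (all names invented):

#include <stdio.h>

enum flip_state { IDLE, WAITACK, WAITZERO, WAITMB };

/* Stub "try" steps; each returns 1 when its condition is satisfied. */
static int try_idle(void)     { return 1; }   /* callbacks pending, flip */
static int try_waitack(void)  { return 1; }   /* all CPUs saw the flip */
static int try_waitzero(void) { return 1; }   /* old counters summed to 0 */
static int try_waitmb(void)   { return 1; }   /* all CPUs did their mb() */

static enum flip_state state = IDLE;

/* One invocation advances the machine by at most one state. */
static void try_flip_sketch(void)
{
        switch (state) {
        case IDLE:     if (try_idle())     state = WAITACK;  break;
        case WAITACK:  if (try_waitack())  state = WAITZERO; break;
        case WAITZERO: if (try_waitzero()) state = WAITMB;   break;
        case WAITMB:   if (try_waitmb())   state = IDLE;     break;
        }
}

int main(void)
{
        int i;

        for (i = 0; i < 4; i++) {       /* four ticks, one full cycle */
                try_flip_sketch();
                printf("after tick %d: state %d\n", i + 1, state);
        }
        return 0;
}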
946
947/*
948 * Check to see if this CPU needs to do a memory barrier in order to
949 * ensure that any prior RCU read-side critical sections have committed
950 * their counter manipulations and critical-section memory references
951 * before declaring the grace period to be completed.
952 */
953static void rcu_check_mb(int cpu)
954{
955 if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) {
956 smp_mb(); /* Ensure RCU read-side accesses are visible. */
957 per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
958 }
959}
960
961void rcu_check_callbacks(int cpu, int user)
962{
963 unsigned long flags;
964 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
965
966 /*
967 * If this CPU took its interrupt from user mode or from the
968 * idle loop, and this is not a nested interrupt, then
969 * this CPU has to have exited all prior preempt-disable
970 * sections of code. So increment the counter to note this.
971 *
972 * The memory barrier is needed to handle the case where
973 * writes from a preempt-disable section of code get reordered
974 * into schedule() by this CPU's write buffer. So the memory
975 * barrier makes sure that the rcu_qsctr_inc() is seen by other
976 * CPUs to happen after any such write.
977 */
978
979 if (user ||
980 (idle_cpu(cpu) && !in_softirq() &&
981 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
982 smp_mb(); /* Guard against aggressive schedule(). */
983 rcu_qsctr_inc(cpu);
984 }
985
986 rcu_check_mb(cpu);
987 if (rcu_ctrlblk.completed == rdp->completed)
988 rcu_try_flip();
989 spin_lock_irqsave(&rdp->lock, flags);
990 RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
991 __rcu_advance_callbacks(rdp);
992 if (rdp->donelist == NULL) {
993 spin_unlock_irqrestore(&rdp->lock, flags);
994 } else {
995 spin_unlock_irqrestore(&rdp->lock, flags);
996 raise_softirq(RCU_SOFTIRQ);
997 }
998}
999
1000/*
1001 * Needed by dynticks, to make sure all RCU processing has finished
1002 * when we go idle:
1003 */
1004void rcu_advance_callbacks(int cpu, int user)
1005{
1006 unsigned long flags;
1007 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1008
1009 if (rcu_ctrlblk.completed == rdp->completed) {
1010 rcu_try_flip();
1011 if (rcu_ctrlblk.completed == rdp->completed)
1012 return;
1013 }
1014 spin_lock_irqsave(&rdp->lock, flags);
1015 RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
1016 __rcu_advance_callbacks(rdp);
1017 spin_unlock_irqrestore(&rdp->lock, flags);
1018}
1019
1020#ifdef CONFIG_HOTPLUG_CPU
1021#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
1022 *dsttail = srclist; \
1023 if (srclist != NULL) { \
1024 dsttail = srctail; \
1025 srclist = NULL; \
1026 srctail = &srclist;\
1027 } \
1028 } while (0)
1029
1030void rcu_offline_cpu(int cpu)
1031{
1032 int i;
1033 struct rcu_head *list = NULL;
1034 unsigned long flags;
1035 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1036 struct rcu_head *schedlist = NULL;
1037 struct rcu_head **schedtail = &schedlist;
1038 struct rcu_head **tail = &list;
1039
1040 /*
1041 * Remove all callbacks from the newly dead CPU, retaining order.
1042 * Otherwise rcu_barrier() will fail.
1043 */
1044
1045 spin_lock_irqsave(&rdp->lock, flags);
1046 rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
1047 for (i = GP_STAGES - 1; i >= 0; i--)
1048 rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
1049 list, tail);
1050 rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
1051 rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail,
1052 schedlist, schedtail);
1053 rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail,
1054 schedlist, schedtail);
1055 rdp->rcu_sched_sleeping = 0;
1056 spin_unlock_irqrestore(&rdp->lock, flags);
1057 rdp->waitlistcount = 0;
1058
1059 /* Disengage the newly dead CPU from the grace-period computation. */
1060
1061 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1062 rcu_check_mb(cpu);
1063 if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
1064 smp_mb(); /* Subsequent counter accesses must see new value */
1065 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
1066 smp_mb(); /* Subsequent RCU read-side critical sections */
1067 /* seen -after- acknowledgement. */
1068 }
1069
1070 RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
1071 RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];
1072
1073 RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
1074 RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
1075
1076 cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1077
1078 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1079
1080 /*
1081 * Place the removed callbacks on the current CPU's queue.
1082 * Make them all start a new grace period: simple approach,
1083 * in theory could starve a given set of callbacks, but
1084 * you would need to be doing some serious CPU hotplugging
1085 * to make this happen. If this becomes a problem, adding
1086 * a synchronize_rcu() to the hotplug path would be a simple
1087 * fix.
1088 */
1089
1090 local_irq_save(flags); /* disable preempt till we know what lock. */
1091 rdp = RCU_DATA_ME();
1092 spin_lock(&rdp->lock);
1093 *rdp->nexttail = list;
1094 if (list)
1095 rdp->nexttail = tail;
1096 *rdp->nextschedtail = schedlist;
1097 if (schedlist)
1098 rdp->nextschedtail = schedtail;
1099 spin_unlock_irqrestore(&rdp->lock, flags);
1100}
1101
1102#else /* #ifdef CONFIG_HOTPLUG_CPU */
1103
1104void rcu_offline_cpu(int cpu)
1105{
1106}
1107
1108#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
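rcu_offline_cpu() above drains every callback list of the dead CPU in order, because rcu_barrier() depends on ordering; the rcu_offline_cpu_enqueue() macro splices a whole source list onto a destination tail and resets the source to empty. A small function-style sketch of that splice (struct cblist, splice_onto and the field names are invented):

#include <stdio.h>
#include <stddef.h>

struct cb { struct cb *next; int id; };

/* A list is a head pointer plus a pointer to the last ->next slot. */
struct cblist { struct cb *head; struct cb **tail; };

/*
 * Append src (possibly empty) to dst, preserving order, and leave
 * src as the empty list again.
 */
static void splice_onto(struct cblist *src, struct cblist *dst)
{
        if (src->head == NULL)
                return;
        *dst->tail = src->head;
        dst->tail = src->tail;
        src->head = NULL;
        src->tail = &src->head;
}

int main(void)
{
        struct cb a = { NULL, 1 }, b = { NULL, 2 };
        struct cblist done = { &a, &a.next };   /* dead CPU's donelist */
        struct cblist wait = { &b, &b.next };   /* one of its waitlists */
        struct cblist dest = { NULL, &dest.head };
        struct cb *p;

        splice_onto(&done, &dest);
        splice_onto(&wait, &dest);
        for (p = dest.head; p; p = p->next)
                printf("callback %d, still in order\n", p->id);
        return 0;
}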
1109
1110void __cpuinit rcu_online_cpu(int cpu)
1111{
1112 unsigned long flags;
1113 struct rcu_data *rdp;
1114
1115 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1116 cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1117 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1118
1119 /*
1120 * The rcu_sched grace-period processing might have bypassed
1121 * this CPU, given that it was not in the rcu_cpu_online_map
1122 * when the grace-period scan started. This means that the
1123 * grace-period task might sleep. So make sure that if this
1124 * should happen, the first callback posted to this CPU will
1125 * wake up the grace-period task if need be.
1126 */
1127
1128 rdp = RCU_DATA_CPU(cpu);
1129 spin_lock_irqsave(&rdp->lock, flags);
1130 rdp->rcu_sched_sleeping = 1;
1131 spin_unlock_irqrestore(&rdp->lock, flags);
1132}
1133
1134static void rcu_process_callbacks(struct softirq_action *unused)
1135{
1136 unsigned long flags;
1137 struct rcu_head *next, *list;
1138 struct rcu_data *rdp;
1139
1140 local_irq_save(flags);
1141 rdp = RCU_DATA_ME();
1142 spin_lock(&rdp->lock);
1143 list = rdp->donelist;
1144 if (list == NULL) {
1145 spin_unlock_irqrestore(&rdp->lock, flags);
1146 return;
1147 }
1148 rdp->donelist = NULL;
1149 rdp->donetail = &rdp->donelist;
1150 RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
1151 spin_unlock_irqrestore(&rdp->lock, flags);
1152 while (list) {
1153 next = list->next;
1154 list->func(list);
1155 list = next;
1156 RCU_TRACE_ME(rcupreempt_trace_invoke);
1157 }
1158}
1159
1160void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1161{
1162 unsigned long flags;
1163 struct rcu_data *rdp;
1164
1165 head->func = func;
1166 head->next = NULL;
1167 local_irq_save(flags);
1168 rdp = RCU_DATA_ME();
1169 spin_lock(&rdp->lock);
1170 __rcu_advance_callbacks(rdp);
1171 *rdp->nexttail = head;
1172 rdp->nexttail = &head->next;
1173 RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
1174 spin_unlock_irqrestore(&rdp->lock, flags);
1175}
1176EXPORT_SYMBOL_GPL(call_rcu);
1177
1178void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1179{
1180 unsigned long flags;
1181 struct rcu_data *rdp;
1182 int wake_gp = 0;
1183
1184 head->func = func;
1185 head->next = NULL;
1186 local_irq_save(flags);
1187 rdp = RCU_DATA_ME();
1188 spin_lock(&rdp->lock);
1189 *rdp->nextschedtail = head;
1190 rdp->nextschedtail = &head->next;
1191 if (rdp->rcu_sched_sleeping) {
1192
1193 /* Grace-period processing might be sleeping... */
1194
1195 rdp->rcu_sched_sleeping = 0;
1196 wake_gp = 1;
1197 }
1198 spin_unlock_irqrestore(&rdp->lock, flags);
1199 if (wake_gp) {
1200
1201 /* Wake up grace-period processing, unless someone beat us. */
1202
1203 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1204 if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping)
1205 wake_gp = 0;
1206 rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping;
1207 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1208 if (wake_gp)
1209 wake_up_interruptible(&rcu_ctrlblk.sched_wq);
1210 }
1211}
1212EXPORT_SYMBOL_GPL(call_rcu_sched);
1213
1214/*
1215 * Wait until all currently running preempt_disable() code segments
1216 * (including hardware-irq-disable segments) complete. Note that
1217 * in -rt this does -not- necessarily result in all currently executing
1218 * interrupt -handlers- having completed.
1219 */
1220void __synchronize_sched(void)
1221{
1222 struct rcu_synchronize rcu;
1223
1224 if (num_online_cpus() == 1)
1225 return; /* blocking is gp if only one CPU! */
1226
1227 init_completion(&rcu.completion);
1228 /* Will wake me after RCU finished. */
1229 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1230 /* Wait for it. */
1231 wait_for_completion(&rcu.completion);
1232}
1233EXPORT_SYMBOL_GPL(__synchronize_sched);
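__synchronize_sched() above builds a synchronous primitive out of the asynchronous callback API: post a callback whose only job is to complete a completion, then block on that completion. A single-threaded sketch of the same callback-completes-a-waiter pattern (post_callback, run_grace_period and friends are invented stand-ins; a real caller would sleep instead of driving the grace period itself):

#include <stdio.h>
#include <stddef.h>

struct cb {
        struct cb *next;
        void (*func)(struct cb *);
};

static struct cb *queue, **tail = &queue;

static void post_callback(struct cb *p, void (*func)(struct cb *))
{
        p->func = func;
        p->next = NULL;
        *tail = p;
        tail = &p->next;
}

/* Stand-in for the grace-period machinery invoking ripe callbacks. */
static void run_grace_period(void)
{
        struct cb *p, *next;

        for (p = queue; p; p = next) {
                next = p->next;
                p->func(p);
        }
        queue = NULL;
        tail = &queue;
}

/* The callback-completes-a-waiter pattern. */
struct sync_waiter {
        struct cb head;
        int done;               /* the kernel would use a completion */
};

static void wake_waiter(struct cb *head)
{
        struct sync_waiter *w = (struct sync_waiter *)head;

        w->done = 1;
}

static void synchronize_sketch(void)
{
        struct sync_waiter w = { { NULL, NULL }, 0 };

        post_callback(&w.head, wake_waiter);
        while (!w.done)
                run_grace_period();     /* a real caller would sleep here */
}

int main(void)
{
        synchronize_sketch();
        printf("grace period elapsed\n");
        return 0;
}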
1234
1235/*
1236 * kthread function that manages call_rcu_sched grace periods.
1237 */
1238static int rcu_sched_grace_period(void *arg)
1239{
1240 int couldsleep; /* might sleep after current pass. */
1241 int couldsleepnext = 0; /* might sleep after next pass. */
1242 int cpu;
1243 unsigned long flags;
1244 struct rcu_data *rdp;
1245 int ret;
1246
1247 /*
1248 * Each pass through the following loop handles one
1249 * rcu_sched grace period cycle.
1250 */
1251 do {
1252 /* Save each CPU's current state. */
1253
1254 for_each_online_cpu(cpu) {
1255 dyntick_save_progress_counter_sched(cpu);
1256 save_qsctr_sched(cpu);
1257 }
1258
1259 /*
1260 * Sleep for about an RCU grace-period's worth to
1261 * allow better batching and to consume less CPU.
1262 */
1263 schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME);
1264
1265 /*
1266 * If there was nothing to do last time, prepare to
1267 * sleep at the end of the current grace period cycle.
1268 */
1269 couldsleep = couldsleepnext;
1270 couldsleepnext = 1;
1271 if (couldsleep) {
1272 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1273 rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep;
1274 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1275 }
1276
1277 /*
1278 * Wait on each CPU in turn to have either visited
1279 * a quiescent state or been in dynticks-idle mode.
1280 */
1281 for_each_online_cpu(cpu) {
1282 while (rcu_qsctr_inc_needed(cpu) &&
1283 rcu_qsctr_inc_needed_dyntick(cpu)) {
1284 /* resched_cpu(cpu); @@@ */
1285 schedule_timeout_interruptible(1);
1286 }
1287 }
1288
1289 /* Advance callbacks for each CPU. */
1290
1291 for_each_online_cpu(cpu) {
1292
1293 rdp = RCU_DATA_CPU(cpu);
1294 spin_lock_irqsave(&rdp->lock, flags);
1295
1296 /*
1297 * We are running on this CPU irq-disabled, so no
1298 * CPU can go offline until we re-enable irqs.
1299 * The CPU being processed might have already gone
1300 * offline (between the for_each_online_cpu above and
1301 * the spin_lock_irqsave), but in that case all its
1302 * callback lists will be empty, so no harm done.
1303 *
1304 * Advance the callbacks! We share normal RCU's
1305 * donelist, since callbacks are invoked the
1306 * same way in either case.
1307 */
1308 if (rdp->waitschedlist != NULL) {
1309 *rdp->donetail = rdp->waitschedlist;
1310 rdp->donetail = rdp->waitschedtail;
1311
1312 /*
1313 * Next rcu_check_callbacks() will
1314 * do the required raise_softirq().
1315 */
1316 }
1317 if (rdp->nextschedlist != NULL) {
1318 rdp->waitschedlist = rdp->nextschedlist;
1319 rdp->waitschedtail = rdp->nextschedtail;
1320 couldsleep = 0;
1321 couldsleepnext = 0;
1322 } else {
1323 rdp->waitschedlist = NULL;
1324 rdp->waitschedtail = &rdp->waitschedlist;
1325 }
1326 rdp->nextschedlist = NULL;
1327 rdp->nextschedtail = &rdp->nextschedlist;
1328
1329 /* Mark sleep intention. */
1330
1331 rdp->rcu_sched_sleeping = couldsleep;
1332
1333 spin_unlock_irqrestore(&rdp->lock, flags);
1334 }
1335
1336 /* If we saw callbacks on the last scan, go deal with them. */
1337
1338 if (!couldsleep)
1339 continue;
1340
1341 /* Attempt to block... */
1342
1343 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1344 if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) {
1345
1346 /*
1347 * Someone posted a callback after we scanned.
1348 * Go take care of it.
1349 */
1350 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1351 couldsleepnext = 0;
1352 continue;
1353 }
1354
1355 /* Block until the next person posts a callback. */
1356
1357 rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
1358 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1359 ret = 0; /* unused */
1360 __wait_event_interruptible(rcu_ctrlblk.sched_wq,
1361 rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
1362 ret);
1363
1364 couldsleepnext = 0;
1365
1366 } while (!kthread_should_stop());
1367
1368 return (0);
1369}
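The kthread above detects rcu_sched quiescent states by snapshotting each CPU's context-switch count at the start of a cycle and polling until the count moves or the dynticks state shows the CPU idle. A stripped-down sketch of that snapshot-and-poll step (sched_qs, cpu_still_needed are invented names; the real code sleeps between polls):

#include <stdio.h>

#define NCPUS 3

static long sched_qs[NCPUS];       /* bumped by each CPU's context switches */
static long sched_qs_snap[NCPUS];

static void save_qs(void)
{
        int cpu;

        for (cpu = 0; cpu < NCPUS; cpu++)
                sched_qs_snap[cpu] = sched_qs[cpu];
}

static int cpu_still_needed(int cpu)
{
        return sched_qs[cpu] == sched_qs_snap[cpu];
}

int main(void)
{
        int cpu;

        save_qs();                          /* start of a grace period */
        for (cpu = 0; cpu < NCPUS; cpu++)
                sched_qs[cpu]++;            /* every CPU context-switches */
        for (cpu = 0; cpu < NCPUS; cpu++)   /* poll: nobody should be needed */
                printf("cpu %d still needed: %d\n", cpu, cpu_still_needed(cpu));
        return 0;
}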
1370
1371/*
1372 * Check to see if any future RCU-related work will need to be done
1373 * by the current CPU, even if none need be done immediately, returning
1374 * 1 if so. Assumes that notifiers would take care of handling any
1375 * outstanding requests from the RCU core.
1376 *
1377 * This function is part of the RCU implementation; it is -not-
1378 * an exported member of the RCU API.
1379 */
1380int rcu_needs_cpu(int cpu)
1381{
1382 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1383
1384 return (rdp->donelist != NULL ||
1385 !!rdp->waitlistcount ||
1386 rdp->nextlist != NULL ||
1387 rdp->nextschedlist != NULL ||
1388 rdp->waitschedlist != NULL);
1389}
1390
1391int rcu_pending(int cpu)
1392{
1393 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1394
1395 /* The CPU has at least one callback queued somewhere. */
1396
1397 if (rdp->donelist != NULL ||
1398 !!rdp->waitlistcount ||
1399 rdp->nextlist != NULL ||
1400 rdp->nextschedlist != NULL ||
1401 rdp->waitschedlist != NULL)
1402 return 1;
1403
1404 /* The RCU core needs an acknowledgement from this CPU. */
1405
1406 if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) ||
1407 (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed))
1408 return 1;
1409
1410 /* This CPU has fallen behind the global grace-period number. */
1411
1412 if (rdp->completed != rcu_ctrlblk.completed)
1413 return 1;
1414
1415 /* Nothing needed from this CPU. */
1416
1417 return 0;
1418}
1419
1420static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1421 unsigned long action, void *hcpu)
1422{
1423 long cpu = (long)hcpu;
1424
1425 switch (action) {
1426 case CPU_UP_PREPARE:
1427 case CPU_UP_PREPARE_FROZEN:
1428 rcu_online_cpu(cpu);
1429 break;
1430 case CPU_UP_CANCELED:
1431 case CPU_UP_CANCELED_FROZEN:
1432 case CPU_DEAD:
1433 case CPU_DEAD_FROZEN:
1434 rcu_offline_cpu(cpu);
1435 break;
1436 default:
1437 break;
1438 }
1439 return NOTIFY_OK;
1440}
1441
1442static struct notifier_block __cpuinitdata rcu_nb = {
1443 .notifier_call = rcu_cpu_notify,
1444};
1445
1446void __init __rcu_init(void)
1447{
1448 int cpu;
1449 int i;
1450 struct rcu_data *rdp;
1451
1452 printk(KERN_NOTICE "Preemptible RCU implementation.\n");
1453 for_each_possible_cpu(cpu) {
1454 rdp = RCU_DATA_CPU(cpu);
1455 spin_lock_init(&rdp->lock);
1456 rdp->completed = 0;
1457 rdp->waitlistcount = 0;
1458 rdp->nextlist = NULL;
1459 rdp->nexttail = &rdp->nextlist;
1460 for (i = 0; i < GP_STAGES; i++) {
1461 rdp->waitlist[i] = NULL;
1462 rdp->waittail[i] = &rdp->waitlist[i];
1463 }
1464 rdp->donelist = NULL;
1465 rdp->donetail = &rdp->donelist;
1466 rdp->rcu_flipctr[0] = 0;
1467 rdp->rcu_flipctr[1] = 0;
1468 rdp->nextschedlist = NULL;
1469 rdp->nextschedtail = &rdp->nextschedlist;
1470 rdp->waitschedlist = NULL;
1471 rdp->waitschedtail = &rdp->waitschedlist;
1472 rdp->rcu_sched_sleeping = 0;
1473 }
1474 register_cpu_notifier(&rcu_nb);
1475
1476 /*
1477 * We don't need protection against CPU-Hotplug here
1478 * since
1479 * a) If a CPU comes online while we are iterating over the
1480 * cpu_online_mask below, we would only end up making a
1481 * duplicate call to rcu_online_cpu() which sets the corresponding
1482 * CPU's mask in the rcu_cpu_online_map.
1483 *
1484 * b) A CPU cannot go offline at this point in time since the user
1485 * does not have access to the sysfs interface, nor do we
1486 * suspend the system.
1487 */
1488 for_each_online_cpu(cpu)
1489 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
1490
1491 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1492}
1493
1494/*
1495 * Late-boot-time RCU initialization that must wait until after scheduler
1496 * has been initialized.
1497 */
1498void __init rcu_init_sched(void)
1499{
1500 rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period,
1501 NULL,
1502 "rcu_sched_grace_period");
1503 WARN_ON(IS_ERR(rcu_sched_grace_period_task));
1504}
1505
1506#ifdef CONFIG_RCU_TRACE
1507long *rcupreempt_flipctr(int cpu)
1508{
1509 return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
1510}
1511EXPORT_SYMBOL_GPL(rcupreempt_flipctr);
1512
1513int rcupreempt_flip_flag(int cpu)
1514{
1515 return per_cpu(rcu_flip_flag, cpu);
1516}
1517EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
1518
1519int rcupreempt_mb_flag(int cpu)
1520{
1521 return per_cpu(rcu_mb_flag, cpu);
1522}
1523EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
1524
1525char *rcupreempt_try_flip_state_name(void)
1526{
1527 return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
1528}
1529EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
1530
1531struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
1532{
1533 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1534
1535 return &rdp->trace;
1536}
1537EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
1538
1539#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
deleted file mode 100644
index 7c2665cac172..000000000000
--- a/kernel/rcupreempt_trace.c
+++ /dev/null
@@ -1,334 +0,0 @@
1/*
2 * Read-Copy Update tracing for realtime implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2006
19 *
20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU/ *.txt
24 *
25 */
26#include <linux/types.h>
27#include <linux/kernel.h>
28#include <linux/init.h>
29#include <linux/spinlock.h>
30#include <linux/smp.h>
31#include <linux/rcupdate.h>
32#include <linux/interrupt.h>
33#include <linux/sched.h>
34#include <asm/atomic.h>
35#include <linux/bitops.h>
36#include <linux/module.h>
37#include <linux/completion.h>
38#include <linux/moduleparam.h>
39#include <linux/percpu.h>
40#include <linux/notifier.h>
41#include <linux/cpu.h>
42#include <linux/mutex.h>
43#include <linux/rcupreempt_trace.h>
44#include <linux/debugfs.h>
45
46static struct mutex rcupreempt_trace_mutex;
47static char *rcupreempt_trace_buf;
48#define RCUPREEMPT_TRACE_BUF_SIZE 4096
49
50void rcupreempt_trace_move2done(struct rcupreempt_trace *trace)
51{
52 trace->done_length += trace->wait_length;
53 trace->done_add += trace->wait_length;
54 trace->wait_length = 0;
55}
56void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace)
57{
58 trace->wait_length += trace->next_length;
59 trace->wait_add += trace->next_length;
60 trace->next_length = 0;
61}
62void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace)
63{
64 atomic_inc(&trace->rcu_try_flip_1);
65}
66void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace)
67{
68 atomic_inc(&trace->rcu_try_flip_e1);
69}
70void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace)
71{
72 trace->rcu_try_flip_i1++;
73}
74void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace)
75{
76 trace->rcu_try_flip_ie1++;
77}
78void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace)
79{
80 trace->rcu_try_flip_g1++;
81}
82void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace)
83{
84 trace->rcu_try_flip_a1++;
85}
86void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace)
87{
88 trace->rcu_try_flip_ae1++;
89}
90void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace)
91{
92 trace->rcu_try_flip_a2++;
93}
94void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace)
95{
96 trace->rcu_try_flip_z1++;
97}
98void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace)
99{
100 trace->rcu_try_flip_ze1++;
101}
102void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace)
103{
104 trace->rcu_try_flip_z2++;
105}
106void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace)
107{
108 trace->rcu_try_flip_m1++;
109}
110void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace)
111{
112 trace->rcu_try_flip_me1++;
113}
114void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace)
115{
116 trace->rcu_try_flip_m2++;
117}
118void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace)
119{
120 trace->rcu_check_callbacks++;
121}
122void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace)
123{
124 trace->done_remove += trace->done_length;
125 trace->done_length = 0;
126}
127void rcupreempt_trace_invoke(struct rcupreempt_trace *trace)
128{
129 atomic_inc(&trace->done_invoked);
130}
131void rcupreempt_trace_next_add(struct rcupreempt_trace *trace)
132{
133 trace->next_add++;
134 trace->next_length++;
135}
136
137static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
138{
139 struct rcupreempt_trace *cp;
140 int cpu;
141
142 memset(sp, 0, sizeof(*sp));
143 for_each_possible_cpu(cpu) {
144 cp = rcupreempt_trace_cpu(cpu);
145 sp->next_length += cp->next_length;
146 sp->next_add += cp->next_add;
147 sp->wait_length += cp->wait_length;
148 sp->wait_add += cp->wait_add;
149 sp->done_length += cp->done_length;
150 sp->done_add += cp->done_add;
151 sp->done_remove += cp->done_remove;
152 atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked);
153 sp->rcu_check_callbacks += cp->rcu_check_callbacks;
154 atomic_add(atomic_read(&cp->rcu_try_flip_1),
155 &sp->rcu_try_flip_1);
156 atomic_add(atomic_read(&cp->rcu_try_flip_e1),
157 &sp->rcu_try_flip_e1);
158 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
159 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
160 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
161 sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1;
162 sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1;
163 sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2;
164 sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1;
165 sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1;
166 sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2;
167 sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1;
168 sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1;
169 sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2;
170 }
171}
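rcupreempt_trace_sum() above folds every CPU's private counters into a single snapshot so the debugfs files can print totals. A stand-alone sketch of that per-CPU aggregation, reduced to two counters (trace_cnt and its field names are invented for illustration):

#include <stdio.h>

#define NCPUS 4

struct trace_cnt {
        long next_add;
        long done_remove;
};

static struct trace_cnt per_cpu_trace[NCPUS];

/* Fold every CPU's counters into one snapshot, as the debugfs read does. */
static void trace_sum(struct trace_cnt *sum)
{
        int cpu;

        sum->next_add = 0;
        sum->done_remove = 0;
        for (cpu = 0; cpu < NCPUS; cpu++) {
                sum->next_add += per_cpu_trace[cpu].next_add;
                sum->done_remove += per_cpu_trace[cpu].done_remove;
        }
}

int main(void)
{
        struct trace_cnt sum;
        int cpu;

        for (cpu = 0; cpu < NCPUS; cpu++) {
                per_cpu_trace[cpu].next_add = cpu + 1;
                per_cpu_trace[cpu].done_remove = cpu;
        }
        trace_sum(&sum);
        printf("na=%ld dr=%ld\n", sum.next_add, sum.done_remove);
        return 0;
}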
172
173static ssize_t rcustats_read(struct file *filp, char __user *buffer,
174 size_t count, loff_t *ppos)
175{
176 struct rcupreempt_trace trace;
177 ssize_t bcount;
178 int cnt = 0;
179
180 rcupreempt_trace_sum(&trace);
181 mutex_lock(&rcupreempt_trace_mutex);
182 cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
183 "ggp=%ld rcc=%ld\n",
184 rcu_batches_completed(),
185 trace.rcu_check_callbacks);
186 snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
187 "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n"
188 "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n"
189 "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n",
190
191 trace.next_add, trace.next_length,
192 trace.wait_add, trace.wait_length,
193 trace.done_add, trace.done_length,
194 trace.done_remove, atomic_read(&trace.done_invoked),
195 atomic_read(&trace.rcu_try_flip_1),
196 atomic_read(&trace.rcu_try_flip_e1),
197 trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1,
198 trace.rcu_try_flip_g1,
199 trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1,
200 trace.rcu_try_flip_a2,
201 trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1,
202 trace.rcu_try_flip_z2,
203 trace.rcu_try_flip_m1, trace.rcu_try_flip_me1,
204 trace.rcu_try_flip_m2);
205 bcount = simple_read_from_buffer(buffer, count, ppos,
206 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
207 mutex_unlock(&rcupreempt_trace_mutex);
208 return bcount;
209}
210
211static ssize_t rcugp_read(struct file *filp, char __user *buffer,
212 size_t count, loff_t *ppos)
213{
214 long oldgp = rcu_batches_completed();
215 ssize_t bcount;
216
217 mutex_lock(&rcupreempt_trace_mutex);
218 synchronize_rcu();
219 snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE,
220 "oldggp=%ld newggp=%ld\n", oldgp, rcu_batches_completed());
221 bcount = simple_read_from_buffer(buffer, count, ppos,
222 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
223 mutex_unlock(&rcupreempt_trace_mutex);
224 return bcount;
225}
226
227static ssize_t rcuctrs_read(struct file *filp, char __user *buffer,
228 size_t count, loff_t *ppos)
229{
230 int cnt = 0;
231 int cpu;
232 int f = rcu_batches_completed() & 0x1;
233 ssize_t bcount;
234
235 mutex_lock(&rcupreempt_trace_mutex);
236
237 cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE,
238 "CPU last cur F M\n");
239 for_each_online_cpu(cpu) {
240 long *flipctr = rcupreempt_flipctr(cpu);
241 cnt += snprintf(&rcupreempt_trace_buf[cnt],
242 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
243 "%3d %4ld %3ld %d %d\n",
244 cpu,
245 flipctr[!f],
246 flipctr[f],
247 rcupreempt_flip_flag(cpu),
248 rcupreempt_mb_flag(cpu));
249 }
250 cnt += snprintf(&rcupreempt_trace_buf[cnt],
251 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
252 "ggp = %ld, state = %s\n",
253 rcu_batches_completed(),
254 rcupreempt_try_flip_state_name());
255 cnt += snprintf(&rcupreempt_trace_buf[cnt],
256 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
257 "\n");
258 bcount = simple_read_from_buffer(buffer, count, ppos,
259 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
260 mutex_unlock(&rcupreempt_trace_mutex);
261 return bcount;
262}
263
264static struct file_operations rcustats_fops = {
265 .owner = THIS_MODULE,
266 .read = rcustats_read,
267};
268
269static struct file_operations rcugp_fops = {
270 .owner = THIS_MODULE,
271 .read = rcugp_read,
272};
273
274static struct file_operations rcuctrs_fops = {
275 .owner = THIS_MODULE,
276 .read = rcuctrs_read,
277};
278
279static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir;
280static int rcupreempt_debugfs_init(void)
281{
282 rcudir = debugfs_create_dir("rcu", NULL);
283 if (!rcudir)
284 goto out;
285 statdir = debugfs_create_file("rcustats", 0444, rcudir,
286 NULL, &rcustats_fops);
287 if (!statdir)
288 goto free_out;
289
290 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
291 if (!gpdir)
292 goto free_out;
293
294 ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir,
295 NULL, &rcuctrs_fops);
296 if (!ctrsdir)
297 goto free_out;
298 return 0;
299free_out:
300 if (statdir)
301 debugfs_remove(statdir);
302 if (gpdir)
303 debugfs_remove(gpdir);
304 debugfs_remove(rcudir);
305out:
306 return 1;
307}
308
309static int __init rcupreempt_trace_init(void)
310{
311 int ret;
312
313 mutex_init(&rcupreempt_trace_mutex);
314 rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
315 if (!rcupreempt_trace_buf)
316 return 1;
317 ret = rcupreempt_debugfs_init();
318 if (ret)
319 kfree(rcupreempt_trace_buf);
320 return ret;
321}
322
323static void __exit rcupreempt_trace_cleanup(void)
324{
325 debugfs_remove(statdir);
326 debugfs_remove(gpdir);
327 debugfs_remove(ctrsdir);
328 debugfs_remove(rcudir);
329 kfree(rcupreempt_trace_buf);
330}
331
332
333module_init(rcupreempt_trace_init);
334module_exit(rcupreempt_trace_cleanup);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9b4a975a4b4a..b33db539a8ad 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -257,14 +257,14 @@ struct rcu_torture_ops {
257 void (*init)(void); 257 void (*init)(void);
258 void (*cleanup)(void); 258 void (*cleanup)(void);
259 int (*readlock)(void); 259 int (*readlock)(void);
260 void (*readdelay)(struct rcu_random_state *rrsp); 260 void (*read_delay)(struct rcu_random_state *rrsp);
261 void (*readunlock)(int idx); 261 void (*readunlock)(int idx);
262 int (*completed)(void); 262 int (*completed)(void);
263 void (*deferredfree)(struct rcu_torture *p); 263 void (*deferred_free)(struct rcu_torture *p);
264 void (*sync)(void); 264 void (*sync)(void);
265 void (*cb_barrier)(void); 265 void (*cb_barrier)(void);
266 int (*stats)(char *page); 266 int (*stats)(char *page);
267 int irqcapable; 267 int irq_capable;
268 char *name; 268 char *name;
269}; 269};
270static struct rcu_torture_ops *cur_ops = NULL; 270static struct rcu_torture_ops *cur_ops = NULL;
@@ -320,7 +320,7 @@ rcu_torture_cb(struct rcu_head *p)
320 rp->rtort_mbtest = 0; 320 rp->rtort_mbtest = 0;
321 rcu_torture_free(rp); 321 rcu_torture_free(rp);
322 } else 322 } else
323 cur_ops->deferredfree(rp); 323 cur_ops->deferred_free(rp);
324} 324}
325 325
326static void rcu_torture_deferred_free(struct rcu_torture *p) 326static void rcu_torture_deferred_free(struct rcu_torture *p)
@@ -329,18 +329,18 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
329} 329}
330 330
331static struct rcu_torture_ops rcu_ops = { 331static struct rcu_torture_ops rcu_ops = {
332 .init = NULL, 332 .init = NULL,
333 .cleanup = NULL, 333 .cleanup = NULL,
334 .readlock = rcu_torture_read_lock, 334 .readlock = rcu_torture_read_lock,
335 .readdelay = rcu_read_delay, 335 .read_delay = rcu_read_delay,
336 .readunlock = rcu_torture_read_unlock, 336 .readunlock = rcu_torture_read_unlock,
337 .completed = rcu_torture_completed, 337 .completed = rcu_torture_completed,
338 .deferredfree = rcu_torture_deferred_free, 338 .deferred_free = rcu_torture_deferred_free,
339 .sync = synchronize_rcu, 339 .sync = synchronize_rcu,
340 .cb_barrier = rcu_barrier, 340 .cb_barrier = rcu_barrier,
341 .stats = NULL, 341 .stats = NULL,
342 .irqcapable = 1, 342 .irq_capable = 1,
343 .name = "rcu" 343 .name = "rcu"
344}; 344};
345 345
346static void rcu_sync_torture_deferred_free(struct rcu_torture *p) 346static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
@@ -370,18 +370,18 @@ static void rcu_sync_torture_init(void)
370} 370}
371 371
372static struct rcu_torture_ops rcu_sync_ops = { 372static struct rcu_torture_ops rcu_sync_ops = {
373 .init = rcu_sync_torture_init, 373 .init = rcu_sync_torture_init,
374 .cleanup = NULL, 374 .cleanup = NULL,
375 .readlock = rcu_torture_read_lock, 375 .readlock = rcu_torture_read_lock,
376 .readdelay = rcu_read_delay, 376 .read_delay = rcu_read_delay,
377 .readunlock = rcu_torture_read_unlock, 377 .readunlock = rcu_torture_read_unlock,
378 .completed = rcu_torture_completed, 378 .completed = rcu_torture_completed,
379 .deferredfree = rcu_sync_torture_deferred_free, 379 .deferred_free = rcu_sync_torture_deferred_free,
380 .sync = synchronize_rcu, 380 .sync = synchronize_rcu,
381 .cb_barrier = NULL, 381 .cb_barrier = NULL,
382 .stats = NULL, 382 .stats = NULL,
383 .irqcapable = 1, 383 .irq_capable = 1,
384 .name = "rcu_sync" 384 .name = "rcu_sync"
385}; 385};
386 386
387/* 387/*
@@ -432,33 +432,33 @@ static void rcu_bh_torture_synchronize(void)
432} 432}
433 433
434static struct rcu_torture_ops rcu_bh_ops = { 434static struct rcu_torture_ops rcu_bh_ops = {
435 .init = NULL, 435 .init = NULL,
436 .cleanup = NULL, 436 .cleanup = NULL,
437 .readlock = rcu_bh_torture_read_lock, 437 .readlock = rcu_bh_torture_read_lock,
438 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 438 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
439 .readunlock = rcu_bh_torture_read_unlock, 439 .readunlock = rcu_bh_torture_read_unlock,
440 .completed = rcu_bh_torture_completed, 440 .completed = rcu_bh_torture_completed,
441 .deferredfree = rcu_bh_torture_deferred_free, 441 .deferred_free = rcu_bh_torture_deferred_free,
442 .sync = rcu_bh_torture_synchronize, 442 .sync = rcu_bh_torture_synchronize,
443 .cb_barrier = rcu_barrier_bh, 443 .cb_barrier = rcu_barrier_bh,
444 .stats = NULL, 444 .stats = NULL,
445 .irqcapable = 1, 445 .irq_capable = 1,
446 .name = "rcu_bh" 446 .name = "rcu_bh"
447}; 447};
448 448
449static struct rcu_torture_ops rcu_bh_sync_ops = { 449static struct rcu_torture_ops rcu_bh_sync_ops = {
450 .init = rcu_sync_torture_init, 450 .init = rcu_sync_torture_init,
451 .cleanup = NULL, 451 .cleanup = NULL,
452 .readlock = rcu_bh_torture_read_lock, 452 .readlock = rcu_bh_torture_read_lock,
453 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 453 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
454 .readunlock = rcu_bh_torture_read_unlock, 454 .readunlock = rcu_bh_torture_read_unlock,
455 .completed = rcu_bh_torture_completed, 455 .completed = rcu_bh_torture_completed,
456 .deferredfree = rcu_sync_torture_deferred_free, 456 .deferred_free = rcu_sync_torture_deferred_free,
457 .sync = rcu_bh_torture_synchronize, 457 .sync = rcu_bh_torture_synchronize,
458 .cb_barrier = NULL, 458 .cb_barrier = NULL,
459 .stats = NULL, 459 .stats = NULL,
460 .irqcapable = 1, 460 .irq_capable = 1,
461 .name = "rcu_bh_sync" 461 .name = "rcu_bh_sync"
462}; 462};
463 463
464/* 464/*
@@ -530,17 +530,17 @@ static int srcu_torture_stats(char *page)
530} 530}
531 531
532static struct rcu_torture_ops srcu_ops = { 532static struct rcu_torture_ops srcu_ops = {
533 .init = srcu_torture_init, 533 .init = srcu_torture_init,
534 .cleanup = srcu_torture_cleanup, 534 .cleanup = srcu_torture_cleanup,
535 .readlock = srcu_torture_read_lock, 535 .readlock = srcu_torture_read_lock,
536 .readdelay = srcu_read_delay, 536 .read_delay = srcu_read_delay,
537 .readunlock = srcu_torture_read_unlock, 537 .readunlock = srcu_torture_read_unlock,
538 .completed = srcu_torture_completed, 538 .completed = srcu_torture_completed,
539 .deferredfree = rcu_sync_torture_deferred_free, 539 .deferred_free = rcu_sync_torture_deferred_free,
540 .sync = srcu_torture_synchronize, 540 .sync = srcu_torture_synchronize,
541 .cb_barrier = NULL, 541 .cb_barrier = NULL,
542 .stats = srcu_torture_stats, 542 .stats = srcu_torture_stats,
543 .name = "srcu" 543 .name = "srcu"
544}; 544};
545 545
546/* 546/*
@@ -574,32 +574,49 @@ static void sched_torture_synchronize(void)
574} 574}
575 575
576static struct rcu_torture_ops sched_ops = { 576static struct rcu_torture_ops sched_ops = {
577 .init = rcu_sync_torture_init, 577 .init = rcu_sync_torture_init,
578 .cleanup = NULL, 578 .cleanup = NULL,
579 .readlock = sched_torture_read_lock, 579 .readlock = sched_torture_read_lock,
580 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 580 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
581 .readunlock = sched_torture_read_unlock, 581 .readunlock = sched_torture_read_unlock,
582 .completed = sched_torture_completed, 582 .completed = sched_torture_completed,
583 .deferredfree = rcu_sched_torture_deferred_free, 583 .deferred_free = rcu_sched_torture_deferred_free,
584 .sync = sched_torture_synchronize, 584 .sync = sched_torture_synchronize,
585 .cb_barrier = rcu_barrier_sched, 585 .cb_barrier = rcu_barrier_sched,
586 .stats = NULL, 586 .stats = NULL,
587 .irqcapable = 1, 587 .irq_capable = 1,
588 .name = "sched" 588 .name = "sched"
589}; 589};
590 590
591static struct rcu_torture_ops sched_ops_sync = { 591static struct rcu_torture_ops sched_ops_sync = {
592 .init = rcu_sync_torture_init, 592 .init = rcu_sync_torture_init,
593 .cleanup = NULL, 593 .cleanup = NULL,
594 .readlock = sched_torture_read_lock, 594 .readlock = sched_torture_read_lock,
595 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 595 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
596 .readunlock = sched_torture_read_unlock, 596 .readunlock = sched_torture_read_unlock,
597 .completed = sched_torture_completed, 597 .completed = sched_torture_completed,
598 .deferredfree = rcu_sync_torture_deferred_free, 598 .deferred_free = rcu_sync_torture_deferred_free,
599 .sync = sched_torture_synchronize, 599 .sync = sched_torture_synchronize,
600 .cb_barrier = NULL, 600 .cb_barrier = NULL,
601 .stats = NULL, 601 .stats = NULL,
602 .name = "sched_sync" 602 .name = "sched_sync"
603};
604
605extern int rcu_expedited_torture_stats(char *page);
606
607static struct rcu_torture_ops sched_expedited_ops = {
608 .init = rcu_sync_torture_init,
609 .cleanup = NULL,
610 .readlock = sched_torture_read_lock,
611 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
612 .readunlock = sched_torture_read_unlock,
613 .completed = sched_torture_completed,
614 .deferred_free = rcu_sync_torture_deferred_free,
615 .sync = synchronize_sched_expedited,
616 .cb_barrier = NULL,
617 .stats = rcu_expedited_torture_stats,
618 .irq_capable = 1,
619 .name = "sched_expedited"
603}; 620};
604 621
605/* 622/*
@@ -635,7 +652,7 @@ rcu_torture_writer(void *arg)
635 i = RCU_TORTURE_PIPE_LEN; 652 i = RCU_TORTURE_PIPE_LEN;
636 atomic_inc(&rcu_torture_wcount[i]); 653 atomic_inc(&rcu_torture_wcount[i]);
637 old_rp->rtort_pipe_count++; 654 old_rp->rtort_pipe_count++;
638 cur_ops->deferredfree(old_rp); 655 cur_ops->deferred_free(old_rp);
639 } 656 }
640 rcu_torture_current_version++; 657 rcu_torture_current_version++;
641 oldbatch = cur_ops->completed(); 658 oldbatch = cur_ops->completed();
@@ -700,7 +717,7 @@ static void rcu_torture_timer(unsigned long unused)
700 if (p->rtort_mbtest == 0) 717 if (p->rtort_mbtest == 0)
701 atomic_inc(&n_rcu_torture_mberror); 718 atomic_inc(&n_rcu_torture_mberror);
702 spin_lock(&rand_lock); 719 spin_lock(&rand_lock);
703 cur_ops->readdelay(&rand); 720 cur_ops->read_delay(&rand);
704 n_rcu_torture_timers++; 721 n_rcu_torture_timers++;
705 spin_unlock(&rand_lock); 722 spin_unlock(&rand_lock);
706 preempt_disable(); 723 preempt_disable();
@@ -738,11 +755,11 @@ rcu_torture_reader(void *arg)
738 755
739 VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); 756 VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
740 set_user_nice(current, 19); 757 set_user_nice(current, 19);
741 if (irqreader && cur_ops->irqcapable) 758 if (irqreader && cur_ops->irq_capable)
742 setup_timer_on_stack(&t, rcu_torture_timer, 0); 759 setup_timer_on_stack(&t, rcu_torture_timer, 0);
743 760
744 do { 761 do {
745 if (irqreader && cur_ops->irqcapable) { 762 if (irqreader && cur_ops->irq_capable) {
746 if (!timer_pending(&t)) 763 if (!timer_pending(&t))
747 mod_timer(&t, 1); 764 mod_timer(&t, 1);
748 } 765 }
@@ -757,7 +774,7 @@ rcu_torture_reader(void *arg)
757 } 774 }
758 if (p->rtort_mbtest == 0) 775 if (p->rtort_mbtest == 0)
759 atomic_inc(&n_rcu_torture_mberror); 776 atomic_inc(&n_rcu_torture_mberror);
760 cur_ops->readdelay(&rand); 777 cur_ops->read_delay(&rand);
761 preempt_disable(); 778 preempt_disable();
762 pipe_count = p->rtort_pipe_count; 779 pipe_count = p->rtort_pipe_count;
763 if (pipe_count > RCU_TORTURE_PIPE_LEN) { 780 if (pipe_count > RCU_TORTURE_PIPE_LEN) {
@@ -778,7 +795,7 @@ rcu_torture_reader(void *arg)
778 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 795 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
779 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 796 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
780 rcutorture_shutdown_absorb("rcu_torture_reader"); 797 rcutorture_shutdown_absorb("rcu_torture_reader");
781 if (irqreader && cur_ops->irqcapable) 798 if (irqreader && cur_ops->irq_capable)
782 del_timer_sync(&t); 799 del_timer_sync(&t);
783 while (!kthread_should_stop()) 800 while (!kthread_should_stop())
784 schedule_timeout_uninterruptible(1); 801 schedule_timeout_uninterruptible(1);
@@ -1078,6 +1095,7 @@ rcu_torture_init(void)
1078 int firsterr = 0; 1095 int firsterr = 0;
1079 static struct rcu_torture_ops *torture_ops[] = 1096 static struct rcu_torture_ops *torture_ops[] =
1080 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, 1097 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
1098 &sched_expedited_ops,
1081 &srcu_ops, &sched_ops, &sched_ops_sync, }; 1099 &srcu_ops, &sched_ops, &sched_ops_sync, };
1082 1100
1083 mutex_lock(&fullstop_mutex); 1101 mutex_lock(&fullstop_mutex);
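The rcutorture hunks above rename rcu_torture_ops fields and add a sched_expedited flavor; the whole test is driven through such a function-pointer table, selected by name at init time. A stand-alone sketch of that ops-table pattern (torture_ops, pick_ops and the dummy callbacks are invented names, not the rcutorture API):

#include <stdio.h>
#include <string.h>

/* A pared-down ops table in the style of rcu_torture_ops. */
struct torture_ops {
        int  (*readlock)(void);
        void (*readunlock)(int idx);
        int  irq_capable;
        const char *name;
};

static int  dummy_readlock(void)      { return 0; }
static void dummy_readunlock(int idx) { (void)idx; }

static struct torture_ops rcu_like_ops = {
        .readlock    = dummy_readlock,
        .readunlock  = dummy_readunlock,
        .irq_capable = 1,
        .name        = "rcu",
};

static struct torture_ops sched_like_ops = {
        .readlock    = dummy_readlock,
        .readunlock  = dummy_readunlock,
        .irq_capable = 1,
        .name        = "sched",
};

static struct torture_ops *all_ops[] = { &rcu_like_ops, &sched_like_ops };

/* Pick the flavor to torture by name, as rcu_torture_init() does. */
static struct torture_ops *pick_ops(const char *name)
{
        unsigned int i;

        for (i = 0; i < sizeof(all_ops) / sizeof(all_ops[0]); i++)
                if (strcmp(all_ops[i]->name, name) == 0)
                        return all_ops[i];
        return NULL;
}

int main(void)
{
        struct torture_ops *cur_ops = pick_ops("sched");

        if (cur_ops) {
                int idx = cur_ops->readlock();

                cur_ops->readunlock(idx);
                printf("torturing flavor \"%s\"\n", cur_ops->name);
        }
        return 0;
}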
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 7717b95c2027..71bc79791cd9 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -46,6 +46,8 @@
46#include <linux/mutex.h> 46#include <linux/mutex.h>
47#include <linux/time.h> 47#include <linux/time.h>
48 48
49#include "rcutree.h"
50
49#ifdef CONFIG_DEBUG_LOCK_ALLOC 51#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key; 52static struct lock_class_key rcu_lock_key;
51struct lockdep_map rcu_lock_map = 53struct lockdep_map rcu_lock_map =
@@ -72,30 +74,59 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
72 .n_force_qs_ngp = 0, \ 74 .n_force_qs_ngp = 0, \
73} 75}
74 76
75struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state); 77struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
76DEFINE_PER_CPU(struct rcu_data, rcu_data); 78DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
77 79
78struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 80struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 81DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
80 82
83extern long rcu_batches_completed_sched(void);
84static struct rcu_node *rcu_get_root(struct rcu_state *rsp);
85static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp,
86 struct rcu_node *rnp, unsigned long flags);
87static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags);
88#ifdef CONFIG_HOTPLUG_CPU
89static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp);
90#endif /* #ifdef CONFIG_HOTPLUG_CPU */
91static void __rcu_process_callbacks(struct rcu_state *rsp,
92 struct rcu_data *rdp);
93static void __call_rcu(struct rcu_head *head,
94 void (*func)(struct rcu_head *rcu),
95 struct rcu_state *rsp);
96static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp);
97static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp,
98 int preemptable);
99
100#include "rcutree_plugin.h"
101
81/* 102/*
82 * Increment the quiescent state counter. 103 * Note a quiescent state. Because we do not need to know
83 * The counter is a bit degenerated: We do not need to know
84 * how many quiescent states passed, just if there was at least 104 * how many quiescent states passed, just if there was at least
85 * one since the start of the grace period. Thus just a flag. 105 * one since the start of the grace period, this just sets a flag.
86 */ 106 */
87void rcu_qsctr_inc(int cpu) 107void rcu_sched_qs(int cpu)
88{ 108{
89 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); 109 unsigned long flags;
110 struct rcu_data *rdp;
111
112 local_irq_save(flags);
113 rdp = &per_cpu(rcu_sched_data, cpu);
90 rdp->passed_quiesc = 1; 114 rdp->passed_quiesc = 1;
91 rdp->passed_quiesc_completed = rdp->completed; 115 rdp->passed_quiesc_completed = rdp->completed;
116 rcu_preempt_qs(cpu);
117 local_irq_restore(flags);
92} 118}
93 119
94void rcu_bh_qsctr_inc(int cpu) 120void rcu_bh_qs(int cpu)
95{ 121{
96 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 122 unsigned long flags;
123 struct rcu_data *rdp;
124
125 local_irq_save(flags);
126 rdp = &per_cpu(rcu_bh_data, cpu);
97 rdp->passed_quiesc = 1; 127 rdp->passed_quiesc = 1;
98 rdp->passed_quiesc_completed = rdp->completed; 128 rdp->passed_quiesc_completed = rdp->completed;
129 local_irq_restore(flags);
99} 130}
100 131
101#ifdef CONFIG_NO_HZ 132#ifdef CONFIG_NO_HZ
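The renamed rcu_sched_qs()/rcu_bh_qs() functions above record a quiescent state as a flag plus a snapshot of ->completed; the snapshot is what lets a later grace period ignore a flag left over from an earlier one. A small standalone sketch of that idea follows (illustrative field names and check, not kernel code).

#include <stdio.h>

struct qs_record {
	long completed;			/* last grace period this CPU saw end   */
	int passed_quiesc;		/* flag: quiesced since the GP started?  */
	long passed_quiesc_completed;	/* value of ->completed when flag set    */
};

static void note_qs(struct qs_record *r)
{
	r->passed_quiesc = 1;
	r->passed_quiesc_completed = r->completed;
}

/* The flag only counts toward a grace period if it was set after that
 * grace period began, i.e. the snapshot is at least the GP's baseline. */
static int qs_counts(const struct qs_record *r, long gp_baseline)
{
	return r->passed_quiesc && r->passed_quiesc_completed >= gp_baseline;
}

int main(void)
{
	struct qs_record r = { .completed = 4 };

	note_qs(&r);
	printf("counts for the current GP: %d\n", qs_counts(&r, 4));
	printf("counts for a newer GP:     %d\n", qs_counts(&r, 5));
	return 0;
}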
@@ -110,15 +141,16 @@ static int qhimark = 10000; /* If this many pending, ignore blimit. */
110static int qlowmark = 100; /* Once only this many pending, use blimit. */ 141static int qlowmark = 100; /* Once only this many pending, use blimit. */
111 142
112static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 143static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
144static int rcu_pending(int cpu);
113 145
114/* 146/*
115 * Return the number of RCU batches processed thus far for debug & stats. 147 * Return the number of RCU-sched batches processed thus far for debug & stats.
116 */ 148 */
117long rcu_batches_completed(void) 149long rcu_batches_completed_sched(void)
118{ 150{
119 return rcu_state.completed; 151 return rcu_sched_state.completed;
120} 152}
121EXPORT_SYMBOL_GPL(rcu_batches_completed); 153EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
122 154
123/* 155/*
124 * Return the number of RCU BH batches processed thus far for debug & stats. 156 * Return the number of RCU BH batches processed thus far for debug & stats.
@@ -181,6 +213,10 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
181 return 1; 213 return 1;
182 } 214 }
183 215
216 /* If preemptable RCU, no point in sending reschedule IPI. */
217 if (rdp->preemptable)
218 return 0;
219
184 /* The CPU is online, so send it a reschedule IPI. */ 220 /* The CPU is online, so send it a reschedule IPI. */
185 if (rdp->cpu != smp_processor_id()) 221 if (rdp->cpu != smp_processor_id())
186 smp_send_reschedule(rdp->cpu); 222 smp_send_reschedule(rdp->cpu);
@@ -193,7 +229,6 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
193#endif /* #ifdef CONFIG_SMP */ 229#endif /* #ifdef CONFIG_SMP */
194 230
195#ifdef CONFIG_NO_HZ 231#ifdef CONFIG_NO_HZ
196static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5);
197 232
198/** 233/**
199 * rcu_enter_nohz - inform RCU that current CPU is entering nohz 234 * rcu_enter_nohz - inform RCU that current CPU is entering nohz
@@ -213,7 +248,7 @@ void rcu_enter_nohz(void)
213 rdtp = &__get_cpu_var(rcu_dynticks); 248 rdtp = &__get_cpu_var(rcu_dynticks);
214 rdtp->dynticks++; 249 rdtp->dynticks++;
215 rdtp->dynticks_nesting--; 250 rdtp->dynticks_nesting--;
216 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs); 251 WARN_ON_ONCE(rdtp->dynticks & 0x1);
217 local_irq_restore(flags); 252 local_irq_restore(flags);
218} 253}
219 254
@@ -232,7 +267,7 @@ void rcu_exit_nohz(void)
232 rdtp = &__get_cpu_var(rcu_dynticks); 267 rdtp = &__get_cpu_var(rcu_dynticks);
233 rdtp->dynticks++; 268 rdtp->dynticks++;
234 rdtp->dynticks_nesting++; 269 rdtp->dynticks_nesting++;
235 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs); 270 WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
236 local_irq_restore(flags); 271 local_irq_restore(flags);
237 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 272 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
238} 273}
@@ -251,7 +286,7 @@ void rcu_nmi_enter(void)
251 if (rdtp->dynticks & 0x1) 286 if (rdtp->dynticks & 0x1)
252 return; 287 return;
253 rdtp->dynticks_nmi++; 288 rdtp->dynticks_nmi++;
254 WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs); 289 WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1));
255 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 290 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
256} 291}
257 292
@@ -270,7 +305,7 @@ void rcu_nmi_exit(void)
270 return; 305 return;
271 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ 306 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
272 rdtp->dynticks_nmi++; 307 rdtp->dynticks_nmi++;
273 WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs); 308 WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1);
274} 309}
275 310
276/** 311/**
@@ -286,7 +321,7 @@ void rcu_irq_enter(void)
286 if (rdtp->dynticks_nesting++) 321 if (rdtp->dynticks_nesting++)
287 return; 322 return;
288 rdtp->dynticks++; 323 rdtp->dynticks++;
289 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs); 324 WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
290 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 325 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
291} 326}
292 327
@@ -305,10 +340,10 @@ void rcu_irq_exit(void)
305 return; 340 return;
306 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ 341 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
307 rdtp->dynticks++; 342 rdtp->dynticks++;
308 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs); 343 WARN_ON_ONCE(rdtp->dynticks & 0x1);
309 344
310 /* If the interrupt queued a callback, get out of dyntick mode. */ 345 /* If the interrupt queued a callback, get out of dyntick mode. */
311 if (__get_cpu_var(rcu_data).nxtlist || 346 if (__get_cpu_var(rcu_sched_data).nxtlist ||
312 __get_cpu_var(rcu_bh_data).nxtlist) 347 __get_cpu_var(rcu_bh_data).nxtlist)
313 set_need_resched(); 348 set_need_resched();
314} 349}
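The WARN_ON_ONCE() parity checks introduced above guard the dynticks counter convention: the counter is bumped on every idle/non-idle transition, so an even value means the CPU is in dynticks-idle (no RCU readers possible) and an odd value means it is not. A toy model of the convention, with assert() standing in for the one-shot warnings and simplified nesting handling:

#include <assert.h>
#include <stdio.h>

struct dynticks {
	int dynticks;	/* even: dynticks-idle, odd: not idle          */
	int nesting;	/* how many reasons we have to stay non-idle   */
};

static void enter_idle(struct dynticks *d)
{
	if (--d->nesting == 0) {
		d->dynticks++;			/* odd -> even */
		assert(!(d->dynticks & 0x1));	/* parity check, as above */
	}
}

static void exit_idle(struct dynticks *d)
{
	if (d->nesting++ == 0) {
		d->dynticks++;			/* even -> odd */
		assert(d->dynticks & 0x1);
	}
}

int main(void)
{
	struct dynticks d = { .dynticks = 1, .nesting = 1 };	/* booted non-idle */

	enter_idle(&d);
	printf("idle: counter=%d (even)\n", d.dynticks);
	exit_idle(&d);
	printf("busy: counter=%d (odd)\n", d.dynticks);
	return 0;
}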
@@ -461,6 +496,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
461 496
462 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 497 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
463 for (; rnp_cur < rnp_end; rnp_cur++) { 498 for (; rnp_cur < rnp_end; rnp_cur++) {
499 rcu_print_task_stall(rnp);
464 if (rnp_cur->qsmask == 0) 500 if (rnp_cur->qsmask == 0)
465 continue; 501 continue;
466 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++) 502 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++)
@@ -674,6 +710,19 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
674} 710}
675 711
676/* 712/*
713 * Clean up after the prior grace period and let rcu_start_gp() start up
714 * the next grace period if one is needed. Note that the caller must
715 * hold rnp->lock, as required by rcu_start_gp(), which will release it.
716 */
717static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
718 __releases(rnp->lock)
719{
720 rsp->completed = rsp->gpnum;
721 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
722 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
723}
724
725/*
677 * Similar to cpu_quiet(), for which it is a helper function. Allows 726 * Similar to cpu_quiet(), for which it is a helper function. Allows
678 * a group of CPUs to be quieted at one go, though all the CPUs in the 727 * a group of CPUs to be quieted at one go, though all the CPUs in the
679 * group must be represented by the same leaf rcu_node structure. 728 * group must be represented by the same leaf rcu_node structure.
@@ -694,7 +743,7 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
694 return; 743 return;
695 } 744 }
696 rnp->qsmask &= ~mask; 745 rnp->qsmask &= ~mask;
697 if (rnp->qsmask != 0) { 746 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
698 747
699 /* Other bits still set at this level, so done. */ 748 /* Other bits still set at this level, so done. */
700 spin_unlock_irqrestore(&rnp->lock, flags); 749 spin_unlock_irqrestore(&rnp->lock, flags);
@@ -714,14 +763,10 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
714 763
715 /* 764 /*
716 * Get here if we are the last CPU to pass through a quiescent 765 * Get here if we are the last CPU to pass through a quiescent
717 * state for this grace period. Clean up and let rcu_start_gp() 766 * state for this grace period. Invoke cpu_quiet_msk_finish()
718 * start up the next grace period if one is needed. Note that 767 * to clean up and start the next grace period if one is needed.
719 * we still hold rnp->lock, as required by rcu_start_gp(), which
720 * will release it.
721 */ 768 */
722 rsp->completed = rsp->gpnum; 769 cpu_quiet_msk_finish(rsp, flags); /* releases rnp->lock. */
723 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
724 rcu_start_gp(rsp, flags); /* releases rnp->lock. */
725} 770}
726 771
727/* 772/*
@@ -828,11 +873,12 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
828 spin_lock(&rnp->lock); /* irqs already disabled. */ 873 spin_lock(&rnp->lock); /* irqs already disabled. */
829 rnp->qsmaskinit &= ~mask; 874 rnp->qsmaskinit &= ~mask;
830 if (rnp->qsmaskinit != 0) { 875 if (rnp->qsmaskinit != 0) {
831 spin_unlock(&rnp->lock); /* irqs already disabled. */ 876 spin_unlock(&rnp->lock); /* irqs remain disabled. */
832 break; 877 break;
833 } 878 }
879 rcu_preempt_offline_tasks(rsp, rnp);
834 mask = rnp->grpmask; 880 mask = rnp->grpmask;
835 spin_unlock(&rnp->lock); /* irqs already disabled. */ 881 spin_unlock(&rnp->lock); /* irqs remain disabled. */
836 rnp = rnp->parent; 882 rnp = rnp->parent;
837 } while (rnp != NULL); 883 } while (rnp != NULL);
838 lastcomp = rsp->completed; 884 lastcomp = rsp->completed;
@@ -845,7 +891,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
845 /* 891 /*
846 * Move callbacks from the outgoing CPU to the running CPU. 892 * Move callbacks from the outgoing CPU to the running CPU.
847 * Note that the outgoing CPU is now quiescent, so it is now 893 * Note that the outgoing CPU is now quiescent, so it is now
848 * (uncharacteristically) safe to access it rcu_data structure. 894 * (uncharacteristically) safe to access its rcu_data structure.
849 * Note also that we must carefully retain the order of the 895 * Note also that we must carefully retain the order of the
850 * outgoing CPU's callbacks in order for rcu_barrier() to work 896 * outgoing CPU's callbacks in order for rcu_barrier() to work
851 * correctly. Finally, note that we start all the callbacks 897 * correctly. Finally, note that we start all the callbacks
@@ -876,8 +922,9 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
876 */ 922 */
877static void rcu_offline_cpu(int cpu) 923static void rcu_offline_cpu(int cpu)
878{ 924{
879 __rcu_offline_cpu(cpu, &rcu_state); 925 __rcu_offline_cpu(cpu, &rcu_sched_state);
880 __rcu_offline_cpu(cpu, &rcu_bh_state); 926 __rcu_offline_cpu(cpu, &rcu_bh_state);
927 rcu_preempt_offline_cpu(cpu);
881} 928}
882 929
883#else /* #ifdef CONFIG_HOTPLUG_CPU */ 930#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -963,6 +1010,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
963 */ 1010 */
964void rcu_check_callbacks(int cpu, int user) 1011void rcu_check_callbacks(int cpu, int user)
965{ 1012{
1013 if (!rcu_pending(cpu))
1014 return; /* if nothing for RCU to do. */
966 if (user || 1015 if (user ||
967 (idle_cpu(cpu) && rcu_scheduler_active && 1016 (idle_cpu(cpu) && rcu_scheduler_active &&
968 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 1017 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -971,17 +1020,16 @@ void rcu_check_callbacks(int cpu, int user)
971 * Get here if this CPU took its interrupt from user 1020 * Get here if this CPU took its interrupt from user
972 * mode or from the idle loop, and if this is not a 1021 * mode or from the idle loop, and if this is not a
973 * nested interrupt. In this case, the CPU is in 1022 * nested interrupt. In this case, the CPU is in
974 * a quiescent state, so count it. 1023 * a quiescent state, so note it.
975 * 1024 *
976 * No memory barrier is required here because both 1025 * No memory barrier is required here because both
977 * rcu_qsctr_inc() and rcu_bh_qsctr_inc() reference 1026 * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local
978 * only CPU-local variables that other CPUs neither 1027 * variables that other CPUs neither access nor modify,
979 * access nor modify, at least not while the corresponding 1028 * at least not while the corresponding CPU is online.
980 * CPU is online.
981 */ 1029 */
982 1030
983 rcu_qsctr_inc(cpu); 1031 rcu_sched_qs(cpu);
984 rcu_bh_qsctr_inc(cpu); 1032 rcu_bh_qs(cpu);
985 1033
986 } else if (!in_softirq()) { 1034 } else if (!in_softirq()) {
987 1035
@@ -989,11 +1037,12 @@ void rcu_check_callbacks(int cpu, int user)
989 * Get here if this CPU did not take its interrupt from 1037 * Get here if this CPU did not take its interrupt from
990 * softirq, in other words, if it is not interrupting 1038 * softirq, in other words, if it is not interrupting
991 * a rcu_bh read-side critical section. This is an _bh 1039 * a rcu_bh read-side critical section. This is an _bh
992 * critical section, so count it. 1040 * critical section, so note it.
993 */ 1041 */
994 1042
995 rcu_bh_qsctr_inc(cpu); 1043 rcu_bh_qs(cpu);
996 } 1044 }
1045 rcu_preempt_check_callbacks(cpu);
997 raise_softirq(RCU_SOFTIRQ); 1046 raise_softirq(RCU_SOFTIRQ);
998} 1047}
999 1048
@@ -1132,6 +1181,8 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1132{ 1181{
1133 unsigned long flags; 1182 unsigned long flags;
1134 1183
1184 WARN_ON_ONCE(rdp->beenonline == 0);
1185
1135 /* 1186 /*
1136 * If an RCU GP has gone long enough, go check for dyntick 1187 * If an RCU GP has gone long enough, go check for dyntick
1137 * idle CPUs and, if needed, send resched IPIs. 1188 * idle CPUs and, if needed, send resched IPIs.
@@ -1170,8 +1221,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
1170 */ 1221 */
1171 smp_mb(); /* See above block comment. */ 1222 smp_mb(); /* See above block comment. */
1172 1223
1173 __rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data)); 1224 __rcu_process_callbacks(&rcu_sched_state,
1225 &__get_cpu_var(rcu_sched_data));
1174 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1226 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1227 rcu_preempt_process_callbacks();
1175 1228
1176 /* 1229 /*
1177 * Memory references from any later RCU read-side critical sections 1230 * Memory references from any later RCU read-side critical sections
@@ -1227,13 +1280,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1227} 1280}
1228 1281
1229/* 1282/*
1230 * Queue an RCU callback for invocation after a grace period. 1283 * Queue an RCU-sched callback for invocation after a grace period.
1231 */ 1284 */
1232void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 1285void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1233{ 1286{
1234 __call_rcu(head, func, &rcu_state); 1287 __call_rcu(head, func, &rcu_sched_state);
1235} 1288}
1236EXPORT_SYMBOL_GPL(call_rcu); 1289EXPORT_SYMBOL_GPL(call_rcu_sched);
1237 1290
1238/* 1291/*
1239 * Queue an RCU for invocation after a quicker grace period. 1292 * Queue an RCU for invocation after a quicker grace period.
@@ -1305,10 +1358,11 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1305 * by the current CPU, returning 1 if so. This function is part of the 1358 * by the current CPU, returning 1 if so. This function is part of the
1306 * RCU implementation; it is -not- an exported member of the RCU API. 1359 * RCU implementation; it is -not- an exported member of the RCU API.
1307 */ 1360 */
1308int rcu_pending(int cpu) 1361static int rcu_pending(int cpu)
1309{ 1362{
1310 return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) || 1363 return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) ||
1311 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)); 1364 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) ||
1365 rcu_preempt_pending(cpu);
1312} 1366}
1313 1367
1314/* 1368/*
@@ -1320,27 +1374,46 @@ int rcu_pending(int cpu)
1320int rcu_needs_cpu(int cpu) 1374int rcu_needs_cpu(int cpu)
1321{ 1375{
1322 /* RCU callbacks either ready or pending? */ 1376 /* RCU callbacks either ready or pending? */
1323 return per_cpu(rcu_data, cpu).nxtlist || 1377 return per_cpu(rcu_sched_data, cpu).nxtlist ||
1324 per_cpu(rcu_bh_data, cpu).nxtlist; 1378 per_cpu(rcu_bh_data, cpu).nxtlist ||
1379 rcu_preempt_needs_cpu(cpu);
1325} 1380}
1326 1381
1327/* 1382/*
1328 * Initialize a CPU's per-CPU RCU data. We take this "scorched earth" 1383 * Do boot-time initialization of a CPU's per-CPU RCU data.
1329 * approach so that we don't have to worry about how long the CPU has
1330 * been gone, or whether it ever was online previously. We do trust the
1331 * ->mynode field, as it is constant for a given struct rcu_data and
1332 * initialized during early boot.
1333 *
1334 * Note that only one online or offline event can be happening at a given
1335 * time. Note also that we can accept some slop in the rsp->completed
1336 * access due to the fact that this CPU cannot possibly have any RCU
1337 * callbacks in flight yet.
1338 */ 1384 */
1339static void __cpuinit 1385static void __init
1340rcu_init_percpu_data(int cpu, struct rcu_state *rsp) 1386rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1341{ 1387{
1342 unsigned long flags; 1388 unsigned long flags;
1343 int i; 1389 int i;
1390 struct rcu_data *rdp = rsp->rda[cpu];
1391 struct rcu_node *rnp = rcu_get_root(rsp);
1392
1393 /* Set up local state, ensuring consistent view of global state. */
1394 spin_lock_irqsave(&rnp->lock, flags);
1395 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1396 rdp->nxtlist = NULL;
1397 for (i = 0; i < RCU_NEXT_SIZE; i++)
1398 rdp->nxttail[i] = &rdp->nxtlist;
1399 rdp->qlen = 0;
1400#ifdef CONFIG_NO_HZ
1401 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1402#endif /* #ifdef CONFIG_NO_HZ */
1403 rdp->cpu = cpu;
1404 spin_unlock_irqrestore(&rnp->lock, flags);
1405}
1406
1407/*
1408 * Initialize a CPU's per-CPU RCU data. Note that only one online or
1409 * offline event can be happening at a given time. Note also that we
1410 * can accept some slop in the rsp->completed access due to the fact
1411 * that this CPU cannot possibly have any RCU callbacks in flight yet.
1412 */
1413static void __cpuinit
1414rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1415{
1416 unsigned long flags;
1344 long lastcomp; 1417 long lastcomp;
1345 unsigned long mask; 1418 unsigned long mask;
1346 struct rcu_data *rdp = rsp->rda[cpu]; 1419 struct rcu_data *rdp = rsp->rda[cpu];
@@ -1354,17 +1427,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1354 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1427 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1355 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1428 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1356 rdp->beenonline = 1; /* We have now been online. */ 1429 rdp->beenonline = 1; /* We have now been online. */
1430 rdp->preemptable = preemptable;
1357 rdp->passed_quiesc_completed = lastcomp - 1; 1431 rdp->passed_quiesc_completed = lastcomp - 1;
1358 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1359 rdp->nxtlist = NULL;
1360 for (i = 0; i < RCU_NEXT_SIZE; i++)
1361 rdp->nxttail[i] = &rdp->nxtlist;
1362 rdp->qlen = 0;
1363 rdp->blimit = blimit; 1432 rdp->blimit = blimit;
1364#ifdef CONFIG_NO_HZ
1365 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1366#endif /* #ifdef CONFIG_NO_HZ */
1367 rdp->cpu = cpu;
1368 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1433 spin_unlock(&rnp->lock); /* irqs remain disabled. */
1369 1434
1370 /* 1435 /*
@@ -1405,16 +1470,16 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1405 1470
1406static void __cpuinit rcu_online_cpu(int cpu) 1471static void __cpuinit rcu_online_cpu(int cpu)
1407{ 1472{
1408 rcu_init_percpu_data(cpu, &rcu_state); 1473 rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
1409 rcu_init_percpu_data(cpu, &rcu_bh_state); 1474 rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
1410 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1475 rcu_preempt_init_percpu_data(cpu);
1411} 1476}
1412 1477
1413/* 1478/*
1414 * Handle CPU online/offline notifcation events. 1479 * Handle CPU online/offline notification events.
1415 */ 1480 */
1416static int __cpuinit rcu_cpu_notify(struct notifier_block *self, 1481int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1417 unsigned long action, void *hcpu) 1482 unsigned long action, void *hcpu)
1418{ 1483{
1419 long cpu = (long)hcpu; 1484 long cpu = (long)hcpu;
1420 1485
@@ -1486,6 +1551,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1486 rnp = rsp->level[i]; 1551 rnp = rsp->level[i];
1487 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1552 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1488 spin_lock_init(&rnp->lock); 1553 spin_lock_init(&rnp->lock);
1554 rnp->gpnum = 0;
1489 rnp->qsmask = 0; 1555 rnp->qsmask = 0;
1490 rnp->qsmaskinit = 0; 1556 rnp->qsmaskinit = 0;
1491 rnp->grplo = j * cpustride; 1557 rnp->grplo = j * cpustride;
@@ -1503,16 +1569,20 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1503 j / rsp->levelspread[i - 1]; 1569 j / rsp->levelspread[i - 1];
1504 } 1570 }
1505 rnp->level = i; 1571 rnp->level = i;
1572 INIT_LIST_HEAD(&rnp->blocked_tasks[0]);
1573 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1506 } 1574 }
1507 } 1575 }
1508} 1576}
1509 1577
1510/* 1578/*
1511 * Helper macro for __rcu_init(). To be used nowhere else! 1579 * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used
1512 * Assigns leaf node pointers into each CPU's rcu_data structure. 1580 * nowhere else! Assigns leaf node pointers into each CPU's rcu_data
1581 * structure.
1513 */ 1582 */
1514#define RCU_DATA_PTR_INIT(rsp, rcu_data) \ 1583#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1515do { \ 1584do { \
1585 rcu_init_one(rsp); \
1516 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \ 1586 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1517 j = 0; \ 1587 j = 0; \
1518 for_each_possible_cpu(i) { \ 1588 for_each_possible_cpu(i) { \
@@ -1520,32 +1590,43 @@ do { \
1520 j++; \ 1590 j++; \
1521 per_cpu(rcu_data, i).mynode = &rnp[j]; \ 1591 per_cpu(rcu_data, i).mynode = &rnp[j]; \
1522 (rsp)->rda[i] = &per_cpu(rcu_data, i); \ 1592 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1593 rcu_boot_init_percpu_data(i, rsp); \
1523 } \ 1594 } \
1524} while (0) 1595} while (0)
1525 1596
1526static struct notifier_block __cpuinitdata rcu_nb = { 1597#ifdef CONFIG_TREE_PREEMPT_RCU
1527 .notifier_call = rcu_cpu_notify, 1598
1528}; 1599void __init __rcu_init_preempt(void)
1600{
1601 int i; /* All used by RCU_INIT_FLAVOR(). */
1602 int j;
1603 struct rcu_node *rnp;
1604
1605 RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
1606}
1607
1608#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1609
1610void __init __rcu_init_preempt(void)
1611{
1612}
1613
1614#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1529 1615
1530void __init __rcu_init(void) 1616void __init __rcu_init(void)
1531{ 1617{
1532 int i; /* All used by RCU_DATA_PTR_INIT(). */ 1618 int i; /* All used by RCU_INIT_FLAVOR(). */
1533 int j; 1619 int j;
1534 struct rcu_node *rnp; 1620 struct rcu_node *rnp;
1535 1621
1536 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 1622 rcu_bootup_announce();
1537#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1623#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1538 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); 1624 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1539#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 1625#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1540 rcu_init_one(&rcu_state); 1626 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
1541 RCU_DATA_PTR_INIT(&rcu_state, rcu_data); 1627 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
1542 rcu_init_one(&rcu_bh_state); 1628 __rcu_init_preempt();
1543 RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data); 1629 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1544
1545 for_each_online_cpu(i)
1546 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
1547 /* Register notifier for non-boot CPUs */
1548 register_cpu_notifier(&rcu_nb);
1549} 1630}
1550 1631
1551module_param(blimit, int, 0); 1632module_param(blimit, int, 0);
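Most of the rcutree.c changes above thread a third flavor (rcu_preempt) through machinery that already existed; the heart of that machinery is the combining-tree report path in cpu_quiet_msk(). The sketch below restates that walk in standalone form with simplified structures (not the kernel's): clear this group's bit at the leaf, stop if siblings are still pending, otherwise keep propagating upward, and declare the grace period finished once the root's mask empties.

#include <stdio.h>

struct node {
	unsigned long qsmask;	/* children still blocking this GP  */
	unsigned long grpmask;	/* our bit in the parent's qsmask   */
	struct node *parent;
};

static void report_qs(struct node *rnp, unsigned long mask, int *gp_ended)
{
	for (;;) {
		rnp->qsmask &= ~mask;
		if (rnp->qsmask != 0)
			return;		/* siblings still pending: done for now */
		if (rnp->parent == NULL) {
			*gp_ended = 1;	/* root cleared: grace period complete  */
			return;
		}
		mask = rnp->grpmask;	/* report our whole group one level up  */
		rnp = rnp->parent;
	}
}

int main(void)
{
	struct node root = { .qsmask = 0x1 };
	struct node leaf = { .qsmask = 0x3, .grpmask = 0x1, .parent = &root };
	int done = 0;

	report_qs(&leaf, 0x1, &done);	/* CPU 0 reports; CPU 1 still pending */
	report_qs(&leaf, 0x2, &done);	/* CPU 1 reports; propagates to root  */
	printf("grace period ended: %d\n", done);
	return 0;
}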
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 5e872bbf07f5..bf8a6f9f134d 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -1,10 +1,259 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 *
19 * Copyright IBM Corporation, 2008
20 *
21 * Author: Ingo Molnar <mingo@elte.hu>
22 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */
24
25#include <linux/cache.h>
26#include <linux/spinlock.h>
27#include <linux/threads.h>
28#include <linux/cpumask.h>
29#include <linux/seqlock.h>
30
31/*
32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
33 * In theory, it should be possible to add more levels straightforwardly.
34 * In practice, this has not been tested, so there is probably some
35 * bug somewhere.
36 */
37#define MAX_RCU_LVLS 3
38#define RCU_FANOUT (CONFIG_RCU_FANOUT)
39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
41
42#if NR_CPUS <= RCU_FANOUT
43# define NUM_RCU_LVLS 1
44# define NUM_RCU_LVL_0 1
45# define NUM_RCU_LVL_1 (NR_CPUS)
46# define NUM_RCU_LVL_2 0
47# define NUM_RCU_LVL_3 0
48#elif NR_CPUS <= RCU_FANOUT_SQ
49# define NUM_RCU_LVLS 2
50# define NUM_RCU_LVL_0 1
51# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
52# define NUM_RCU_LVL_2 (NR_CPUS)
53# define NUM_RCU_LVL_3 0
54#elif NR_CPUS <= RCU_FANOUT_CUBE
55# define NUM_RCU_LVLS 3
56# define NUM_RCU_LVL_0 1
57# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
58# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
59# define NUM_RCU_LVL_3 NR_CPUS
60#else
61# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
62#endif /* #if (NR_CPUS) <= RCU_FANOUT */
63
64#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
65#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
66
67/*
68 * Dynticks per-CPU state.
69 */
70struct rcu_dynticks {
71 int dynticks_nesting; /* Track nesting level, sort of. */
72 int dynticks; /* Even value for dynticks-idle, else odd. */
73 int dynticks_nmi; /* Even value for either dynticks-idle or */
74 /* not in nmi handler, else odd. So this */
75 /* remains even for nmi from irq handler. */
76};
77
78/*
79 * Definition for node within the RCU grace-period-detection hierarchy.
80 */
81struct rcu_node {
82 spinlock_t lock;
83 long gpnum; /* Current grace period for this node. */
84 /* This will either be equal to or one */
85 /* behind the root rcu_node's gpnum. */
86 unsigned long qsmask; /* CPUs or groups that need to switch in */
87 /* order for current grace period to proceed.*/
88 unsigned long qsmaskinit;
89 /* Per-GP initialization for qsmask. */
90 unsigned long grpmask; /* Mask to apply to parent qsmask. */
91 int grplo; /* lowest-numbered CPU or group here. */
92 int grphi; /* highest-numbered CPU or group here. */
93 u8 grpnum; /* CPU/group number for next level up. */
94 u8 level; /* root is at level 0. */
95 struct rcu_node *parent;
96 struct list_head blocked_tasks[2];
97 /* Tasks blocked in RCU read-side critsect. */
98} ____cacheline_internodealigned_in_smp;
99
100/* Index values for nxttail array in struct rcu_data. */
101#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
102#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
103#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */
104#define RCU_NEXT_TAIL 3
105#define RCU_NEXT_SIZE 4
106
107/* Per-CPU data for read-copy update. */
108struct rcu_data {
109 /* 1) quiescent-state and grace-period handling : */
110 long completed; /* Track rsp->completed gp number */
111 /* in order to detect GP end. */
112 long gpnum; /* Highest gp number that this CPU */
113 /* is aware of having started. */
114 long passed_quiesc_completed;
115 /* Value of completed at time of qs. */
116 bool passed_quiesc; /* User-mode/idle loop etc. */
117 bool qs_pending; /* Core waits for quiesc state. */
118 bool beenonline; /* CPU online at least once. */
119 bool preemptable; /* Preemptable RCU? */
120 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
121 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
122
123 /* 2) batch handling */
124 /*
125 * If nxtlist is not NULL, it is partitioned as follows.
126 * Any of the partitions might be empty, in which case the
127 * pointer to that partition will be equal to the pointer for
128 * the following partition. When the list is empty, all of
129 * the nxttail elements point to nxtlist, which is NULL.
130 *
131 * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
132 * Entries that might have arrived after current GP ended
133 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
134 * Entries known to have arrived before current GP ended
135 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
136 * Entries that batch # <= ->completed - 1: waiting for current GP
137 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
138 * Entries that batch # <= ->completed
139 * The grace period for these entries has completed, and
140 * the other grace-period-completed entries may be moved
141 * here temporarily in rcu_process_callbacks().
142 */
143 struct rcu_head *nxtlist;
144 struct rcu_head **nxttail[RCU_NEXT_SIZE];
145 long qlen; /* # of queued callbacks */
146 long blimit; /* Upper limit on a processed batch */
147
148#ifdef CONFIG_NO_HZ
149 /* 3) dynticks interface. */
150 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
151 int dynticks_snap; /* Per-GP tracking for dynticks. */
152 int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */
153#endif /* #ifdef CONFIG_NO_HZ */
154
155 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
156#ifdef CONFIG_NO_HZ
157 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
158#endif /* #ifdef CONFIG_NO_HZ */
159 unsigned long offline_fqs; /* Kicked due to being offline. */
160 unsigned long resched_ipi; /* Sent a resched IPI. */
161
162 /* 5) __rcu_pending() statistics. */
163 long n_rcu_pending; /* rcu_pending() calls since boot. */
164 long n_rp_qs_pending;
165 long n_rp_cb_ready;
166 long n_rp_cpu_needs_gp;
167 long n_rp_gp_completed;
168 long n_rp_gp_started;
169 long n_rp_need_fqs;
170 long n_rp_need_nothing;
171
172 int cpu;
173};
174
175/* Values for signaled field in struct rcu_state. */
176#define RCU_GP_INIT 0 /* Grace period being initialized. */
177#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */
178#define RCU_FORCE_QS 2 /* Need to force quiescent state. */
179#ifdef CONFIG_NO_HZ
180#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
181#else /* #ifdef CONFIG_NO_HZ */
182#define RCU_SIGNAL_INIT RCU_FORCE_QS
183#endif /* #else #ifdef CONFIG_NO_HZ */
184
185#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
186#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
187#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */
188#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */
189#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
190 /* to take at least one */
191 /* scheduling clock irq */
192 /* before ratting on them. */
193
194#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
195
196/*
197 * RCU global state, including node hierarchy. This hierarchy is
198 * represented in "heap" form in a dense array. The root (first level)
199 * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
200 * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
201 * and the third level in ->node[m+1] and following (->node[m+1] referenced
202 * by ->level[2]). The number of levels is determined by the number of
203 * CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy"
204 * consisting of a single rcu_node.
205 */
206struct rcu_state {
207 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
208 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
209 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
210 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
211 struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */
212
213 /* The following fields are guarded by the root rcu_node's lock. */
214
215 u8 signaled ____cacheline_internodealigned_in_smp;
216 /* Force QS state. */
217 long gpnum; /* Current gp number. */
218 long completed; /* # of last completed gp. */
219 spinlock_t onofflock; /* exclude on/offline and */
220 /* starting new GP. */
221 spinlock_t fqslock; /* Only one task forcing */
222 /* quiescent states. */
223 unsigned long jiffies_force_qs; /* Time at which to invoke */
224 /* force_quiescent_state(). */
225 unsigned long n_force_qs; /* Number of calls to */
226 /* force_quiescent_state(). */
227 unsigned long n_force_qs_lh; /* ~Number of calls leaving */
228 /* due to lock unavailable. */
229 unsigned long n_force_qs_ngp; /* Number of calls leaving */
230 /* due to no GP active. */
231#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
232 unsigned long gp_start; /* Time at which GP started, */
233 /* but in jiffies. */
234 unsigned long jiffies_stall; /* Time at which to check */
235 /* for CPU stalls. */
236#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
237#ifdef CONFIG_NO_HZ
238 long dynticks_completed; /* Value of completed @ snap. */
239#endif /* #ifdef CONFIG_NO_HZ */
240};
241
242#ifdef RCU_TREE_NONCORE
1 243
2/* 244/*
3 * RCU implementation internal declarations: 245 * RCU implementation internal declarations:
4 */ 246 */
5extern struct rcu_state rcu_state; 247extern struct rcu_state rcu_sched_state;
6DECLARE_PER_CPU(struct rcu_data, rcu_data); 248DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
7 249
8extern struct rcu_state rcu_bh_state; 250extern struct rcu_state rcu_bh_state;
9DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); 251DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
10 252
253#ifdef CONFIG_TREE_PREEMPT_RCU
254extern struct rcu_state rcu_preempt_state;
255DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
256#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
257
258#endif /* #ifdef RCU_TREE_NONCORE */
259
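To make the fanout arithmetic in the new header concrete, the following standalone recomputation shows the common two-level case: with CONFIG_RCU_FANOUT=64 and NR_CPUS=4096, NR_CPUS fits within RCU_FANOUT_SQ, so the tree needs one root plus 64 leaf rcu_node structures (65 nodes total, matching NUM_RCU_NODES = RCU_SUM - NR_CPUS). This is a check of the math only, not kernel code.

#include <stdio.h>

#define RCU_FANOUT	64
#define RCU_FANOUT_SQ	(RCU_FANOUT * RCU_FANOUT)
#define NR_CPUS		4096

int main(void)
{
	if (NR_CPUS <= RCU_FANOUT) {
		printf("1 rcu_node covers all %d CPUs\n", NR_CPUS);
	} else if (NR_CPUS <= RCU_FANOUT_SQ) {
		int leaves = (NR_CPUS + RCU_FANOUT - 1) / RCU_FANOUT;

		printf("1 root + %d leaf rcu_node structures for %d CPUs\n",
		       leaves, NR_CPUS);
	} else {
		printf("three levels needed (or raise CONFIG_RCU_FANOUT)\n");
	}
	return 0;
}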
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
new file mode 100644
index 000000000000..47789369ea59
--- /dev/null
+++ b/kernel/rcutree_plugin.h
@@ -0,0 +1,532 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright Red Hat, 2009
21 * Copyright IBM Corporation, 2009
22 *
23 * Author: Ingo Molnar <mingo@elte.hu>
24 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
25 */
26
27
28#ifdef CONFIG_TREE_PREEMPT_RCU
29
30struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
31DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
32
33/*
34 * Tell them what RCU they are running.
35 */
36static inline void rcu_bootup_announce(void)
37{
38 printk(KERN_INFO
39 "Experimental preemptable hierarchical RCU implementation.\n");
40}
41
42/*
43 * Return the number of RCU-preempt batches processed thus far
44 * for debug and statistics.
45 */
46long rcu_batches_completed_preempt(void)
47{
48 return rcu_preempt_state.completed;
49}
50EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
51
52/*
53 * Return the number of RCU batches processed thus far for debug & stats.
54 */
55long rcu_batches_completed(void)
56{
57 return rcu_batches_completed_preempt();
58}
59EXPORT_SYMBOL_GPL(rcu_batches_completed);
60
61/*
62 * Record a preemptable-RCU quiescent state for the specified CPU. Note
63 * that this just means that the task currently running on the CPU is
64 * not in a quiescent state. There might be any number of tasks blocked
65 * while in an RCU read-side critical section.
66 */
67static void rcu_preempt_qs_record(int cpu)
68{
69 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
70 rdp->passed_quiesc = 1;
71 rdp->passed_quiesc_completed = rdp->completed;
72}
73
74/*
75 * We have entered the scheduler or are between softirqs in ksoftirqd.
76 * If we are in an RCU read-side critical section, we need to reflect
77 * that in the state of the rcu_node structure corresponding to this CPU.
78 * Caller must disable hardirqs.
79 */
80static void rcu_preempt_qs(int cpu)
81{
82 struct task_struct *t = current;
83 int phase;
84 struct rcu_data *rdp;
85 struct rcu_node *rnp;
86
87 if (t->rcu_read_lock_nesting &&
88 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
89
90 /* Possibly blocking in an RCU read-side critical section. */
91 rdp = rcu_preempt_state.rda[cpu];
92 rnp = rdp->mynode;
93 spin_lock(&rnp->lock);
94 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
95 t->rcu_blocked_node = rnp;
96
97 /*
98 * If this CPU has already checked in, then this task
99 * will hold up the next grace period rather than the
100 * current grace period. Queue the task accordingly.
101 * If the task is queued for the current grace period
102 * (i.e., this CPU has not yet passed through a quiescent
103 * state for the current grace period), then as long
104 * as that task remains queued, the current grace period
105 * cannot end.
106 */
107 phase = !(rnp->qsmask & rdp->grpmask) ^ (rnp->gpnum & 0x1);
108 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
109 smp_mb(); /* Ensure later ctxt swtch seen after above. */
110 spin_unlock(&rnp->lock);
111 }
112
113 /*
114 * Either we were not in an RCU read-side critical section to
115 * begin with, or we have now recorded that critical section
116 * globally. Either way, we can now note a quiescent state
117 * for this CPU. Again, if we were in an RCU read-side critical
118 * section, and if that critical section was blocking the current
119 * grace period, then the fact that the task has been enqueued
120 * means that we continue to block the current grace period.
121 */
122 rcu_preempt_qs_record(cpu);
123 t->rcu_read_unlock_special &= ~(RCU_READ_UNLOCK_NEED_QS |
124 RCU_READ_UNLOCK_GOT_QS);
125}
126
127/*
128 * Tree-preemptable RCU implementation for rcu_read_lock().
129 * Just increment ->rcu_read_lock_nesting, shared state will be updated
130 * if we block.
131 */
132void __rcu_read_lock(void)
133{
134 ACCESS_ONCE(current->rcu_read_lock_nesting)++;
135 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
136}
137EXPORT_SYMBOL_GPL(__rcu_read_lock);
138
139static void rcu_read_unlock_special(struct task_struct *t)
140{
141 int empty;
142 unsigned long flags;
143 unsigned long mask;
144 struct rcu_node *rnp;
145 int special;
146
147 /* NMI handlers cannot block and cannot safely manipulate state. */
148 if (in_nmi())
149 return;
150
151 local_irq_save(flags);
152
153 /*
154 * If RCU core is waiting for this CPU to exit critical section,
155 * let it know that we have done so.
156 */
157 special = t->rcu_read_unlock_special;
158 if (special & RCU_READ_UNLOCK_NEED_QS) {
159 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
160 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_GOT_QS;
161 }
162
163 /* Hardware IRQ handlers cannot block. */
164 if (in_irq()) {
165 local_irq_restore(flags);
166 return;
167 }
168
169 /* Clean up if blocked during RCU read-side critical section. */
170 if (special & RCU_READ_UNLOCK_BLOCKED) {
171 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
172
173 /*
174 * Remove this task from the list it blocked on. The
175 * task can migrate while we acquire the lock, but at
176 * most one time. So at most two passes through loop.
177 */
178 for (;;) {
179 rnp = t->rcu_blocked_node;
180 spin_lock(&rnp->lock);
181 if (rnp == t->rcu_blocked_node)
182 break;
183 spin_unlock(&rnp->lock);
184 }
185 empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
186 list_del_init(&t->rcu_node_entry);
187 t->rcu_blocked_node = NULL;
188
189 /*
190 * If this was the last task on the current list, and if
191 * we aren't waiting on any CPUs, report the quiescent state.
192 * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk()
193 * drop rnp->lock and restore irq.
194 */
195 if (!empty && rnp->qsmask == 0 &&
196 list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) {
197 t->rcu_read_unlock_special &=
198 ~(RCU_READ_UNLOCK_NEED_QS |
199 RCU_READ_UNLOCK_GOT_QS);
200 if (rnp->parent == NULL) {
201 /* Only one rcu_node in the tree. */
202 cpu_quiet_msk_finish(&rcu_preempt_state, flags);
203 return;
204 }
205 /* Report up the rest of the hierarchy. */
206 mask = rnp->grpmask;
207 spin_unlock_irqrestore(&rnp->lock, flags);
208 rnp = rnp->parent;
209 spin_lock_irqsave(&rnp->lock, flags);
210 cpu_quiet_msk(mask, &rcu_preempt_state, rnp, flags);
211 return;
212 }
213 spin_unlock(&rnp->lock);
214 }
215 local_irq_restore(flags);
216}
217
218/*
219 * Tree-preemptable RCU implementation for rcu_read_unlock().
220 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
221 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
222 * invoke rcu_read_unlock_special() to clean up after a context switch
223 * in an RCU read-side critical section and other special cases.
224 */
225void __rcu_read_unlock(void)
226{
227 struct task_struct *t = current;
228
229 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
230 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
231 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
232 rcu_read_unlock_special(t);
233}
234EXPORT_SYMBOL_GPL(__rcu_read_unlock);
235
236#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
237
238/*
239 * Scan the current list of tasks blocked within RCU read-side critical
240 * sections, printing out the tid of each.
241 */
242static void rcu_print_task_stall(struct rcu_node *rnp)
243{
244 unsigned long flags;
245 struct list_head *lp;
246 int phase = rnp->gpnum & 0x1;
247 struct task_struct *t;
248
249 if (!list_empty(&rnp->blocked_tasks[phase])) {
250 spin_lock_irqsave(&rnp->lock, flags);
251 phase = rnp->gpnum & 0x1; /* re-read under lock. */
252 lp = &rnp->blocked_tasks[phase];
253 list_for_each_entry(t, lp, rcu_node_entry)
254 printk(" P%d", t->pid);
255 spin_unlock_irqrestore(&rnp->lock, flags);
256 }
257}
258
259#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
260
261/*
262 * Check for preempted RCU readers for the specified rcu_node structure.
263 * If the caller needs a reliable answer, it must hold the rcu_node's
264 * ->lock.
265 */
266static int rcu_preempted_readers(struct rcu_node *rnp)
267{
268 return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
269}
270
271#ifdef CONFIG_HOTPLUG_CPU
272
273/*
274 * Handle tasklist migration for case in which all CPUs covered by the
275 * specified rcu_node have gone offline. Move them up to the root
276 * rcu_node. The reason for not just moving them to the immediate
277 * parent is to remove the need for rcu_read_unlock_special() to
278 * make more than two attempts to acquire the target rcu_node's lock.
279 *
280 * The caller must hold rnp->lock with irqs disabled.
281 */
282static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
283 struct rcu_node *rnp)
284{
285 int i;
286 struct list_head *lp;
287 struct list_head *lp_root;
288 struct rcu_node *rnp_root = rcu_get_root(rsp);
289 struct task_struct *tp;
290
291 if (rnp == rnp_root) {
292 WARN_ONCE(1, "Last CPU thought to be offlined?");
293 return; /* Shouldn't happen: at least one CPU online. */
294 }
295
296 /*
297 * Move tasks up to root rcu_node. Rely on the fact that the
298 * root rcu_node can be at most one ahead of the rest of the
299 * rcu_nodes in terms of gp_num value. This fact allows us to
300 * move the blocked_tasks[] array directly, element by element.
301 */
302 for (i = 0; i < 2; i++) {
303 lp = &rnp->blocked_tasks[i];
304 lp_root = &rnp_root->blocked_tasks[i];
305 while (!list_empty(lp)) {
306 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry);
307 spin_lock(&rnp_root->lock); /* irqs already disabled */
308 list_del(&tp->rcu_node_entry);
309 tp->rcu_blocked_node = rnp_root;
310 list_add(&tp->rcu_node_entry, lp_root);
311 spin_unlock(&rnp_root->lock); /* irqs remain disabled */
312 }
313 }
314}
315
316/*
317 * Do CPU-offline processing for preemptable RCU.
318 */
319static void rcu_preempt_offline_cpu(int cpu)
320{
321 __rcu_offline_cpu(cpu, &rcu_preempt_state);
322}
323
324#endif /* #ifdef CONFIG_HOTPLUG_CPU */
325
326/*
327 * Check for a quiescent state from the current CPU. When a task blocks,
328 * the task is recorded in the corresponding CPU's rcu_node structure,
329 * which is checked elsewhere.
330 *
331 * Caller must disable hard irqs.
332 */
333static void rcu_preempt_check_callbacks(int cpu)
334{
335 struct task_struct *t = current;
336
337 if (t->rcu_read_lock_nesting == 0) {
338 t->rcu_read_unlock_special &=
339 ~(RCU_READ_UNLOCK_NEED_QS | RCU_READ_UNLOCK_GOT_QS);
340 rcu_preempt_qs_record(cpu);
341 return;
342 }
343 if (per_cpu(rcu_preempt_data, cpu).qs_pending) {
344 if (t->rcu_read_unlock_special & RCU_READ_UNLOCK_GOT_QS) {
345 rcu_preempt_qs_record(cpu);
346 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_GOT_QS;
347 } else if (!(t->rcu_read_unlock_special &
348 RCU_READ_UNLOCK_NEED_QS)) {
349 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
350 }
351 }
352}
353
354/*
355 * Process callbacks for preemptable RCU.
356 */
357static void rcu_preempt_process_callbacks(void)
358{
359 __rcu_process_callbacks(&rcu_preempt_state,
360 &__get_cpu_var(rcu_preempt_data));
361}
362
363/*
364 * Queue a preemptable-RCU callback for invocation after a grace period.
365 */
366void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
367{
368 __call_rcu(head, func, &rcu_preempt_state);
369}
370EXPORT_SYMBOL_GPL(call_rcu);
371
372/*
373 * Check to see if there is any immediate preemptable-RCU-related work
374 * to be done.
375 */
376static int rcu_preempt_pending(int cpu)
377{
378 return __rcu_pending(&rcu_preempt_state,
379 &per_cpu(rcu_preempt_data, cpu));
380}
381
382/*
383 * Does preemptable RCU need the CPU to stay out of dynticks mode?
384 */
385static int rcu_preempt_needs_cpu(int cpu)
386{
387 return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
388}
389
390/*
391 * Initialize preemptable RCU's per-CPU data.
392 */
393static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
394{
395 rcu_init_percpu_data(cpu, &rcu_preempt_state, 1);
396}
397
398/*
399 * Check for a task exiting while in a preemptable-RCU read-side
400 * critical section, clean up if so. No need to issue warnings,
401 * as debug_check_no_locks_held() already does this if lockdep
402 * is enabled.
403 */
404void exit_rcu(void)
405{
406 struct task_struct *t = current;
407
408 if (t->rcu_read_lock_nesting == 0)
409 return;
410 t->rcu_read_lock_nesting = 1;
411 rcu_read_unlock();
412}
413
414#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
415
416/*
417 * Tell them what RCU they are running.
418 */
419static inline void rcu_bootup_announce(void)
420{
421 printk(KERN_INFO "Hierarchical RCU implementation.\n");
422}
423
424/*
425 * Return the number of RCU batches processed thus far for debug & stats.
426 */
427long rcu_batches_completed(void)
428{
429 return rcu_batches_completed_sched();
430}
431EXPORT_SYMBOL_GPL(rcu_batches_completed);
432
433/*
434 * Because preemptable RCU does not exist, we never have to check for
435 * CPUs being in quiescent states.
436 */
437static void rcu_preempt_qs(int cpu)
438{
439}
440
441#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
442
443/*
444 * Because preemptable RCU does not exist, we never have to check for
445 * tasks blocked within RCU read-side critical sections.
446 */
447static void rcu_print_task_stall(struct rcu_node *rnp)
448{
449}
450
451#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
452
453/*
454 * Because preemptable RCU does not exist, there are never any preempted
455 * RCU readers.
456 */
457static int rcu_preempted_readers(struct rcu_node *rnp)
458{
459 return 0;
460}
461
462#ifdef CONFIG_HOTPLUG_CPU
463
464/*
465 * Because preemptable RCU does not exist, it never needs to migrate
466 * tasks that were blocked within RCU read-side critical sections.
467 */
468static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
469 struct rcu_node *rnp)
470{
471}
472
473/*
474 * Because preemptable RCU does not exist, it never needs CPU-offline
475 * processing.
476 */
477static void rcu_preempt_offline_cpu(int cpu)
478{
479}
480
481#endif /* #ifdef CONFIG_HOTPLUG_CPU */
482
483/*
484 * Because preemptable RCU does not exist, it never has any callbacks
485 * to check.
486 */
487void rcu_preempt_check_callbacks(int cpu)
488{
489}
490
491/*
492 * Because preemptable RCU does not exist, it never has any callbacks
493 * to process.
494 */
495void rcu_preempt_process_callbacks(void)
496{
497}
498
499/*
500 * In classic RCU, call_rcu() is just call_rcu_sched().
501 */
502void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
503{
504 call_rcu_sched(head, func);
505}
506EXPORT_SYMBOL_GPL(call_rcu);
507
508/*
509 * Because preemptable RCU does not exist, it never has any work to do.
510 */
511static int rcu_preempt_pending(int cpu)
512{
513 return 0;
514}
515
516/*
517 * Because preemptable RCU does not exist, it never needs any CPU.
518 */
519static int rcu_preempt_needs_cpu(int cpu)
520{
521 return 0;
522}
523
524/*
525 * Because preemptable RCU does not exist, there is no per-CPU
526 * data to initialize.
527 */
528static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
529{
530}
531
532#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
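One subtle piece of the new plugin is the two-element blocked_tasks[] array and the phase computation in rcu_preempt_qs(): the grace period currently in progress waits only on the list indexed by (gpnum & 0x1), so a task preempted on a CPU that still owes a quiescent state goes on that list, while a task preempted on a CPU that has already checked in can delay only the next grace period and goes on the other list. A standalone recomputation of that index (illustrative values, not kernel code):

#include <stdio.h>

static int queue_phase(unsigned long qsmask, unsigned long grpmask, long gpnum)
{
	return !(qsmask & grpmask) ^ (gpnum & 0x1);
}

int main(void)
{
	long gpnum = 7;		/* current GP waits on list (7 & 0x1) == 1 */

	/* CPU still owes a quiescent state: task blocks the current GP. */
	printf("still pending -> list %d\n", queue_phase(0x2, 0x2, gpnum));
	/* CPU already checked in: task only blocks the next GP. */
	printf("already quiet -> list %d\n", queue_phase(0x0, 0x2, gpnum));
	return 0;
}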
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index fe1dcdbf1ca3..0ea1bff69727 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -43,6 +43,7 @@
43#include <linux/debugfs.h> 43#include <linux/debugfs.h>
44#include <linux/seq_file.h> 44#include <linux/seq_file.h>
45 45
46#define RCU_TREE_NONCORE
46#include "rcutree.h" 47#include "rcutree.h"
47 48
48static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 49static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
@@ -76,8 +77,12 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
76 77
77static int show_rcudata(struct seq_file *m, void *unused) 78static int show_rcudata(struct seq_file *m, void *unused)
78{ 79{
79 seq_puts(m, "rcu:\n"); 80#ifdef CONFIG_TREE_PREEMPT_RCU
80 PRINT_RCU_DATA(rcu_data, print_one_rcu_data, m); 81 seq_puts(m, "rcu_preempt:\n");
82 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m);
83#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
84 seq_puts(m, "rcu_sched:\n");
85 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m);
81 seq_puts(m, "rcu_bh:\n"); 86 seq_puts(m, "rcu_bh:\n");
82 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m); 87 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m);
83 return 0; 88 return 0;
@@ -102,7 +107,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
102 return; 107 return;
103 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", 108 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d",
104 rdp->cpu, 109 rdp->cpu,
105 cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"", 110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
106 rdp->completed, rdp->gpnum, 111 rdp->completed, rdp->gpnum,
107 rdp->passed_quiesc, rdp->passed_quiesc_completed, 112 rdp->passed_quiesc, rdp->passed_quiesc_completed,
108 rdp->qs_pending); 113 rdp->qs_pending);
@@ -124,8 +129,12 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
124 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); 129 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
125#endif /* #ifdef CONFIG_NO_HZ */ 130#endif /* #ifdef CONFIG_NO_HZ */
126 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); 131 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
127 seq_puts(m, "\"rcu:\"\n"); 132#ifdef CONFIG_TREE_PREEMPT_RCU
128 PRINT_RCU_DATA(rcu_data, print_one_rcu_data_csv, m); 133 seq_puts(m, "\"rcu_preempt:\"\n");
134 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
135#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
136 seq_puts(m, "\"rcu_sched:\"\n");
137 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m);
129 seq_puts(m, "\"rcu_bh:\"\n"); 138 seq_puts(m, "\"rcu_bh:\"\n");
130 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m); 139 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
131 return 0; 140 return 0;
@@ -171,8 +180,12 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
171 180
172static int show_rcuhier(struct seq_file *m, void *unused) 181static int show_rcuhier(struct seq_file *m, void *unused)
173{ 182{
174 seq_puts(m, "rcu:\n"); 183#ifdef CONFIG_TREE_PREEMPT_RCU
175 print_one_rcu_state(m, &rcu_state); 184 seq_puts(m, "rcu_preempt:\n");
185 print_one_rcu_state(m, &rcu_preempt_state);
186#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
187 seq_puts(m, "rcu_sched:\n");
188 print_one_rcu_state(m, &rcu_sched_state);
176 seq_puts(m, "rcu_bh:\n"); 189 seq_puts(m, "rcu_bh:\n");
177 print_one_rcu_state(m, &rcu_bh_state); 190 print_one_rcu_state(m, &rcu_bh_state);
178 return 0; 191 return 0;
@@ -193,8 +206,12 @@ static struct file_operations rcuhier_fops = {
193 206
194static int show_rcugp(struct seq_file *m, void *unused) 207static int show_rcugp(struct seq_file *m, void *unused)
195{ 208{
196 seq_printf(m, "rcu: completed=%ld gpnum=%ld\n", 209#ifdef CONFIG_TREE_PREEMPT_RCU
197 rcu_state.completed, rcu_state.gpnum); 210 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%ld\n",
211 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
212#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
213 seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n",
214 rcu_sched_state.completed, rcu_sched_state.gpnum);
198 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", 215 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n",
199 rcu_bh_state.completed, rcu_bh_state.gpnum); 216 rcu_bh_state.completed, rcu_bh_state.gpnum);
200 return 0; 217 return 0;
@@ -243,8 +260,12 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
243 260
244static int show_rcu_pending(struct seq_file *m, void *unused) 261static int show_rcu_pending(struct seq_file *m, void *unused)
245{ 262{
246 seq_puts(m, "rcu:\n"); 263#ifdef CONFIG_TREE_PREEMPT_RCU
247 print_rcu_pendings(m, &rcu_state); 264 seq_puts(m, "rcu_preempt:\n");
265 print_rcu_pendings(m, &rcu_preempt_state);
266#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
267 seq_puts(m, "rcu_sched:\n");
268 print_rcu_pendings(m, &rcu_sched_state);
248 seq_puts(m, "rcu_bh:\n"); 269 seq_puts(m, "rcu_bh:\n");
249 print_rcu_pendings(m, &rcu_bh_state); 270 print_rcu_pendings(m, &rcu_bh_state);
250 return 0; 271 return 0;
@@ -264,62 +285,47 @@ static struct file_operations rcu_pending_fops = {
264}; 285};
265 286
266static struct dentry *rcudir; 287static struct dentry *rcudir;
267static struct dentry *datadir;
268static struct dentry *datadir_csv;
269static struct dentry *gpdir;
270static struct dentry *hierdir;
271static struct dentry *rcu_pendingdir;
272 288
273static int __init rcuclassic_trace_init(void) 289static int __init rcuclassic_trace_init(void)
274{ 290{
291 struct dentry *retval;
292
275 rcudir = debugfs_create_dir("rcu", NULL); 293 rcudir = debugfs_create_dir("rcu", NULL);
276 if (!rcudir) 294 if (!rcudir)
277 goto out; 295 goto free_out;
278 296
279 datadir = debugfs_create_file("rcudata", 0444, rcudir, 297 retval = debugfs_create_file("rcudata", 0444, rcudir,
280 NULL, &rcudata_fops); 298 NULL, &rcudata_fops);
281 if (!datadir) 299 if (!retval)
282 goto free_out; 300 goto free_out;
283 301
284 datadir_csv = debugfs_create_file("rcudata.csv", 0444, rcudir, 302 retval = debugfs_create_file("rcudata.csv", 0444, rcudir,
285 NULL, &rcudata_csv_fops); 303 NULL, &rcudata_csv_fops);
286 if (!datadir_csv) 304 if (!retval)
287 goto free_out; 305 goto free_out;
288 306
289 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); 307 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
290 if (!gpdir) 308 if (!retval)
291 goto free_out; 309 goto free_out;
292 310
293 hierdir = debugfs_create_file("rcuhier", 0444, rcudir, 311 retval = debugfs_create_file("rcuhier", 0444, rcudir,
294 NULL, &rcuhier_fops); 312 NULL, &rcuhier_fops);
295 if (!hierdir) 313 if (!retval)
296 goto free_out; 314 goto free_out;
297 315
298 rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir, 316 retval = debugfs_create_file("rcu_pending", 0444, rcudir,
299 NULL, &rcu_pending_fops); 317 NULL, &rcu_pending_fops);
300 if (!rcu_pendingdir) 318 if (!retval)
301 goto free_out; 319 goto free_out;
302 return 0; 320 return 0;
303free_out: 321free_out:
304 if (datadir) 322 debugfs_remove_recursive(rcudir);
305 debugfs_remove(datadir);
306 if (datadir_csv)
307 debugfs_remove(datadir_csv);
308 if (gpdir)
309 debugfs_remove(gpdir);
310 debugfs_remove(rcudir);
311out:
312 return 1; 323 return 1;
313} 324}
314 325
315static void __exit rcuclassic_trace_cleanup(void) 326static void __exit rcuclassic_trace_cleanup(void)
316{ 327{
317 debugfs_remove(datadir); 328 debugfs_remove_recursive(rcudir);
318 debugfs_remove(datadir_csv);
319 debugfs_remove(gpdir);
320 debugfs_remove(hierdir);
321 debugfs_remove(rcu_pendingdir);
322 debugfs_remove(rcudir);
323} 329}
324 330
325 331
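
The rcutree_trace.c hunks above do two things: the show_rcuhier/show_rcugp/show_rcu_pending debugfs files now print one stanza per RCU flavor (rcu_preempt when CONFIG_TREE_PREEMPT_RCU is set, then rcu_sched and rcu_bh, replacing the old single "rcu" state), and the init/cleanup paths drop the per-file dentry bookkeeping in favor of a single debugfs_remove_recursive() on the top-level "rcu" directory, which also doubles as the error-unwind path. A minimal sketch of that simplified pattern, using a hypothetical "example" directory, file name and example_fops rather than the kernel's names:

#include <linux/debugfs.h>
#include <linux/module.h>

static const struct file_operations example_fops;	/* stub for the sketch */
static struct dentry *example_dir;

static int __init example_trace_init(void)
{
	example_dir = debugfs_create_dir("example", NULL);
	if (!example_dir)
		goto free_out;
	if (!debugfs_create_file("stats", 0444, example_dir, NULL,
				 &example_fops))
		goto free_out;
	return 0;
free_out:
	/* Removes the directory and any files created so far; the patched
	 * code above relies on this being safe even when the dentry is NULL. */
	debugfs_remove_recursive(example_dir);
	return 1;
}

static void __exit example_trace_cleanup(void)
{
	debugfs_remove_recursive(example_dir);
}
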
diff --git a/kernel/sched.c b/kernel/sched.c
index 1b59e265273b..c9beca67a53e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5325,7 +5325,7 @@ need_resched:
5325 preempt_disable(); 5325 preempt_disable();
5326 cpu = smp_processor_id(); 5326 cpu = smp_processor_id();
5327 rq = cpu_rq(cpu); 5327 rq = cpu_rq(cpu);
5328 rcu_qsctr_inc(cpu); 5328 rcu_sched_qs(cpu);
5329 prev = rq->curr; 5329 prev = rq->curr;
5330 switch_count = &prev->nivcsw; 5330 switch_count = &prev->nivcsw;
5331 5331
@@ -7051,6 +7051,11 @@ fail:
7051 return ret; 7051 return ret;
7052} 7052}
7053 7053
7054#define RCU_MIGRATION_IDLE 0
7055#define RCU_MIGRATION_NEED_QS 1
7056#define RCU_MIGRATION_GOT_QS 2
7057#define RCU_MIGRATION_MUST_SYNC 3
7058
7054/* 7059/*
7055 * migration_thread - this is a highprio system thread that performs 7060 * migration_thread - this is a highprio system thread that performs
7056 * thread migration by bumping thread off CPU then 'pushing' onto 7061 * thread migration by bumping thread off CPU then 'pushing' onto
@@ -7058,6 +7063,7 @@ fail:
7058 */ 7063 */
7059static int migration_thread(void *data) 7064static int migration_thread(void *data)
7060{ 7065{
7066 int badcpu;
7061 int cpu = (long)data; 7067 int cpu = (long)data;
7062 struct rq *rq; 7068 struct rq *rq;
7063 7069
@@ -7092,8 +7098,17 @@ static int migration_thread(void *data)
7092 req = list_entry(head->next, struct migration_req, list); 7098 req = list_entry(head->next, struct migration_req, list);
7093 list_del_init(head->next); 7099 list_del_init(head->next);
7094 7100
7095 spin_unlock(&rq->lock); 7101 if (req->task != NULL) {
7096 __migrate_task(req->task, cpu, req->dest_cpu); 7102 spin_unlock(&rq->lock);
7103 __migrate_task(req->task, cpu, req->dest_cpu);
7104 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
7105 req->dest_cpu = RCU_MIGRATION_GOT_QS;
7106 spin_unlock(&rq->lock);
7107 } else {
7108 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
7109 spin_unlock(&rq->lock);
7110 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
7111 }
7097 local_irq_enable(); 7112 local_irq_enable();
7098 7113
7099 complete(&req->done); 7114 complete(&req->done);
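
In the migration_thread() hunk above, the per-CPU migration kthread doubles as a quiescent-state probe: a queued migration_req whose ->task is NULL is not a task migration but a request posted by synchronize_sched_expedited() (added further down in this patch). Because the bound migration thread can only run after a context switch on its CPU, merely running it constitutes an rcu-sched quiescent state, which it reports by setting ->dest_cpu to RCU_MIGRATION_GOT_QS; if it unexpectedly finds itself on another CPU it sets RCU_MIGRATION_MUST_SYNC so the requester falls back to a full synchronize_sched(). Restated below as an annotated sketch of the same dispatch, as a reading aid only:

	if (req->task != NULL) {
		/* Ordinary request: migrate the named task to req->dest_cpu. */
		spin_unlock(&rq->lock);
		__migrate_task(req->task, cpu, req->dest_cpu);
	} else if (likely(cpu == (badcpu = smp_processor_id()))) {
		/* NULL task: expedited grace-period probe.  Running here at
		 * all means this CPU just context-switched, i.e. passed a
		 * quiescent state, so report success to the waiter. */
		req->dest_cpu = RCU_MIGRATION_GOT_QS;
		spin_unlock(&rq->lock);
	} else {
		/* Probe ran on the wrong CPU: the waiter cannot treat this
		 * as a quiescent state and must do a full synchronize_sched(). */
		req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
		spin_unlock(&rq->lock);
		WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n",
			  badcpu, cpu);
	}
	local_irq_enable();
	complete(&req->done);	/* wake the waiter in synchronize_sched_expedited() */
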
@@ -10581,3 +10596,113 @@ struct cgroup_subsys cpuacct_subsys = {
10581 .subsys_id = cpuacct_subsys_id, 10596 .subsys_id = cpuacct_subsys_id,
10582}; 10597};
10583#endif /* CONFIG_CGROUP_CPUACCT */ 10598#endif /* CONFIG_CGROUP_CPUACCT */
10599
10600#ifndef CONFIG_SMP
10601
10602int rcu_expedited_torture_stats(char *page)
10603{
10604 return 0;
10605}
10606EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10607
10608void synchronize_sched_expedited(void)
10609{
10610}
10611EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10612
10613#else /* #ifndef CONFIG_SMP */
10614
10615static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
10616static DEFINE_MUTEX(rcu_sched_expedited_mutex);
10617
10618#define RCU_EXPEDITED_STATE_POST -2
10619#define RCU_EXPEDITED_STATE_IDLE -1
10620
10621static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10622
10623int rcu_expedited_torture_stats(char *page)
10624{
10625 int cnt = 0;
10626 int cpu;
10627
10628 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
10629 for_each_online_cpu(cpu) {
10630 cnt += sprintf(&page[cnt], " %d:%d",
10631 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
10632 }
10633 cnt += sprintf(&page[cnt], "\n");
10634 return cnt;
10635}
10636EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10637
10638static long synchronize_sched_expedited_count;
10639
10640/*
10641 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
10642 * approach to force grace period to end quickly. This consumes
10643 * significant time on all CPUs, and is thus not recommended for
10644 * any sort of common-case code.
10645 *
10646 * Note that it is illegal to call this function while holding any
10647 * lock that is acquired by a CPU-hotplug notifier. Failing to
10648 * observe this restriction will result in deadlock.
10649 */
10650void synchronize_sched_expedited(void)
10651{
10652 int cpu;
10653 unsigned long flags;
10654 bool need_full_sync = 0;
10655 struct rq *rq;
10656 struct migration_req *req;
10657 long snap;
10658 int trycount = 0;
10659
10660 smp_mb(); /* ensure prior mod happens before capturing snap. */
10661 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
10662 get_online_cpus();
10663 while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
10664 put_online_cpus();
10665 if (trycount++ < 10)
10666 udelay(trycount * num_online_cpus());
10667 else {
10668 synchronize_sched();
10669 return;
10670 }
10671 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
10672 smp_mb(); /* ensure test happens before caller kfree */
10673 return;
10674 }
10675 get_online_cpus();
10676 }
10677 rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
10678 for_each_online_cpu(cpu) {
10679 rq = cpu_rq(cpu);
10680 req = &per_cpu(rcu_migration_req, cpu);
10681 init_completion(&req->done);
10682 req->task = NULL;
10683 req->dest_cpu = RCU_MIGRATION_NEED_QS;
10684 spin_lock_irqsave(&rq->lock, flags);
10685 list_add(&req->list, &rq->migration_queue);
10686 spin_unlock_irqrestore(&rq->lock, flags);
10687 wake_up_process(rq->migration_thread);
10688 }
10689 for_each_online_cpu(cpu) {
10690 rcu_expedited_state = cpu;
10691 req = &per_cpu(rcu_migration_req, cpu);
10692 rq = cpu_rq(cpu);
10693 wait_for_completion(&req->done);
10694 spin_lock_irqsave(&rq->lock, flags);
10695 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
10696 need_full_sync = 1;
10697 req->dest_cpu = RCU_MIGRATION_IDLE;
10698 spin_unlock_irqrestore(&rq->lock, flags);
10699 }
10700 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10701 mutex_unlock(&rcu_sched_expedited_mutex);
10702 put_online_cpus();
10703 if (need_full_sync)
10704 synchronize_sched();
10705}
10706EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10707
10708#endif /* #else #ifndef CONFIG_SMP */
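
The new synchronize_sched_expedited() above forces a grace period by queueing a NULL-task migration_req on every online CPU and waiting for each migration thread to report a quiescent state, falling back to synchronize_sched() on mutex contention or when any CPU reports RCU_MIGRATION_MUST_SYNC. From a caller's point of view it is a drop-in, low-latency but CPU-expensive substitute for synchronize_sched(); per the comment above it must not be called while holding any lock that a CPU-hotplug notifier acquires. A hedged usage sketch with made-up names (example_lock, example_ptr, struct example):

static DEFINE_SPINLOCK(example_lock);
static struct example *example_ptr;

/* Writer replaces a pointer that rcu-sched readers (preempt_disable() or
 * rcu_read_lock_sched() sections) may be traversing, then waits out those
 * readers quickly before freeing the old version. */
void example_update(struct example *new)
{
	struct example *old;

	spin_lock(&example_lock);
	old = example_ptr;			/* update-side access, under the lock */
	rcu_assign_pointer(example_ptr, new);
	spin_unlock(&example_lock);

	synchronize_sched_expedited();		/* fast but CPU-hungry grace period */
	kfree(old);
}
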
diff --git a/kernel/softirq.c b/kernel/softirq.c
index eb5e131a0485..7db25067cd2d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -227,7 +227,7 @@ restart:
227 preempt_count() = prev_count; 227 preempt_count() = prev_count;
228 } 228 }
229 229
230 rcu_bh_qsctr_inc(cpu); 230 rcu_bh_qs(cpu);
231 } 231 }
232 h++; 232 h++;
233 pending >>= 1; 233 pending >>= 1;
@@ -721,7 +721,7 @@ static int ksoftirqd(void * __bind_cpu)
721 preempt_enable_no_resched(); 721 preempt_enable_no_resched();
722 cond_resched(); 722 cond_resched();
723 preempt_disable(); 723 preempt_disable();
724 rcu_qsctr_inc((long)__bind_cpu); 724 rcu_sched_qs((long)__bind_cpu);
725 } 725 }
726 preempt_enable(); 726 preempt_enable();
727 set_current_state(TASK_INTERRUPTIBLE); 727 set_current_state(TASK_INTERRUPTIBLE);
diff --git a/kernel/timer.c b/kernel/timer.c
index a7f07d5a6241..a3d25f415019 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1156,8 +1156,7 @@ void update_process_times(int user_tick)
1156 /* Note: this timer irq context must be accounted for as well. */ 1156 /* Note: this timer irq context must be accounted for as well. */
1157 account_process_tick(p, user_tick); 1157 account_process_tick(p, user_tick);
1158 run_local_timers(); 1158 run_local_timers();
1159 if (rcu_pending(cpu)) 1159 rcu_check_callbacks(cpu, user_tick);
1160 rcu_check_callbacks(cpu, user_tick);
1161 printk_tick(); 1160 printk_tick();
1162 scheduler_tick(); 1161 scheduler_tick();
1163 run_posix_cpu_timers(p); 1162 run_posix_cpu_timers(p);
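
The timer.c hunk drops the rcu_pending() guard from update_process_times(): rcu_check_callbacks() is now invoked on every tick, with the cheap "is anything pending?" test presumably performed inside rcu_check_callbacks() itself in the RCU implementation files changed by this series (not shown in this hunk). In caller terms the change is simply:

	/* Before: caller-side filter. */
	if (rcu_pending(cpu))
		rcu_check_callbacks(cpu, user_tick);

	/* After: unconditional call; the callee decides whether work is needed. */
	rcu_check_callbacks(cpu, user_tick);
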
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 12327b2bb785..82fbc49728df 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -725,7 +725,7 @@ config RCU_TORTURE_TEST_RUNNABLE
725 725
726config RCU_CPU_STALL_DETECTOR 726config RCU_CPU_STALL_DETECTOR
727 bool "Check for stalled CPUs delaying RCU grace periods" 727 bool "Check for stalled CPUs delaying RCU grace periods"
728 depends on CLASSIC_RCU || TREE_RCU 728 depends on TREE_RCU || TREE_PREEMPT_RCU
729 default n 729 default n
730 help 730 help
731 This option causes RCU to printk information on which 731 This option causes RCU to printk information on which