author     Paul E. McKenney <paulmck@linux.vnet.ibm.com>  2009-08-22 16:56:53 -0400
committer  Ingo Molnar <mingo@elte.hu>                    2009-08-23 04:32:40 -0400
commit     6b3ef48adf847f7adf11c870e3ffacac150f1564
tree       e1403ce515bf00ade99ec806f6ab6b6db999aa0b
parent     f41d911f8c49a5d65c86504c19e8204bb605c4fd
rcu: Remove CONFIG_PREEMPT_RCU
Now that CONFIG_TREE_PREEMPT_RCU is in place, there is no further need
for CONFIG_PREEMPT_RCU. Remove it, along with whatever subtle bugs it
may (or may not) contain.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josht@linux.vnet.ibm.com
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
LKML-Reference: <125097461396-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
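
With CONFIG_PREEMPT_RCU gone, the remaining RCU implementations are
CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU, so the implementation
selection in include/linux/rcupdate.h reduces to the form below
(reproduced from the hunk later in this patch, for reference; the
inline comment is added here and is not in the file):

#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
#include <linux/rcutree.h>	/* both tree flavors are built from rcutree.c */
#else
#error "Unknown RCU implementation specified to kernel configuration"
#endif
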
-rw-r--r--  Documentation/RCU/rcu.txt         |   10
-rw-r--r--  Documentation/RCU/whatisRCU.txt   |    8
-rw-r--r--  include/linux/init_task.h         |    6
-rw-r--r--  include/linux/rcupdate.h          |    4
-rw-r--r--  include/linux/rcupreempt.h        |  140
-rw-r--r--  include/linux/rcupreempt_trace.h  |   97
-rw-r--r--  include/linux/sched.h             |   13
-rw-r--r--  init/Kconfig                      |   20
-rw-r--r--  kernel/Makefile                   |    2
-rw-r--r--  kernel/rcupreempt.c               | 1518
-rw-r--r--  kernel/rcupreempt_trace.c         |  335
-rw-r--r--  lib/Kconfig.debug                 |    2
12 files changed, 13 insertions(+), 2142 deletions(-)
diff --git a/Documentation/RCU/rcu.txt b/Documentation/RCU/rcu.txt
index 7aa2002ade77..2a23523ce471 100644
--- a/Documentation/RCU/rcu.txt
+++ b/Documentation/RCU/rcu.txt
@@ -36,7 +36,7 @@ o How can the updater tell when a grace period has completed
 	executed in user mode, or executed in the idle loop, we can
 	safely free up that item.
 
-	Preemptible variants of RCU (CONFIG_PREEMPT_RCU) get the
+	Preemptible variants of RCU (CONFIG_TREE_PREEMPT_RCU) get the
 	same effect, but require that the readers manipulate CPU-local
 	counters. These counters allow limited types of blocking
 	within RCU read-side critical sections. SRCU also uses
@@ -79,10 +79,10 @@ o I hear that RCU is patented? What is with that?
 o	I hear that RCU needs work in order to support realtime kernels?
 
 	This work is largely completed. Realtime-friendly RCU can be
-	enabled via the CONFIG_PREEMPT_RCU kernel configuration parameter.
-	However, work is in progress for enabling priority boosting of
-	preempted RCU read-side critical sections. This is needed if you
-	have CPU-bound realtime threads.
+	enabled via the CONFIG_TREE_PREEMPT_RCU kernel configuration
+	parameter. However, work is in progress for enabling priority
+	boosting of preempted RCU read-side critical sections. This is
+	needed if you have CPU-bound realtime threads.
 
 o	Where can I find more information on RCU?
 
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 97ded2432c59..e41a7fecf0d3 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -136,10 +136,10 @@ rcu_read_lock()
 	Used by a reader to inform the reclaimer that the reader is
 	entering an RCU read-side critical section. It is illegal
 	to block while in an RCU read-side critical section, though
-	kernels built with CONFIG_PREEMPT_RCU can preempt RCU read-side
-	critical sections. Any RCU-protected data structure accessed
-	during an RCU read-side critical section is guaranteed to remain
-	unreclaimed for the full duration of that critical section.
+	kernels built with CONFIG_TREE_PREEMPT_RCU can preempt RCU
+	read-side critical sections. Any RCU-protected data structure
+	accessed during an RCU read-side critical section is guaranteed to
+	remain unreclaimed for the full duration of that critical section.
 	Reference counts may be used in conjunction with RCU to maintain
 	longer-term references to data structures.
 
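
As a point of reference (not part of this patch), a reader following the
whatisRCU.txt rules above looks the same under either remaining RCU
implementation; only the machinery behind rcu_read_lock() differs. The
demo_node structure and the demo_ptr/demo_read_value() names below are
invented for this sketch:

#include <linux/rcupdate.h>

struct demo_node {
	int value;
};

/* Published by an updater elsewhere via rcu_assign_pointer(). */
static struct demo_node *demo_ptr;

static int demo_read_value(void)
{
	struct demo_node *p;
	int val = -1;

	rcu_read_lock();			/* enter read-side critical section */
	p = rcu_dereference(demo_ptr);		/* fetch the RCU-protected pointer */
	if (p != NULL)
		val = p->value;			/* p cannot be reclaimed before unlock */
	rcu_read_unlock();			/* exit read-side critical section */
	return val;
}
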
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 971a968831bf..79d4baee31b6 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -94,11 +94,7 @@ extern struct group_info init_groups;
 # define CAP_INIT_BSET CAP_INIT_EFF_SET
 #endif
 
-#ifdef CONFIG_PREEMPT_RCU
-#define INIT_TASK_RCU_PREEMPT(tsk) \
-	.rcu_read_lock_nesting = 0, \
-	.rcu_flipctr_idx = 0,
-#elif defined(CONFIG_TREE_PREEMPT_RCU)
+#ifdef CONFIG_TREE_PREEMPT_RCU
 #define INIT_TASK_RCU_PREEMPT(tsk) \
 	.rcu_read_lock_nesting = 0, \
 	.rcu_read_unlock_special = 0, \
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 26892f5e7bd8..ec90fc34fea9 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -68,11 +68,9 @@ extern int rcu_scheduler_active;
 
 #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
 #include <linux/rcutree.h>
-#elif defined(CONFIG_PREEMPT_RCU)
-#include <linux/rcupreempt.h>
 #else
 #error "Unknown RCU implementation specified to kernel configuration"
-#endif /* #else #if defined(CONFIG_CLASSIC_RCU) */
+#endif
 
 #define RCU_HEAD_INIT { .next = NULL, .func = NULL }
 #define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT
diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h
deleted file mode 100644
index a42ab88e9210..000000000000
--- a/include/linux/rcupreempt.h
+++ /dev/null
@@ -1,140 +0,0 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (RT implementation)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2006
19 *
20 * Author: Paul McKenney <paulmck@us.ibm.com>
21 *
22 * Based on the original work by Paul McKenney <paul.mckenney@us.ibm.com>
23 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
24 * Papers:
25 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
26 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
27 *
28 * For detailed explanation of Read-Copy Update mechanism see -
29 * Documentation/RCU
30 *
31 */
32
33#ifndef __LINUX_RCUPREEMPT_H
34#define __LINUX_RCUPREEMPT_H
35
36#include <linux/cache.h>
37#include <linux/spinlock.h>
38#include <linux/threads.h>
39#include <linux/smp.h>
40#include <linux/cpumask.h>
41#include <linux/seqlock.h>
42
43extern void rcu_sched_qs(int cpu);
44static inline void rcu_bh_qs(int cpu) { }
45
46/*
47 * Someone might want to pass call_rcu_bh as a function pointer.
48 * So this needs to just be a rename and not a macro function.
49 * (no parentheses)
50 */
51#define call_rcu_bh call_rcu
52
53/**
54 * call_rcu_sched - Queue RCU callback for invocation after sched grace period.
55 * @head: structure to be used for queueing the RCU updates.
56 * @func: actual update function to be invoked after the grace period
57 *
58 * The update function will be invoked some time after a full
59 * synchronize_sched()-style grace period elapses, in other words after
60 * all currently executing preempt-disabled sections of code (including
61 * hardirq handlers, NMI handlers, and local_irq_save() blocks) have
62 * completed.
63 */
64extern void call_rcu_sched(struct rcu_head *head,
65 void (*func)(struct rcu_head *head));
66
67extern void __rcu_read_lock(void);
68extern void __rcu_read_unlock(void);
69extern int rcu_needs_cpu(int cpu);
70
71#define __rcu_read_lock_bh() { rcu_read_lock(); local_bh_disable(); }
72#define __rcu_read_unlock_bh() { local_bh_enable(); rcu_read_unlock(); }
73
74extern void __synchronize_sched(void);
75
76static inline void synchronize_rcu_expedited(void)
77{
78 synchronize_rcu(); /* Placeholder for new rcupreempt implementation. */
79}
80
81static inline void synchronize_rcu_bh_expedited(void)
82{
83 synchronize_rcu_bh(); /* Placeholder for new rcupreempt impl. */
84}
85
86extern void __rcu_init(void);
87extern void rcu_init_sched(void);
88extern void rcu_check_callbacks(int cpu, int user);
89extern void rcu_restart_cpu(int cpu);
90extern long rcu_batches_completed(void);
91
92/*
93 * Return the number of RCU batches processed thus far. Useful for debug
94 * and statistics. The _bh variant is identical to straight RCU
95 */
96static inline long rcu_batches_completed_bh(void)
97{
98 return rcu_batches_completed();
99}
100
101static inline void exit_rcu(void)
102{
103}
104
105#ifdef CONFIG_RCU_TRACE
106struct rcupreempt_trace;
107extern long *rcupreempt_flipctr(int cpu);
108extern long rcupreempt_data_completed(void);
109extern int rcupreempt_flip_flag(int cpu);
110extern int rcupreempt_mb_flag(int cpu);
111extern char *rcupreempt_try_flip_state_name(void);
112extern struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu);
113#endif
114
115struct softirq_action;
116
117#ifdef CONFIG_NO_HZ
118extern void rcu_enter_nohz(void);
119extern void rcu_exit_nohz(void);
120#else
121# define rcu_enter_nohz() do { } while (0)
122# define rcu_exit_nohz() do { } while (0)
123#endif
124
125/*
126 * A context switch is a grace period for rcupreempt synchronize_rcu()
127 * only during early boot, before the scheduler has been initialized.
128 * So, how the heck do we get a context switch? Well, if the caller
129 * invokes synchronize_rcu(), they are willing to accept a context
130 * switch, so we simply pretend that one happened.
131 *
132 * After boot, there might be a blocked or preempted task in an RCU
133 * read-side critical section, so we cannot then take the fastpath.
134 */
135static inline int rcu_blocking_is_gp(void)
136{
137 return num_online_cpus() == 1 && !rcu_scheduler_active;
138}
139
140#endif /* __LINUX_RCUPREEMPT_H */
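
To illustrate the call_rcu_sched() kernel-doc above (again not part of
the patch; demo_elem, demo_free_rcu() and demo_retire() are invented
names), a caller typically unlinks an element from all reader-visible
structures and then defers the actual kfree() until a sched grace
period has elapsed:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_elem {
	struct rcu_head rcu;
	int data;
};

static void demo_free_rcu(struct rcu_head *head)
{
	struct demo_elem *e = container_of(head, struct demo_elem, rcu);

	kfree(e);	/* all preempt-disabled regions that could see 'e' have finished */
}

static void demo_retire(struct demo_elem *e)
{
	/* 'e' must already be unreachable by new readers at this point. */
	call_rcu_sched(&e->rcu, demo_free_rcu);
}
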
diff --git a/include/linux/rcupreempt_trace.h b/include/linux/rcupreempt_trace.h
deleted file mode 100644
index b99ae073192a..000000000000
--- a/include/linux/rcupreempt_trace.h
+++ /dev/null
@@ -1,97 +0,0 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (RT implementation)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2006
19 *
20 * Author: Paul McKenney <paulmck@us.ibm.com>
21 *
22 * Based on the original work by Paul McKenney <paul.mckenney@us.ibm.com>
23 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
24 * Papers:
25 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
26 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
27 *
28 * For detailed explanation of the Preemptible Read-Copy Update mechanism see -
29 * http://lwn.net/Articles/253651/
30 */
31
32#ifndef __LINUX_RCUPREEMPT_TRACE_H
33#define __LINUX_RCUPREEMPT_TRACE_H
34
35#include <linux/types.h>
36#include <linux/kernel.h>
37
38#include <asm/atomic.h>
39
40/*
41 * PREEMPT_RCU data structures.
42 */
43
44struct rcupreempt_trace {
45 long next_length;
46 long next_add;
47 long wait_length;
48 long wait_add;
49 long done_length;
50 long done_add;
51 long done_remove;
52 atomic_t done_invoked;
53 long rcu_check_callbacks;
54 atomic_t rcu_try_flip_1;
55 atomic_t rcu_try_flip_e1;
56 long rcu_try_flip_i1;
57 long rcu_try_flip_ie1;
58 long rcu_try_flip_g1;
59 long rcu_try_flip_a1;
60 long rcu_try_flip_ae1;
61 long rcu_try_flip_a2;
62 long rcu_try_flip_z1;
63 long rcu_try_flip_ze1;
64 long rcu_try_flip_z2;
65 long rcu_try_flip_m1;
66 long rcu_try_flip_me1;
67 long rcu_try_flip_m2;
68};
69
70#ifdef CONFIG_RCU_TRACE
71#define RCU_TRACE(fn, arg) fn(arg);
72#else
73#define RCU_TRACE(fn, arg)
74#endif
75
76extern void rcupreempt_trace_move2done(struct rcupreempt_trace *trace);
77extern void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace);
78extern void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace);
79extern void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace);
80extern void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace);
81extern void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace);
82extern void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace);
83extern void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace);
84extern void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace);
85extern void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace);
86extern void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace);
87extern void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace);
88extern void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace);
89extern void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace);
90extern void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace);
91extern void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace);
92extern void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace);
93extern void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace);
94extern void rcupreempt_trace_invoke(struct rcupreempt_trace *trace);
95extern void rcupreempt_trace_next_add(struct rcupreempt_trace *trace);
96
97#endif /* __LINUX_RCUPREEMPT_TRACE_H */
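
For clarity (illustration only, not part of the patch): a tracing hook
is wrapped in RCU_TRACE() so that it expands to a call to the matching
rcupreempt_trace_*() function when CONFIG_RCU_TRACE=y and disappears
entirely otherwise; kernel/rcupreempt.c below layers RCU_TRACE_ME(),
RCU_TRACE_CPU() and RCU_TRACE_RDP() on top of this. The
demo_note_next_add() wrapper is invented, and the sketch assumes a
pre-patch tree in which the header above still exists:

#include <linux/rcupreempt_trace.h>	/* the header being removed above */

static inline void demo_note_next_add(struct rcupreempt_trace *trace)
{
	/* Expands to rcupreempt_trace_next_add(trace); or to nothing. */
	RCU_TRACE(rcupreempt_trace_next_add, trace);
}
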
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d7f98f637a2a..bfca26d63b13 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1205,11 +1205,6 @@ struct task_struct {
 	unsigned int policy;
 	cpumask_t cpus_allowed;
 
-#ifdef CONFIG_PREEMPT_RCU
-	int rcu_read_lock_nesting;
-	int rcu_flipctr_idx;
-#endif /* #ifdef CONFIG_PREEMPT_RCU */
-
 #ifdef CONFIG_TREE_PREEMPT_RCU
 	int rcu_read_lock_nesting;
 	char rcu_read_unlock_special;
@@ -1744,14 +1739,6 @@ static inline void rcu_copy_process(struct task_struct *p)
 	INIT_LIST_HEAD(&p->rcu_node_entry);
 }
 
-#elif defined(CONFIG_PREEMPT_RCU)
-
-static inline void rcu_copy_process(struct task_struct *p)
-{
-	p->rcu_read_lock_nesting = 0;
-	p->rcu_flipctr_idx = 0;
-}
-
 #else
 
 static inline void rcu_copy_process(struct task_struct *p)
diff --git a/init/Kconfig b/init/Kconfig
index f88da2d1c1fb..8e8b76d8a272 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -324,17 +324,6 @@ config TREE_RCU
 	  thousands of CPUs. It also scales down nicely to
 	  smaller systems.
 
-config PREEMPT_RCU
-	bool "Preemptible RCU"
-	depends on PREEMPT
-	help
-	  This option reduces the latency of the kernel by making certain
-	  RCU sections preemptible. Normally RCU code is non-preemptible, if
-	  this option is selected then read-only RCU sections become
-	  preemptible. This helps latency, but may expose bugs due to
-	  now-naive assumptions about each RCU read-side critical section
-	  remaining on a given CPU through its execution.
-
 config TREE_PREEMPT_RCU
 	bool "Preemptable tree-based hierarchical RCU"
 	depends on PREEMPT
@@ -348,7 +337,7 @@ endchoice
 
 config RCU_TRACE
 	bool "Enable tracing for RCU"
-	depends on TREE_RCU || PREEMPT_RCU || TREE_PREEMPT_RCU
+	depends on TREE_RCU || TREE_PREEMPT_RCU
 	help
 	  This option provides tracing in RCU which presents stats
 	  in debugfs for debugging RCU implementation.
@@ -395,13 +384,6 @@ config TREE_RCU_TRACE
 	  TREE_PREEMPT_RCU implementations, permitting Makefile to
 	  trivially select kernel/rcutree_trace.c.
 
-config PREEMPT_RCU_TRACE
-	def_bool RCU_TRACE && PREEMPT_RCU
-	select DEBUG_FS
-	help
-	  This option provides tracing for the PREEMPT_RCU implementation,
-	  permitting Makefile to trivially select kernel/rcupreempt_trace.c.
-
 endmenu # "RCU Subsystem"
 
 config IKCONFIG
diff --git a/kernel/Makefile b/kernel/Makefile
index 1a38b4789dda..b833bd5cc127 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -82,9 +82,7 @@ obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_TREE_RCU) += rcutree.o
 obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
-obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
 obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
-obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
deleted file mode 100644
index 0053ce56e326..000000000000
--- a/kernel/rcupreempt.c
+++ /dev/null
@@ -1,1518 +0,0 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion, realtime implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2006
19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
22 * for pushing me away from locks and towards counters, and
23 * to Suparna Bhattacharya for pushing me completely away
24 * from atomic instructions on the read side.
25 *
26 * - Added handling of Dynamic Ticks
27 * Copyright 2007 - Paul E. Mckenney <paulmck@us.ibm.com>
28 * - Steven Rostedt <srostedt@redhat.com>
29 *
30 * Papers: http://www.rdrop.com/users/paulmck/RCU
31 *
32 * Design Document: http://lwn.net/Articles/253651/
33 *
34 * For detailed explanation of Read-Copy Update mechanism see -
35 * Documentation/RCU/ *.txt
36 *
37 */
38#include <linux/types.h>
39#include <linux/kernel.h>
40#include <linux/init.h>
41#include <linux/spinlock.h>
42#include <linux/smp.h>
43#include <linux/rcupdate.h>
44#include <linux/interrupt.h>
45#include <linux/sched.h>
46#include <asm/atomic.h>
47#include <linux/bitops.h>
48#include <linux/module.h>
49#include <linux/kthread.h>
50#include <linux/completion.h>
51#include <linux/moduleparam.h>
52#include <linux/percpu.h>
53#include <linux/notifier.h>
54#include <linux/cpu.h>
55#include <linux/random.h>
56#include <linux/delay.h>
57#include <linux/cpumask.h>
58#include <linux/rcupreempt_trace.h>
59#include <asm/byteorder.h>
60
61/*
62 * PREEMPT_RCU data structures.
63 */
64
65/*
66 * GP_STAGES specifies the number of times the state machine has
67 * to go through all the rcu_try_flip_states (see below)
68 * in a single Grace Period.
69 *
70 * GP in GP_STAGES stands for Grace Period ;)
71 */
72#define GP_STAGES 2
73struct rcu_data {
74 spinlock_t lock; /* Protect rcu_data fields. */
75 long completed; /* Number of last completed batch. */
76 int waitlistcount;
77 struct rcu_head *nextlist;
78 struct rcu_head **nexttail;
79 struct rcu_head *waitlist[GP_STAGES];
80 struct rcu_head **waittail[GP_STAGES];
81 struct rcu_head *donelist; /* from waitlist & waitschedlist */
82 struct rcu_head **donetail;
83 long rcu_flipctr[2];
84 struct rcu_head *nextschedlist;
85 struct rcu_head **nextschedtail;
86 struct rcu_head *waitschedlist;
87 struct rcu_head **waitschedtail;
88 int rcu_sched_sleeping;
89#ifdef CONFIG_RCU_TRACE
90 struct rcupreempt_trace trace;
91#endif /* #ifdef CONFIG_RCU_TRACE */
92};
93
94/*
95 * States for rcu_try_flip() and friends.
96 */
97
98enum rcu_try_flip_states {
99
100 /*
101 * Stay here if nothing is happening. Flip the counter if something
102 * starts happening. Denoted by "I"
103 */
104 rcu_try_flip_idle_state,
105
106 /*
107 * Wait here for all CPUs to notice that the counter has flipped. This
108 * prevents the old set of counters from ever being incremented once
109 * we leave this state, which in turn is necessary because we cannot
110 * test any individual counter for zero -- we can only check the sum.
111 * Denoted by "A".
112 */
113 rcu_try_flip_waitack_state,
114
115 /*
116 * Wait here for the sum of the old per-CPU counters to reach zero.
117 * Denoted by "Z".
118 */
119 rcu_try_flip_waitzero_state,
120
121 /*
122 * Wait here for each of the other CPUs to execute a memory barrier.
123 * This is necessary to ensure that these other CPUs really have
124 * completed executing their RCU read-side critical sections, despite
125 * their CPUs wildly reordering memory. Denoted by "M".
126 */
127 rcu_try_flip_waitmb_state,
128};
129
130/*
131 * States for rcu_ctrlblk.rcu_sched_sleep.
132 */
133
134enum rcu_sched_sleep_states {
135 rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP. */
136 rcu_sched_sleep_prep, /* Thinking of sleeping, rechecking. */
137 rcu_sched_sleeping, /* Sleeping, awaken if GP needed. */
138};
139
140struct rcu_ctrlblk {
141 spinlock_t fliplock; /* Protect state-machine transitions. */
142 long completed; /* Number of last completed batch. */
143 enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
144 the rcu state machine */
145 spinlock_t schedlock; /* Protect rcu_sched sleep state. */
146 enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */
147 wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */
148};
149
150struct rcu_dyntick_sched {
151 int dynticks;
152 int dynticks_snap;
153 int sched_qs;
154 int sched_qs_snap;
155 int sched_dynticks_snap;
156};
157
158static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
159 .dynticks = 1,
160};
161
162static int rcu_pending(int cpu);
163
164void rcu_sched_qs(int cpu)
165{
166 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
167
168 rdssp->sched_qs++;
169}
170
171#ifdef CONFIG_NO_HZ
172
173void rcu_enter_nohz(void)
174{
175 static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
176
177 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
178 __get_cpu_var(rcu_dyntick_sched).dynticks++;
179 WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs);
180}
181
182void rcu_exit_nohz(void)
183{
184 static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
185
186 __get_cpu_var(rcu_dyntick_sched).dynticks++;
187 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
188 WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1),
189 &rs);
190}
191
192#endif /* CONFIG_NO_HZ */
193
194
195static DEFINE_PER_CPU(struct rcu_data, rcu_data);
196
197static struct rcu_ctrlblk rcu_ctrlblk = {
198 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
199 .completed = 0,
200 .rcu_try_flip_state = rcu_try_flip_idle_state,
201 .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock),
202 .sched_sleep = rcu_sched_not_sleeping,
203 .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq),
204};
205
206static struct task_struct *rcu_sched_grace_period_task;
207
208#ifdef CONFIG_RCU_TRACE
209static char *rcu_try_flip_state_names[] =
210 { "idle", "waitack", "waitzero", "waitmb" };
211#endif /* #ifdef CONFIG_RCU_TRACE */
212
213static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly
214 = CPU_BITS_NONE;
215
216/*
217 * Enum and per-CPU flag to determine when each CPU has seen
218 * the most recent counter flip.
219 */
220
221enum rcu_flip_flag_values {
222 rcu_flip_seen, /* Steady/initial state, last flip seen. */
223 /* Only GP detector can update. */
224 rcu_flipped /* Flip just completed, need confirmation. */
225 /* Only corresponding CPU can update. */
226};
227static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
228 = rcu_flip_seen;
229
230/*
231 * Enum and per-CPU flag to determine when each CPU has executed the
232 * needed memory barrier to fence in memory references from its last RCU
233 * read-side critical section in the just-completed grace period.
234 */
235
236enum rcu_mb_flag_values {
237 rcu_mb_done, /* Steady/initial state, no mb()s required. */
238 /* Only GP detector can update. */
239 rcu_mb_needed /* Flip just completed, need an mb(). */
240 /* Only corresponding CPU can update. */
241};
242static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
243 = rcu_mb_done;
244
245/*
246 * RCU_DATA_ME: find the current CPU's rcu_data structure.
247 * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
248 */
249#define RCU_DATA_ME() (&__get_cpu_var(rcu_data))
250#define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu))
251
252/*
253 * Helper macro for tracing when the appropriate rcu_data is not
254 * cached in a local variable, but where the CPU number is so cached.
255 */
256#define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace));
257
258/*
259 * Helper macro for tracing when the appropriate rcu_data is not
260 * cached in a local variable.
261 */
262#define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace));
263
264/*
265 * Helper macro for tracing when the appropriate rcu_data is pointed
266 * to by a local variable.
267 */
268#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
269
270#define RCU_SCHED_BATCH_TIME (HZ / 50)
271
272/*
273 * Return the number of RCU batches processed thus far. Useful
274 * for debug and statistics.
275 */
276long rcu_batches_completed(void)
277{
278 return rcu_ctrlblk.completed;
279}
280EXPORT_SYMBOL_GPL(rcu_batches_completed);
281
282void __rcu_read_lock(void)
283{
284 int idx;
285 struct task_struct *t = current;
286 int nesting;
287
288 nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
289 if (nesting != 0) {
290
291 /* An earlier rcu_read_lock() covers us, just count it. */
292
293 t->rcu_read_lock_nesting = nesting + 1;
294
295 } else {
296 unsigned long flags;
297
298 /*
299 * We disable interrupts for the following reasons:
300 * - If we get scheduling clock interrupt here, and we
301 * end up acking the counter flip, it's like a promise
302 * that we will never increment the old counter again.
303 * Thus we will break that promise if that
304 * scheduling clock interrupt happens between the time
305 * we pick the .completed field and the time that we
306 * increment our counter.
307 *
308 * - We don't want to be preempted out here.
309 *
310 * NMIs can still occur, of course, and might themselves
311 * contain rcu_read_lock().
312 */
313
314 local_irq_save(flags);
315
316 /*
317 * Outermost nesting of rcu_read_lock(), so increment
318 * the current counter for the current CPU. Use volatile
319 * casts to prevent the compiler from reordering.
320 */
321
322 idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
323 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
324
325 /*
326 * Now that the per-CPU counter has been incremented, we
327 * are protected from races with rcu_read_lock() invoked
328 * from NMI handlers on this CPU. We can therefore safely
329 * increment the nesting counter, relieving further NMIs
330 * of the need to increment the per-CPU counter.
331 */
332
333 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
334
335 /*
336 * Now that we have prevented any NMIs from storing
337 * to the ->rcu_flipctr_idx, we can safely use it to
338 * remember which counter to decrement in the matching
339 * rcu_read_unlock().
340 */
341
342 ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
343 local_irq_restore(flags);
344 }
345}
346EXPORT_SYMBOL_GPL(__rcu_read_lock);
347
348void __rcu_read_unlock(void)
349{
350 int idx;
351 struct task_struct *t = current;
352 int nesting;
353
354 nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
355 if (nesting > 1) {
356
357 /*
358 * We are still protected by the enclosing rcu_read_lock(),
359 * so simply decrement the counter.
360 */
361
362 t->rcu_read_lock_nesting = nesting - 1;
363
364 } else {
365 unsigned long flags;
366
367 /*
368 * Disable local interrupts to prevent the grace-period
369 * detection state machine from seeing us half-done.
370 * NMIs can still occur, of course, and might themselves
371 * contain rcu_read_lock() and rcu_read_unlock().
372 */
373
374 local_irq_save(flags);
375
376 /*
377 * Outermost nesting of rcu_read_unlock(), so we must
378 * decrement the current counter for the current CPU.
379 * This must be done carefully, because NMIs can
380 * occur at any point in this code, and any rcu_read_lock()
381 * and rcu_read_unlock() pairs in the NMI handlers
382 * must interact non-destructively with this code.
383 * Lots of volatile casts, and -very- careful ordering.
384 *
385 * Changes to this code, including this one, must be
386 * inspected, validated, and tested extremely carefully!!!
387 */
388
389 /*
390 * First, pick up the index.
391 */
392
393 idx = ACCESS_ONCE(t->rcu_flipctr_idx);
394
395 /*
396 * Now that we have fetched the counter index, it is
397 * safe to decrement the per-task RCU nesting counter.
398 * After this, any interrupts or NMIs will increment and
399 * decrement the per-CPU counters.
400 */
401 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
402
403 /*
404 * It is now safe to decrement this task's nesting count.
405 * NMIs that occur after this statement will route their
406 * rcu_read_lock() calls through this "else" clause, and
407 * will thus start incrementing the per-CPU counter on
408 * their own. They will also clobber ->rcu_flipctr_idx,
409 * but that is OK, since we have already fetched it.
410 */
411
412 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
413 local_irq_restore(flags);
414 }
415}
416EXPORT_SYMBOL_GPL(__rcu_read_unlock);
417
418/*
419 * If a global counter flip has occurred since the last time that we
420 * advanced callbacks, advance them. Hardware interrupts must be
421 * disabled when calling this function.
422 */
423static void __rcu_advance_callbacks(struct rcu_data *rdp)
424{
425 int cpu;
426 int i;
427 int wlc = 0;
428
429 if (rdp->completed != rcu_ctrlblk.completed) {
430 if (rdp->waitlist[GP_STAGES - 1] != NULL) {
431 *rdp->donetail = rdp->waitlist[GP_STAGES - 1];
432 rdp->donetail = rdp->waittail[GP_STAGES - 1];
433 RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
434 }
435 for (i = GP_STAGES - 2; i >= 0; i--) {
436 if (rdp->waitlist[i] != NULL) {
437 rdp->waitlist[i + 1] = rdp->waitlist[i];
438 rdp->waittail[i + 1] = rdp->waittail[i];
439 wlc++;
440 } else {
441 rdp->waitlist[i + 1] = NULL;
442 rdp->waittail[i + 1] =
443 &rdp->waitlist[i + 1];
444 }
445 }
446 if (rdp->nextlist != NULL) {
447 rdp->waitlist[0] = rdp->nextlist;
448 rdp->waittail[0] = rdp->nexttail;
449 wlc++;
450 rdp->nextlist = NULL;
451 rdp->nexttail = &rdp->nextlist;
452 RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
453 } else {
454 rdp->waitlist[0] = NULL;
455 rdp->waittail[0] = &rdp->waitlist[0];
456 }
457 rdp->waitlistcount = wlc;
458 rdp->completed = rcu_ctrlblk.completed;
459 }
460
461 /*
462 * Check to see if this CPU needs to report that it has seen
463 * the most recent counter flip, thereby declaring that all
464 * subsequent rcu_read_lock() invocations will respect this flip.
465 */
466
467 cpu = raw_smp_processor_id();
468 if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
469 smp_mb(); /* Subsequent counter accesses must see new value */
470 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
471 smp_mb(); /* Subsequent RCU read-side critical sections */
472 /* seen -after- acknowledgement. */
473 }
474}
475
476#ifdef CONFIG_NO_HZ
477static DEFINE_PER_CPU(int, rcu_update_flag);
478
479/**
480 * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
481 *
482 * If the CPU was idle with dynamic ticks active, this updates the
483 * rcu_dyntick_sched.dynticks to let the RCU handling know that the
484 * CPU is active.
485 */
486void rcu_irq_enter(void)
487{
488 int cpu = smp_processor_id();
489 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
490
491 if (per_cpu(rcu_update_flag, cpu))
492 per_cpu(rcu_update_flag, cpu)++;
493
494 /*
495 * Only update if we are coming from a stopped ticks mode
496 * (rcu_dyntick_sched.dynticks is even).
497 */
498 if (!in_interrupt() &&
499 (rdssp->dynticks & 0x1) == 0) {
500 /*
501 * The following might seem like we could have a race
502 * with NMI/SMIs. But this really isn't a problem.
503 * Here we do a read/modify/write, and the race happens
504 * when an NMI/SMI comes in after the read and before
505 * the write. But NMI/SMIs will increment this counter
506 * twice before returning, so the zero bit will not
507 * be corrupted by the NMI/SMI which is the most important
508 * part.
509 *
510 * The only thing is that we would bring back the counter
511 * to a position that it was in during the NMI/SMI.
512 * But the zero bit would be set, so the rest of the
513 * counter would again be ignored.
514 *
515 * On return from the IRQ, the counter may have the zero
516 * bit be 0 and the counter the same as the return from
517 * the NMI/SMI. If the state machine was so unlucky to
518 * see that, it still doesn't matter, since all
519 * RCU read-side critical sections on this CPU would
520 * have already completed.
521 */
522 rdssp->dynticks++;
523 /*
524 * The following memory barrier ensures that any
525 * rcu_read_lock() primitives in the irq handler
526 * are seen by other CPUs to follow the above
527 * increment to rcu_dyntick_sched.dynticks. This is
528 * required in order for other CPUs to correctly
529 * determine when it is safe to advance the RCU
530 * grace-period state machine.
531 */
532 smp_mb(); /* see above block comment. */
533 /*
534 * Since we can't determine the dynamic tick mode from
535 * the rcu_dyntick_sched.dynticks after this routine,
536 * we use a second flag to acknowledge that we came
537 * from an idle state with ticks stopped.
538 */
539 per_cpu(rcu_update_flag, cpu)++;
540 /*
541 * If we take an NMI/SMI now, they will also increment
542 * the rcu_update_flag, and will not update the
543 * rcu_dyntick_sched.dynticks on exit. That is for
544 * this IRQ to do.
545 */
546 }
547}
548
549/**
550 * rcu_irq_exit - Called from exiting Hard irq context.
551 *
552 * If the CPU was idle with dynamic ticks active, update the
553 * rcu_dyntick_sched.dynticks to let the RCU handling be
554 * aware that the CPU is going back to idle with no ticks.
555 */
556void rcu_irq_exit(void)
557{
558 int cpu = smp_processor_id();
559 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
560
561 /*
562 * rcu_update_flag is set if we interrupted the CPU
563 * when it was idle with ticks stopped.
564 * Once this occurs, we keep track of interrupt nesting
565 * because a NMI/SMI could also come in, and we still
566 * only want the IRQ that started the increment of the
567 * rcu_dyntick_sched.dynticks to be the one that modifies
568 * it on exit.
569 */
570 if (per_cpu(rcu_update_flag, cpu)) {
571 if (--per_cpu(rcu_update_flag, cpu))
572 return;
573
574 /* This must match the interrupt nesting */
575 WARN_ON(in_interrupt());
576
577 /*
578 * If an NMI/SMI happens now we are still
579 * protected by the rcu_dyntick_sched.dynticks being odd.
580 */
581
582 /*
583 * The following memory barrier ensures that any
584 * rcu_read_unlock() primitives in the irq handler
585 * are seen by other CPUs to precede the following
586 * increment to rcu_dyntick_sched.dynticks. This
587 * is required in order for other CPUs to determine
588 * when it is safe to advance the RCU grace-period
589 * state machine.
590 */
591 smp_mb(); /* see above block comment. */
592 rdssp->dynticks++;
593 WARN_ON(rdssp->dynticks & 0x1);
594 }
595}
596
597void rcu_nmi_enter(void)
598{
599 rcu_irq_enter();
600}
601
602void rcu_nmi_exit(void)
603{
604 rcu_irq_exit();
605}
606
607static void dyntick_save_progress_counter(int cpu)
608{
609 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
610
611 rdssp->dynticks_snap = rdssp->dynticks;
612}
613
614static inline int
615rcu_try_flip_waitack_needed(int cpu)
616{
617 long curr;
618 long snap;
619 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
620
621 curr = rdssp->dynticks;
622 snap = rdssp->dynticks_snap;
623 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
624
625 /*
626 * If the CPU remained in dynticks mode for the entire time
627 * and didn't take any interrupts, NMIs, SMIs, or whatever,
628 * then it cannot be in the middle of an rcu_read_lock(), so
629 * the next rcu_read_lock() it executes must use the new value
630 * of the counter. So we can safely pretend that this CPU
631 * already acknowledged the counter.
632 */
633
634 if ((curr == snap) && ((curr & 0x1) == 0))
635 return 0;
636
637 /*
638 * If the CPU passed through or entered a dynticks idle phase with
639 * no active irq handlers, then, as above, we can safely pretend
640 * that this CPU already acknowledged the counter.
641 */
642
643 if ((curr - snap) > 2 || (curr & 0x1) == 0)
644 return 0;
645
646 /* We need this CPU to explicitly acknowledge the counter flip. */
647
648 return 1;
649}
650
651static inline int
652rcu_try_flip_waitmb_needed(int cpu)
653{
654 long curr;
655 long snap;
656 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
657
658 curr = rdssp->dynticks;
659 snap = rdssp->dynticks_snap;
660 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
661
662 /*
663 * If the CPU remained in dynticks mode for the entire time
664 * and didn't take any interrupts, NMIs, SMIs, or whatever,
665 * then it cannot have executed an RCU read-side critical section
666 * during that time, so there is no need for it to execute a
667 * memory barrier.
668 */
669
670 if ((curr == snap) && ((curr & 0x1) == 0))
671 return 0;
672
673 /*
674 * If the CPU either entered or exited an outermost interrupt,
675 * SMI, NMI, or whatever handler, then we know that it executed
676 * a memory barrier when doing so. So we don't need another one.
677 */
678 if (curr != snap)
679 return 0;
680
681 /* We need the CPU to execute a memory barrier. */
682
683 return 1;
684}
685
686static void dyntick_save_progress_counter_sched(int cpu)
687{
688 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
689
690 rdssp->sched_dynticks_snap = rdssp->dynticks;
691}
692
693static int rcu_qsctr_inc_needed_dyntick(int cpu)
694{
695 long curr;
696 long snap;
697 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
698
699 curr = rdssp->dynticks;
700 snap = rdssp->sched_dynticks_snap;
701 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
702
703 /*
704 * If the CPU remained in dynticks mode for the entire time
705 * and didn't take any interrupts, NMIs, SMIs, or whatever,
706 * then it cannot be in the middle of an rcu_read_lock(), so
707 * the next rcu_read_lock() it executes must use the new value
708 * of the counter. Therefore, this CPU has been in a quiescent
709 * state the entire time, and we don't need to wait for it.
710 */
711
712 if ((curr == snap) && ((curr & 0x1) == 0))
713 return 0;
714
715 /*
716 * If the CPU passed through or entered a dynticks idle phase with
717 * no active irq handlers, then, as above, this CPU has already
718 * passed through a quiescent state.
719 */
720
721 if ((curr - snap) > 2 || (snap & 0x1) == 0)
722 return 0;
723
724 /* We need this CPU to go through a quiescent state. */
725
726 return 1;
727}
728
729#else /* !CONFIG_NO_HZ */
730
731# define dyntick_save_progress_counter(cpu) do { } while (0)
732# define rcu_try_flip_waitack_needed(cpu) (1)
733# define rcu_try_flip_waitmb_needed(cpu) (1)
734
735# define dyntick_save_progress_counter_sched(cpu) do { } while (0)
736# define rcu_qsctr_inc_needed_dyntick(cpu) (1)
737
738#endif /* CONFIG_NO_HZ */
739
740static void save_qsctr_sched(int cpu)
741{
742 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
743
744 rdssp->sched_qs_snap = rdssp->sched_qs;
745}
746
747static inline int rcu_qsctr_inc_needed(int cpu)
748{
749 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
750
751 /*
752 * If there has been a quiescent state, no more need to wait
753 * on this CPU.
754 */
755
756 if (rdssp->sched_qs != rdssp->sched_qs_snap) {
757 smp_mb(); /* force ordering with cpu entering schedule(). */
758 return 0;
759 }
760
761 /* We need this CPU to go through a quiescent state. */
762
763 return 1;
764}
765
766/*
767 * Get here when RCU is idle. Decide whether we need to
768 * move out of idle state, and return non-zero if so.
769 * "Straightforward" approach for the moment, might later
770 * use callback-list lengths, grace-period duration, or
771 * some such to determine when to exit idle state.
772 * Might also need a pre-idle test that does not acquire
773 * the lock, but let's get the simple case working first...
774 */
775
776static int
777rcu_try_flip_idle(void)
778{
779 int cpu;
780
781 RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
782 if (!rcu_pending(smp_processor_id())) {
783 RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
784 return 0;
785 }
786
787 /*
788 * Do the flip.
789 */
790
791 RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
792 rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */
793
794 /*
795 * Need a memory barrier so that other CPUs see the new
796 * counter value before they see the subsequent change of all
797 * the rcu_flip_flag instances to rcu_flipped.
798 */
799
800 smp_mb(); /* see above block comment. */
801
802 /* Now ask each CPU for acknowledgement of the flip. */
803
804 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
805 per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
806 dyntick_save_progress_counter(cpu);
807 }
808
809 return 1;
810}
811
812/*
813 * Wait for CPUs to acknowledge the flip.
814 */
815
816static int
817rcu_try_flip_waitack(void)
818{
819 int cpu;
820
821 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
822 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
823 if (rcu_try_flip_waitack_needed(cpu) &&
824 per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
825 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
826 return 0;
827 }
828
829 /*
830 * Make sure our checks above don't bleed into subsequent
831 * waiting for the sum of the counters to reach zero.
832 */
833
834 smp_mb(); /* see above block comment. */
835 RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
836 return 1;
837}
838
839/*
840 * Wait for collective ``last'' counter to reach zero,
841 * then tell all CPUs to do an end-of-grace-period memory barrier.
842 */
843
844static int
845rcu_try_flip_waitzero(void)
846{
847 int cpu;
848 int lastidx = !(rcu_ctrlblk.completed & 0x1);
849 int sum = 0;
850
851 /* Check to see if the sum of the "last" counters is zero. */
852
853 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
854 for_each_possible_cpu(cpu)
855 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
856 if (sum != 0) {
857 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
858 return 0;
859 }
860
861 /*
862 * This ensures that the other CPUs see the call for
863 * memory barriers -after- the sum to zero has been
864 * detected here
865 */
866 smp_mb(); /* ^^^^^^^^^^^^ */
867
868 /* Call for a memory barrier from each CPU. */
869 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
870 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
871 dyntick_save_progress_counter(cpu);
872 }
873
874 RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
875 return 1;
876}
877
878/*
879 * Wait for all CPUs to do their end-of-grace-period memory barrier.
880 * Return 0 once all CPUs have done so.
881 */
882
883static int
884rcu_try_flip_waitmb(void)
885{
886 int cpu;
887
888 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
889 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
890 if (rcu_try_flip_waitmb_needed(cpu) &&
891 per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
892 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
893 return 0;
894 }
895
896 smp_mb(); /* Ensure that the above checks precede any following flip. */
897 RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
898 return 1;
899}
900
901/*
902 * Attempt a single flip of the counters. Remember, a single flip does
903 * -not- constitute a grace period. Instead, the interval between
904 * at least GP_STAGES consecutive flips is a grace period.
905 *
906 * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
907 * on a large SMP, they might want to use a hierarchical organization of
908 * the per-CPU-counter pairs.
909 */
910static void rcu_try_flip(void)
911{
912 unsigned long flags;
913
914 RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
915 if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
916 RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
917 return;
918 }
919
920 /*
921 * Take the next transition(s) through the RCU grace-period
922 * flip-counter state machine.
923 */
924
925 switch (rcu_ctrlblk.rcu_try_flip_state) {
926 case rcu_try_flip_idle_state:
927 if (rcu_try_flip_idle())
928 rcu_ctrlblk.rcu_try_flip_state =
929 rcu_try_flip_waitack_state;
930 break;
931 case rcu_try_flip_waitack_state:
932 if (rcu_try_flip_waitack())
933 rcu_ctrlblk.rcu_try_flip_state =
934 rcu_try_flip_waitzero_state;
935 break;
936 case rcu_try_flip_waitzero_state:
937 if (rcu_try_flip_waitzero())
938 rcu_ctrlblk.rcu_try_flip_state =
939 rcu_try_flip_waitmb_state;
940 break;
941 case rcu_try_flip_waitmb_state:
942 if (rcu_try_flip_waitmb())
943 rcu_ctrlblk.rcu_try_flip_state =
944 rcu_try_flip_idle_state;
945 }
946 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
947}
948
949/*
950 * Check to see if this CPU needs to do a memory barrier in order to
951 * ensure that any prior RCU read-side critical sections have committed
952 * their counter manipulations and critical-section memory references
953 * before declaring the grace period to be completed.
954 */
955static void rcu_check_mb(int cpu)
956{
957 if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) {
958 smp_mb(); /* Ensure RCU read-side accesses are visible. */
959 per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
960 }
961}
962
963void rcu_check_callbacks(int cpu, int user)
964{
965 unsigned long flags;
966 struct rcu_data *rdp;
967
968 if (!rcu_pending(cpu))
969 return; /* if nothing for RCU to do. */
970
971 /*
972 * If this CPU took its interrupt from user mode or from the
973 * idle loop, and this is not a nested interrupt, then
974 * this CPU has to have exited all prior preempt-disable
975 * sections of code. So invoke rcu_sched_qs() to note this.
976 *
977 * The memory barrier is needed to handle the case where
978 * writes from a preempt-disable section of code get reordered
979 * into schedule() by this CPU's write buffer. So the memory
980 * barrier makes sure that the rcu_sched_qs() is seen by other
981 * CPUs to happen after any such write.
982 */
983
984 rdp = RCU_DATA_CPU(cpu);
985 if (user ||
986 (idle_cpu(cpu) && !in_softirq() &&
987 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
988 smp_mb(); /* Guard against aggressive schedule(). */
989 rcu_sched_qs(cpu);
990 }
991
992 rcu_check_mb(cpu);
993 if (rcu_ctrlblk.completed == rdp->completed)
994 rcu_try_flip();
995 spin_lock_irqsave(&rdp->lock, flags);
996 RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
997 __rcu_advance_callbacks(rdp);
998 if (rdp->donelist == NULL) {
999 spin_unlock_irqrestore(&rdp->lock, flags);
1000 } else {
1001 spin_unlock_irqrestore(&rdp->lock, flags);
1002 raise_softirq(RCU_SOFTIRQ);
1003 }
1004}
1005
1006/*
1007 * Needed by dynticks, to make sure all RCU processing has finished
1008 * when we go idle:
1009 */
1010void rcu_advance_callbacks(int cpu, int user)
1011{
1012 unsigned long flags;
1013 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1014
1015 if (rcu_ctrlblk.completed == rdp->completed) {
1016 rcu_try_flip();
1017 if (rcu_ctrlblk.completed == rdp->completed)
1018 return;
1019 }
1020 spin_lock_irqsave(&rdp->lock, flags);
1021 RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
1022 __rcu_advance_callbacks(rdp);
1023 spin_unlock_irqrestore(&rdp->lock, flags);
1024}
1025
1026#ifdef CONFIG_HOTPLUG_CPU
1027#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
1028 *dsttail = srclist; \
1029 if (srclist != NULL) { \
1030 dsttail = srctail; \
1031 srclist = NULL; \
1032 srctail = &srclist;\
1033 } \
1034 } while (0)
1035
1036void rcu_offline_cpu(int cpu)
1037{
1038 int i;
1039 struct rcu_head *list = NULL;
1040 unsigned long flags;
1041 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1042 struct rcu_head *schedlist = NULL;
1043 struct rcu_head **schedtail = &schedlist;
1044 struct rcu_head **tail = &list;
1045
1046 /*
1047 * Remove all callbacks from the newly dead CPU, retaining order.
1048 * Otherwise rcu_barrier() will fail
1049 */
1050
1051 spin_lock_irqsave(&rdp->lock, flags);
1052 rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
1053 for (i = GP_STAGES - 1; i >= 0; i--)
1054 rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
1055 list, tail);
1056 rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
1057 rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail,
1058 schedlist, schedtail);
1059 rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail,
1060 schedlist, schedtail);
1061 rdp->rcu_sched_sleeping = 0;
1062 spin_unlock_irqrestore(&rdp->lock, flags);
1063 rdp->waitlistcount = 0;
1064
1065 /* Disengage the newly dead CPU from the grace-period computation. */
1066
1067 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1068 rcu_check_mb(cpu);
1069 if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
1070 smp_mb(); /* Subsequent counter accesses must see new value */
1071 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
1072 smp_mb(); /* Subsequent RCU read-side critical sections */
1073 /* seen -after- acknowledgement. */
1074 }
1075
1076 cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1077
1078 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1079
1080 /*
1081 * Place the removed callbacks on the current CPU's queue.
1082 * Make them all start a new grace period: simple approach,
1083 * in theory could starve a given set of callbacks, but
1084 * you would need to be doing some serious CPU hotplugging
1085 * to make this happen. If this becomes a problem, adding
1086 * a synchronize_rcu() to the hotplug path would be a simple
1087 * fix.
1088 */
1089
1090 local_irq_save(flags); /* disable preempt till we know what lock. */
1091 rdp = RCU_DATA_ME();
1092 spin_lock(&rdp->lock);
1093 *rdp->nexttail = list;
1094 if (list)
1095 rdp->nexttail = tail;
1096 *rdp->nextschedtail = schedlist;
1097 if (schedlist)
1098 rdp->nextschedtail = schedtail;
1099 spin_unlock_irqrestore(&rdp->lock, flags);
1100}
1101
1102#else /* #ifdef CONFIG_HOTPLUG_CPU */
1103
1104void rcu_offline_cpu(int cpu)
1105{
1106}
1107
1108#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1109
1110void __cpuinit rcu_online_cpu(int cpu)
1111{
1112 unsigned long flags;
1113 struct rcu_data *rdp;
1114
1115 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1116 cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1117 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1118
1119 /*
1120 * The rcu_sched grace-period processing might have bypassed
1121 * this CPU, given that it was not in the rcu_cpu_online_map
1122 * when the grace-period scan started. This means that the
1123 * grace-period task might sleep. So make sure that if this
1124 * should happen, the first callback posted to this CPU will
1125 * wake up the grace-period task if need be.
1126 */
1127
1128 rdp = RCU_DATA_CPU(cpu);
1129 spin_lock_irqsave(&rdp->lock, flags);
1130 rdp->rcu_sched_sleeping = 1;
1131 spin_unlock_irqrestore(&rdp->lock, flags);
1132}
1133
1134static void rcu_process_callbacks(struct softirq_action *unused)
1135{
1136 unsigned long flags;
1137 struct rcu_head *next, *list;
1138 struct rcu_data *rdp;
1139
1140 local_irq_save(flags);
1141 rdp = RCU_DATA_ME();
1142 spin_lock(&rdp->lock);
1143 list = rdp->donelist;
1144 if (list == NULL) {
1145 spin_unlock_irqrestore(&rdp->lock, flags);
1146 return;
1147 }
1148 rdp->donelist = NULL;
1149 rdp->donetail = &rdp->donelist;
1150 RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
1151 spin_unlock_irqrestore(&rdp->lock, flags);
1152 while (list) {
1153 next = list->next;
1154 list->func(list);
1155 list = next;
1156 RCU_TRACE_ME(rcupreempt_trace_invoke);
1157 }
1158}
1159
1160void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1161{
1162 unsigned long flags;
1163 struct rcu_data *rdp;
1164
1165 head->func = func;
1166 head->next = NULL;
1167 local_irq_save(flags);
1168 rdp = RCU_DATA_ME();
1169 spin_lock(&rdp->lock);
1170 __rcu_advance_callbacks(rdp);
1171 *rdp->nexttail = head;
1172 rdp->nexttail = &head->next;
1173 RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
1174 spin_unlock_irqrestore(&rdp->lock, flags);
1175}
1176EXPORT_SYMBOL_GPL(call_rcu);
1177
1178void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1179{
1180 unsigned long flags;
1181 struct rcu_data *rdp;
1182 int wake_gp = 0;
1183
1184 head->func = func;
1185 head->next = NULL;
1186 local_irq_save(flags);
1187 rdp = RCU_DATA_ME();
1188 spin_lock(&rdp->lock);
1189 *rdp->nextschedtail = head;
1190 rdp->nextschedtail = &head->next;
1191 if (rdp->rcu_sched_sleeping) {
1192
1193 /* Grace-period processing might be sleeping... */
1194
1195 rdp->rcu_sched_sleeping = 0;
1196 wake_gp = 1;
1197 }
1198 spin_unlock_irqrestore(&rdp->lock, flags);
1199 if (wake_gp) {
1200
1201 /* Wake up grace-period processing, unless someone beat us. */
1202
1203 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1204 if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping)
1205 wake_gp = 0;
1206 rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping;
1207 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1208 if (wake_gp)
1209 wake_up_interruptible(&rcu_ctrlblk.sched_wq);
1210 }
1211}
1212EXPORT_SYMBOL_GPL(call_rcu_sched);
1213
1214/*
1215 * Wait until all currently running preempt_disable() code segments
1216 * (including hardware-irq-disable segments) complete. Note that
1217 * in -rt this does -not- necessarily result in all currently executing
1218 * interrupt -handlers- having completed.
1219 */
1220void __synchronize_sched(void)
1221{
1222 struct rcu_synchronize rcu;
1223
1224 if (num_online_cpus() == 1)
1225		return;  /* blocking is itself a grace period if there is only one CPU! */
1226
1227 init_completion(&rcu.completion);
1228	/* Will wake us up once the grace period has finished. */
1229 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1230 /* Wait for it. */
1231 wait_for_completion(&rcu.completion);
1232}
1233EXPORT_SYMBOL_GPL(__synchronize_sched);
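
The comment above states the contract for the sched flavor: any preempt_disable() (or irq-disabled) region acts as the read-side critical section. A hedged sketch of a matching reader/updater pair, assuming the usual synchronize_sched() wrapper maps onto __synchronize_sched() here; gp_data and the function names are illustrative.

#include <linux/rcupdate.h>
#include <linux/slab.h>

static int *gp_data;			/* published with rcu_assign_pointer() */

static int reader(void)
{
	int *p, val = -1;

	preempt_disable();		/* read-side critical section for RCU-sched */
	p = rcu_dereference(gp_data);
	if (p)
		val = *p;
	preempt_enable();
	return val;
}

static void writer(int *newp)
{
	int *oldp = gp_data;

	rcu_assign_pointer(gp_data, newp);
	synchronize_sched();		/* all prior preempt-disabled regions are done */
	kfree(oldp);
}
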
1234
1235/*
1236 * kthread function that manages call_rcu_sched grace periods.
1237 */
1238static int rcu_sched_grace_period(void *arg)
1239{
1240 int couldsleep; /* might sleep after current pass. */
1241 int couldsleepnext = 0; /* might sleep after next pass. */
1242 int cpu;
1243 unsigned long flags;
1244 struct rcu_data *rdp;
1245 int ret;
1246
1247 /*
1248 * Each pass through the following loop handles one
1249 * rcu_sched grace period cycle.
1250 */
1251 do {
1252 /* Save each CPU's current state. */
1253
1254 for_each_online_cpu(cpu) {
1255 dyntick_save_progress_counter_sched(cpu);
1256 save_qsctr_sched(cpu);
1257 }
1258
1259 /*
1260 * Sleep for about an RCU grace-period's worth to
1261 * allow better batching and to consume less CPU.
1262 */
1263 schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME);
1264
1265 /*
1266 * If there was nothing to do last time, prepare to
1267 * sleep at the end of the current grace period cycle.
1268 */
1269 couldsleep = couldsleepnext;
1270 couldsleepnext = 1;
1271 if (couldsleep) {
1272 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1273 rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep;
1274 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1275 }
1276
1277 /*
1278 * Wait on each CPU in turn to have either visited
1279 * a quiescent state or been in dynticks-idle mode.
1280 */
1281 for_each_online_cpu(cpu) {
1282 while (rcu_qsctr_inc_needed(cpu) &&
1283 rcu_qsctr_inc_needed_dyntick(cpu)) {
1284 /* resched_cpu(cpu); @@@ */
1285 schedule_timeout_interruptible(1);
1286 }
1287 }
1288
1289 /* Advance callbacks for each CPU. */
1290
1291 for_each_online_cpu(cpu) {
1292
1293 rdp = RCU_DATA_CPU(cpu);
1294 spin_lock_irqsave(&rdp->lock, flags);
1295
1296 /*
1297 * We are running on this CPU irq-disabled, so no
1298 * CPU can go offline until we re-enable irqs.
1299 * The current CPU might have already gone
1300			 * offline (between the for_each_online_cpu and
1301 * the spin_lock_irqsave), but in that case all its
1302 * callback lists will be empty, so no harm done.
1303 *
1304 * Advance the callbacks! We share normal RCU's
1305 * donelist, since callbacks are invoked the
1306 * same way in either case.
1307 */
1308 if (rdp->waitschedlist != NULL) {
1309 *rdp->donetail = rdp->waitschedlist;
1310 rdp->donetail = rdp->waitschedtail;
1311
1312 /*
1313 * Next rcu_check_callbacks() will
1314 * do the required raise_softirq().
1315 */
1316 }
1317 if (rdp->nextschedlist != NULL) {
1318 rdp->waitschedlist = rdp->nextschedlist;
1319 rdp->waitschedtail = rdp->nextschedtail;
1320 couldsleep = 0;
1321 couldsleepnext = 0;
1322 } else {
1323 rdp->waitschedlist = NULL;
1324 rdp->waitschedtail = &rdp->waitschedlist;
1325 }
1326 rdp->nextschedlist = NULL;
1327 rdp->nextschedtail = &rdp->nextschedlist;
1328
1329 /* Mark sleep intention. */
1330
1331 rdp->rcu_sched_sleeping = couldsleep;
1332
1333 spin_unlock_irqrestore(&rdp->lock, flags);
1334 }
1335
1336 /* If we saw callbacks on the last scan, go deal with them. */
1337
1338 if (!couldsleep)
1339 continue;
1340
1341 /* Attempt to block... */
1342
1343 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1344 if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) {
1345
1346 /*
1347 * Someone posted a callback after we scanned.
1348 * Go take care of it.
1349 */
1350 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1351 couldsleepnext = 0;
1352 continue;
1353 }
1354
1355 /* Block until the next person posts a callback. */
1356
1357 rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
1358 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1359 ret = 0; /* unused */
1360 __wait_event_interruptible(rcu_ctrlblk.sched_wq,
1361 rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
1362 ret);
1363
1364 couldsleepnext = 0;
1365
1366 } while (!kthread_should_stop());
1367
1368 return (0);
1369}
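
The block/wake handshake between this kthread and call_rcu_sched() is the classic lost-wakeup-safe pattern: the sleeper publishes its intent under a lock, and a waker clears that intent under the same lock before calling wake_up_interruptible(). A stripped-down, hedged sketch of the same pattern with illustrative names:

#include <linux/wait.h>
#include <linux/spinlock.h>
#include <linux/kthread.h>

static DECLARE_WAIT_QUEUE_HEAD(gp_wq);
static DEFINE_SPINLOCK(gp_lock);
static int gp_sleeping;			/* sleeper's published intent */

static void post_work(void)		/* e.g. a callback producer */
{
	unsigned long flags;
	int wake = 0;

	spin_lock_irqsave(&gp_lock, flags);
	if (gp_sleeping) {
		gp_sleeping = 0;	/* clear intent before waking */
		wake = 1;
	}
	spin_unlock_irqrestore(&gp_lock, flags);
	if (wake)
		wake_up_interruptible(&gp_wq);
}

static int gp_thread(void *unused)
{
	unsigned long flags;

	while (!kthread_should_stop()) {
		/* ... one pass of grace-period processing ... */
		spin_lock_irqsave(&gp_lock, flags);
		gp_sleeping = 1;	/* publish intent to sleep */
		spin_unlock_irqrestore(&gp_lock, flags);
		if (wait_event_interruptible(gp_wq, !gp_sleeping ||
					     kthread_should_stop()))
			continue;	/* interrupted; recheck the stop condition */
	}
	return 0;
}
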
1370
1371/*
1372 * Check to see if any future RCU-related work will need to be done
1373 * by the current CPU, even if none need be done immediately, returning
1374 * 1 if so. Assumes that notifiers would take care of handling any
1375 * outstanding requests from the RCU core.
1376 *
1377 * This function is part of the RCU implementation; it is -not-
1378 * an exported member of the RCU API.
1379 */
1380int rcu_needs_cpu(int cpu)
1381{
1382 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1383
1384 return (rdp->donelist != NULL ||
1385 !!rdp->waitlistcount ||
1386 rdp->nextlist != NULL ||
1387 rdp->nextschedlist != NULL ||
1388 rdp->waitschedlist != NULL);
1389}
1390
1391static int rcu_pending(int cpu)
1392{
1393 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1394
1395 /* The CPU has at least one callback queued somewhere. */
1396
1397 if (rdp->donelist != NULL ||
1398 !!rdp->waitlistcount ||
1399 rdp->nextlist != NULL ||
1400 rdp->nextschedlist != NULL ||
1401 rdp->waitschedlist != NULL)
1402 return 1;
1403
1404 /* The RCU core needs an acknowledgement from this CPU. */
1405
1406 if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) ||
1407 (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed))
1408 return 1;
1409
1410 /* This CPU has fallen behind the global grace-period number. */
1411
1412 if (rdp->completed != rcu_ctrlblk.completed)
1413 return 1;
1414
1415 /* Nothing needed from this CPU. */
1416
1417 return 0;
1418}
1419
1420int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1421 unsigned long action, void *hcpu)
1422{
1423 long cpu = (long)hcpu;
1424
1425 switch (action) {
1426 case CPU_UP_PREPARE:
1427 case CPU_UP_PREPARE_FROZEN:
1428 rcu_online_cpu(cpu);
1429 break;
1430 case CPU_UP_CANCELED:
1431 case CPU_UP_CANCELED_FROZEN:
1432 case CPU_DEAD:
1433 case CPU_DEAD_FROZEN:
1434 rcu_offline_cpu(cpu);
1435 break;
1436 default:
1437 break;
1438 }
1439 return NOTIFY_OK;
1440}
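
For completeness, a notifier of this shape would normally be packaged in a notifier_block and handed to the 2009-era register_cpu_notifier() interface. The wiring below is an illustrative sketch, not part of this patch; the real registration lives elsewhere in the RCU code, and the helper name is hypothetical.

#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/init.h>

static struct notifier_block rcu_nb = {
	.notifier_call = rcu_cpu_notify,
};

static void __init rcu_register_cpu_notifier(void)
{
	/* Hypothetical helper shown only to illustrate the hookup. */
	register_cpu_notifier(&rcu_nb);
}
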
1441
1442void __init __rcu_init(void)
1443{
1444 int cpu;
1445 int i;
1446 struct rcu_data *rdp;
1447
1448 printk(KERN_NOTICE "Preemptible RCU implementation.\n");
1449 for_each_possible_cpu(cpu) {
1450 rdp = RCU_DATA_CPU(cpu);
1451 spin_lock_init(&rdp->lock);
1452 rdp->completed = 0;
1453 rdp->waitlistcount = 0;
1454 rdp->nextlist = NULL;
1455 rdp->nexttail = &rdp->nextlist;
1456 for (i = 0; i < GP_STAGES; i++) {
1457 rdp->waitlist[i] = NULL;
1458 rdp->waittail[i] = &rdp->waitlist[i];
1459 }
1460 rdp->donelist = NULL;
1461 rdp->donetail = &rdp->donelist;
1462 rdp->rcu_flipctr[0] = 0;
1463 rdp->rcu_flipctr[1] = 0;
1464 rdp->nextschedlist = NULL;
1465 rdp->nextschedtail = &rdp->nextschedlist;
1466 rdp->waitschedlist = NULL;
1467 rdp->waitschedtail = &rdp->waitschedlist;
1468 rdp->rcu_sched_sleeping = 0;
1469 }
1470 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1471}
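
The per-CPU initialization above (list = NULL, tail = &list) sets up the tail-pointer queues used throughout this file: an empty queue's tail points at the head pointer itself, so enqueue and splice never need a special empty-list case. A hedged, generic sketch of the idiom; qnode and queue are illustrative types, not from this file.

#include <linux/stddef.h>

struct qnode {
	struct qnode *next;
};

struct queue {
	struct qnode *head;
	struct qnode **tail;		/* points at head when empty */
};

static void queue_init(struct queue *q)
{
	q->head = NULL;
	q->tail = &q->head;
}

static void queue_add(struct queue *q, struct qnode *n)
{
	n->next = NULL;
	*q->tail = n;			/* same code for empty and non-empty queues */
	q->tail = &n->next;
}

static void queue_splice(struct queue *dst, struct queue *src)
{
	/* Append all of src to dst, as the grace-period code does when
	 * advancing callbacks from one stage to the next. */
	if (src->head) {
		*dst->tail = src->head;
		dst->tail = src->tail;
	}
	queue_init(src);
}
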
1472
1473/*
1474 * Late-boot-time RCU initialization that must wait until after scheduler
1475 * has been initialized.
1476 */
1477void __init rcu_init_sched(void)
1478{
1479 rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period,
1480 NULL,
1481 "rcu_sched_grace_period");
1482 WARN_ON(IS_ERR(rcu_sched_grace_period_task));
1483}
1484
1485#ifdef CONFIG_RCU_TRACE
1486long *rcupreempt_flipctr(int cpu)
1487{
1488 return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
1489}
1490EXPORT_SYMBOL_GPL(rcupreempt_flipctr);
1491
1492int rcupreempt_flip_flag(int cpu)
1493{
1494 return per_cpu(rcu_flip_flag, cpu);
1495}
1496EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
1497
1498int rcupreempt_mb_flag(int cpu)
1499{
1500 return per_cpu(rcu_mb_flag, cpu);
1501}
1502EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
1503
1504char *rcupreempt_try_flip_state_name(void)
1505{
1506 return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
1507}
1508EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
1509
1510struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
1511{
1512 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1513
1514 return &rdp->trace;
1515}
1516EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
1517
1518#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
deleted file mode 100644
index 11640346a507..000000000000
--- a/kernel/rcupreempt_trace.c
+++ /dev/null
@@ -1,335 +0,0 @@
1/*
2 * Read-Copy Update tracing for realtime implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2006
19 *
20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU/ *.txt
24 *
25 */
26#include <linux/types.h>
27#include <linux/kernel.h>
28#include <linux/init.h>
29#include <linux/spinlock.h>
30#include <linux/smp.h>
31#include <linux/rcupdate.h>
32#include <linux/interrupt.h>
33#include <linux/sched.h>
34#include <asm/atomic.h>
35#include <linux/bitops.h>
36#include <linux/module.h>
37#include <linux/completion.h>
38#include <linux/moduleparam.h>
39#include <linux/percpu.h>
40#include <linux/notifier.h>
41#include <linux/cpu.h>
42#include <linux/mutex.h>
43#include <linux/rcupreempt_trace.h>
44#include <linux/debugfs.h>
45
46static struct mutex rcupreempt_trace_mutex;
47static char *rcupreempt_trace_buf;
48#define RCUPREEMPT_TRACE_BUF_SIZE 4096
49
50void rcupreempt_trace_move2done(struct rcupreempt_trace *trace)
51{
52 trace->done_length += trace->wait_length;
53 trace->done_add += trace->wait_length;
54 trace->wait_length = 0;
55}
56void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace)
57{
58 trace->wait_length += trace->next_length;
59 trace->wait_add += trace->next_length;
60 trace->next_length = 0;
61}
62void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace)
63{
64 atomic_inc(&trace->rcu_try_flip_1);
65}
66void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace)
67{
68 atomic_inc(&trace->rcu_try_flip_e1);
69}
70void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace)
71{
72 trace->rcu_try_flip_i1++;
73}
74void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace)
75{
76 trace->rcu_try_flip_ie1++;
77}
78void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace)
79{
80 trace->rcu_try_flip_g1++;
81}
82void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace)
83{
84 trace->rcu_try_flip_a1++;
85}
86void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace)
87{
88 trace->rcu_try_flip_ae1++;
89}
90void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace)
91{
92 trace->rcu_try_flip_a2++;
93}
94void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace)
95{
96 trace->rcu_try_flip_z1++;
97}
98void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace)
99{
100 trace->rcu_try_flip_ze1++;
101}
102void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace)
103{
104 trace->rcu_try_flip_z2++;
105}
106void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace)
107{
108 trace->rcu_try_flip_m1++;
109}
110void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace)
111{
112 trace->rcu_try_flip_me1++;
113}
114void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace)
115{
116 trace->rcu_try_flip_m2++;
117}
118void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace)
119{
120 trace->rcu_check_callbacks++;
121}
122void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace)
123{
124 trace->done_remove += trace->done_length;
125 trace->done_length = 0;
126}
127void rcupreempt_trace_invoke(struct rcupreempt_trace *trace)
128{
129 atomic_inc(&trace->done_invoked);
130}
131void rcupreempt_trace_next_add(struct rcupreempt_trace *trace)
132{
133 trace->next_add++;
134 trace->next_length++;
135}
136
137static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
138{
139 struct rcupreempt_trace *cp;
140 int cpu;
141
142 memset(sp, 0, sizeof(*sp));
143 for_each_possible_cpu(cpu) {
144 cp = rcupreempt_trace_cpu(cpu);
145 sp->next_length += cp->next_length;
146 sp->next_add += cp->next_add;
147 sp->wait_length += cp->wait_length;
148 sp->wait_add += cp->wait_add;
149 sp->done_length += cp->done_length;
150 sp->done_add += cp->done_add;
151 sp->done_remove += cp->done_remove;
152 atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked);
153 sp->rcu_check_callbacks += cp->rcu_check_callbacks;
154 atomic_add(atomic_read(&cp->rcu_try_flip_1),
155 &sp->rcu_try_flip_1);
156 atomic_add(atomic_read(&cp->rcu_try_flip_e1),
157 &sp->rcu_try_flip_e1);
158 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
159 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
160 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
161 sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1;
162 sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1;
163 sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2;
164 sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1;
165 sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1;
166 sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2;
167 sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1;
168 sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1;
169 sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2;
170 }
171}
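
rcupreempt_trace_sum() is a straightforward fold of per-CPU statistics into one snapshot. A minimal, hedged sketch of the same aggregation pattern with the generic per-CPU API; demo_count is an invented counter name.

#include <linux/percpu.h>
#include <linux/cpumask.h>

static DEFINE_PER_CPU(unsigned long, demo_count);

static unsigned long demo_count_sum(void)
{
	unsigned long sum = 0;
	int cpu;

	/* A snapshot, not an atomic total: counters may advance while we sum. */
	for_each_possible_cpu(cpu)
		sum += per_cpu(demo_count, cpu);
	return sum;
}
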
172
173static ssize_t rcustats_read(struct file *filp, char __user *buffer,
174 size_t count, loff_t *ppos)
175{
176 struct rcupreempt_trace trace;
177 ssize_t bcount;
178 int cnt = 0;
179
180 rcupreempt_trace_sum(&trace);
181 mutex_lock(&rcupreempt_trace_mutex);
182	cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
183 "ggp=%ld rcc=%ld\n",
184 rcu_batches_completed(),
185 trace.rcu_check_callbacks);
186 snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
187 "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n"
188 "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n"
189 "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n",
190
191 trace.next_add, trace.next_length,
192 trace.wait_add, trace.wait_length,
193 trace.done_add, trace.done_length,
194 trace.done_remove, atomic_read(&trace.done_invoked),
195 atomic_read(&trace.rcu_try_flip_1),
196 atomic_read(&trace.rcu_try_flip_e1),
197 trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1,
198 trace.rcu_try_flip_g1,
199 trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1,
200 trace.rcu_try_flip_a2,
201 trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1,
202 trace.rcu_try_flip_z2,
203 trace.rcu_try_flip_m1, trace.rcu_try_flip_me1,
204 trace.rcu_try_flip_m2);
205 bcount = simple_read_from_buffer(buffer, count, ppos,
206 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
207 mutex_unlock(&rcupreempt_trace_mutex);
208 return bcount;
209}
210
211static ssize_t rcugp_read(struct file *filp, char __user *buffer,
212 size_t count, loff_t *ppos)
213{
214 long oldgp = rcu_batches_completed();
215 ssize_t bcount;
216
217 mutex_lock(&rcupreempt_trace_mutex);
218 synchronize_rcu();
219 snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE,
220 "oldggp=%ld newggp=%ld\n", oldgp, rcu_batches_completed());
221 bcount = simple_read_from_buffer(buffer, count, ppos,
222 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
223 mutex_unlock(&rcupreempt_trace_mutex);
224 return bcount;
225}
226
227static ssize_t rcuctrs_read(struct file *filp, char __user *buffer,
228 size_t count, loff_t *ppos)
229{
230 int cnt = 0;
231 int cpu;
232 int f = rcu_batches_completed() & 0x1;
233 ssize_t bcount;
234
235 mutex_lock(&rcupreempt_trace_mutex);
236
237	cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
238 "CPU last cur F M\n");
239 for_each_possible_cpu(cpu) {
240 long *flipctr = rcupreempt_flipctr(cpu);
241 cnt += snprintf(&rcupreempt_trace_buf[cnt],
242 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
243 "%3d%c %4ld %3ld %d %d\n",
244 cpu,
245 cpu_is_offline(cpu) ? '!' : ' ',
246 flipctr[!f],
247 flipctr[f],
248 rcupreempt_flip_flag(cpu),
249 rcupreempt_mb_flag(cpu));
250 }
251 cnt += snprintf(&rcupreempt_trace_buf[cnt],
252 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
253 "ggp = %ld, state = %s\n",
254 rcu_batches_completed(),
255 rcupreempt_try_flip_state_name());
256 cnt += snprintf(&rcupreempt_trace_buf[cnt],
257 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
258 "\n");
259 bcount = simple_read_from_buffer(buffer, count, ppos,
260 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
261 mutex_unlock(&rcupreempt_trace_mutex);
262 return bcount;
263}
264
265static struct file_operations rcustats_fops = {
266 .owner = THIS_MODULE,
267 .read = rcustats_read,
268};
269
270static struct file_operations rcugp_fops = {
271 .owner = THIS_MODULE,
272 .read = rcugp_read,
273};
274
275static struct file_operations rcuctrs_fops = {
276 .owner = THIS_MODULE,
277 .read = rcuctrs_read,
278};
279
280static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir;
281static int rcupreempt_debugfs_init(void)
282{
283 rcudir = debugfs_create_dir("rcu", NULL);
284 if (!rcudir)
285 goto out;
286 statdir = debugfs_create_file("rcustats", 0444, rcudir,
287 NULL, &rcustats_fops);
288 if (!statdir)
289 goto free_out;
290
291 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
292 if (!gpdir)
293 goto free_out;
294
295 ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir,
296 NULL, &rcuctrs_fops);
297 if (!ctrsdir)
298 goto free_out;
299 return 0;
300free_out:
301 if (statdir)
302 debugfs_remove(statdir);
303 if (gpdir)
304 debugfs_remove(gpdir);
305 debugfs_remove(rcudir);
306out:
307 return 1;
308}
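
The debugfs plumbing above follows the usual create-dir/create-file recipe, with simple_read_from_buffer() doing the copy-out. A minimal, hedged example of a read-only debugfs file built the same way; every name in it is illustrative.

#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/uaccess.h>

static struct dentry *demo_dir, *demo_file;

static ssize_t demo_read(struct file *filp, char __user *buf,
			 size_t count, loff_t *ppos)
{
	static const char msg[] = "hello from debugfs\n";

	return simple_read_from_buffer(buf, count, ppos, msg, sizeof(msg) - 1);
}

static const struct file_operations demo_fops = {
	.owner	= THIS_MODULE,
	.read	= demo_read,
};

static int __init demo_debugfs_init(void)
{
	demo_dir = debugfs_create_dir("demo", NULL);
	if (!demo_dir)
		return -ENOMEM;
	demo_file = debugfs_create_file("msg", 0444, demo_dir, NULL, &demo_fops);
	if (!demo_file) {
		debugfs_remove(demo_dir);
		return -ENOMEM;
	}
	return 0;
}
module_init(demo_debugfs_init);
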
309
310static int __init rcupreempt_trace_init(void)
311{
312 int ret;
313
314 mutex_init(&rcupreempt_trace_mutex);
315 rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
316 if (!rcupreempt_trace_buf)
317 return 1;
318 ret = rcupreempt_debugfs_init();
319 if (ret)
320 kfree(rcupreempt_trace_buf);
321 return ret;
322}
323
324static void __exit rcupreempt_trace_cleanup(void)
325{
326 debugfs_remove(statdir);
327 debugfs_remove(gpdir);
328 debugfs_remove(ctrsdir);
329 debugfs_remove(rcudir);
330 kfree(rcupreempt_trace_buf);
331}
332
333
334module_init(rcupreempt_trace_init);
335module_exit(rcupreempt_trace_cleanup);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index f87fb0c8f924..82fbc49728df 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -725,7 +725,7 @@ config RCU_TORTURE_TEST_RUNNABLE
725 725
726config RCU_CPU_STALL_DETECTOR 726config RCU_CPU_STALL_DETECTOR
727 bool "Check for stalled CPUs delaying RCU grace periods" 727 bool "Check for stalled CPUs delaying RCU grace periods"
728 depends on CLASSIC_RCU || TREE_RCU || TREE_PREEMPT_RCU 728 depends on TREE_RCU || TREE_PREEMPT_RCU
729 default n 729 default n
730 help 730 help
731 This option causes RCU to printk information on which 731 This option causes RCU to printk information on which