-rw-r--r--  Documentation/RCU/torture.txt          15
-rw-r--r--  Documentation/kernel-parameters.txt    88
-rw-r--r--  MAINTAINERS                            14
-rw-r--r--  arch/um/drivers/mconsole_kern.c         1
-rw-r--r--  include/linux/rculist.h                40
-rw-r--r--  include/linux/rcupdate.h               20
-rw-r--r--  include/linux/rcutiny.h                11
-rw-r--r--  include/linux/rcutree.h                19
-rw-r--r--  include/linux/sched.h                  10
-rw-r--r--  include/linux/srcu.h                   48
-rw-r--r--  include/trace/events/rcu.h              2
-rw-r--r--  init/Kconfig                           50
-rw-r--r--  kernel/rcupdate.c                      28
-rw-r--r--  kernel/rcutiny_plugin.h                16
-rw-r--r--  kernel/rcutorture.c                   257
-rw-r--r--  kernel/rcutree.c                      332
-rw-r--r--  kernel/rcutree.h                       23
-rw-r--r--  kernel/rcutree_plugin.h               154
-rw-r--r--  kernel/rcutree_trace.c                  4
-rw-r--r--  kernel/sched/core.c                     1
-rw-r--r--  kernel/srcu.c                         548
-rw-r--r--  kernel/timer.c                          8
-rw-r--r--  lib/list_debug.c                       22
23 files changed, 1358 insertions, 353 deletions
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt
index 375d3fb71437..4ddf3913fd8c 100644
--- a/Documentation/RCU/torture.txt
+++ b/Documentation/RCU/torture.txt
@@ -47,6 +47,16 @@ irqreader Says to invoke RCU readers from irq level. This is currently
47 permit this. (Or, more accurately, variants of RCU that do 47 permit this. (Or, more accurately, variants of RCU that do
48 -not- permit this know to ignore this variable.) 48 -not- permit this know to ignore this variable.)
49 49
50n_barrier_cbs If this is nonzero, RCU barrier testing will be conducted,
51 in which case n_barrier_cbs specifies the number of
52 RCU callbacks (and corresponding kthreads) to use for
53 this testing. The value cannot be negative. If you
54 specify this to be non-zero when torture_type indicates a
 55		specify this to be non-zero when torture_type indicates a
 56		synchronous RCU implementation (one for which a member of the
 56		synchronize_rcu() family rather than the call_rcu() family is
57 used -- see the documentation for torture_type below), an
58 error will be reported and no testing will be carried out.
59
50nfakewriters This is the number of RCU fake writer threads to run. Fake 60nfakewriters This is the number of RCU fake writer threads to run. Fake
51 writer threads repeatedly use the synchronous "wait for 61 writer threads repeatedly use the synchronous "wait for
52 current readers" function of the interface selected by 62 current readers" function of the interface selected by
@@ -188,7 +198,7 @@ OUTPUT
188The statistics output is as follows: 198The statistics output is as follows:
189 199
190 rcu-torture:--- Start of test: nreaders=16 nfakewriters=4 stat_interval=30 verbose=0 test_no_idle_hz=1 shuffle_interval=3 stutter=5 irqreader=1 fqs_duration=0 fqs_holdoff=0 fqs_stutter=3 test_boost=1/0 test_boost_interval=7 test_boost_duration=4 200 rcu-torture:--- Start of test: nreaders=16 nfakewriters=4 stat_interval=30 verbose=0 test_no_idle_hz=1 shuffle_interval=3 stutter=5 irqreader=1 fqs_duration=0 fqs_holdoff=0 fqs_stutter=3 test_boost=1/0 test_boost_interval=7 test_boost_duration=4
191 rcu-torture: rtc: (null) ver: 155441 tfle: 0 rta: 155441 rtaf: 8884 rtf: 155440 rtmbe: 0 rtbke: 0 rtbre: 0 rtbf: 0 rtb: 0 nt: 3055767 201 rcu-torture: rtc: (null) ver: 155441 tfle: 0 rta: 155441 rtaf: 8884 rtf: 155440 rtmbe: 0 rtbe: 0 rtbke: 0 rtbre: 0 rtbf: 0 rtb: 0 nt: 3055767
192 rcu-torture: Reader Pipe: 727860534 34213 0 0 0 0 0 0 0 0 0 202 rcu-torture: Reader Pipe: 727860534 34213 0 0 0 0 0 0 0 0 0
193 rcu-torture: Reader Batch: 727877838 17003 0 0 0 0 0 0 0 0 0 203 rcu-torture: Reader Batch: 727877838 17003 0 0 0 0 0 0 0 0 0
194 rcu-torture: Free-Block Circulation: 155440 155440 155440 155440 155440 155440 155440 155440 155440 155440 0 204 rcu-torture: Free-Block Circulation: 155440 155440 155440 155440 155440 155440 155440 155440 155440 155440 0
@@ -230,6 +240,9 @@ o "rtmbe": A non-zero value indicates that rcutorture believes that
230 rcu_assign_pointer() and rcu_dereference() are not working 240 rcu_assign_pointer() and rcu_dereference() are not working
231 correctly. This value should be zero. 241 correctly. This value should be zero.
232 242
243o "rtbe": A non-zero value indicates that one of the rcu_barrier()
244 family of functions is not working correctly.
245
233o "rtbke": rcutorture was unable to create the real-time kthreads 246o "rtbke": rcutorture was unable to create the real-time kthreads
234 used to force RCU priority inversion. This value should be zero. 247 used to force RCU priority inversion. This value should be zero.
235 248
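
The "rtbe" counter and the n_barrier_cbs kthreads check the core guarantee of the rcu_barrier() family: every callback posted by call_rcu() (or its bh/sched/SRCU counterparts) before rcu_barrier() is invoked must have run by the time rcu_barrier() returns. A minimal sketch of the usage pattern this protects, typically a module-unload path, follows; struct foo and the function names are illustrative and not taken from rcutorture.

#include <linux/slab.h>
#include <linux/rcupdate.h>

struct foo {
	struct rcu_head rcu;
	int data;
};

static void foo_reclaim(struct rcu_head *head)
{
	kfree(container_of(head, struct foo, rcu));
}

static void foo_retire(struct foo *fp)
{
	/* Asynchronously free fp after a grace period. */
	call_rcu(&fp->rcu, foo_reclaim);
}

static void foo_exit(void)
{
	/*
	 * Wait for every previously posted foo_reclaim() callback to
	 * finish before the code containing it can safely go away.
	 */
	rcu_barrier();
}
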
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index f995195409fd..0e90453e4acb 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2333,18 +2333,100 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
2333 ramdisk_size= [RAM] Sizes of RAM disks in kilobytes 2333 ramdisk_size= [RAM] Sizes of RAM disks in kilobytes
2334 See Documentation/blockdev/ramdisk.txt. 2334 See Documentation/blockdev/ramdisk.txt.
2335 2335
2336 rcupdate.blimit= [KNL,BOOT] 2336 rcutree.blimit= [KNL,BOOT]
2337 Set maximum number of finished RCU callbacks to process 2337 Set maximum number of finished RCU callbacks to process
2338 in one batch. 2338 in one batch.
2339 2339
2340 rcupdate.qhimark= [KNL,BOOT] 2340 rcutree.qhimark= [KNL,BOOT]
2341 Set threshold of queued 2341 Set threshold of queued
2342 RCU callbacks over which batch limiting is disabled. 2342 RCU callbacks over which batch limiting is disabled.
2343 2343
2344 rcupdate.qlowmark= [KNL,BOOT] 2344 rcutree.qlowmark= [KNL,BOOT]
2345 Set threshold of queued RCU callbacks below which 2345 Set threshold of queued RCU callbacks below which
2346 batch limiting is re-enabled. 2346 batch limiting is re-enabled.
2347 2347
2348 rcutree.rcu_cpu_stall_suppress= [KNL,BOOT]
2349 Suppress RCU CPU stall warning messages.
2350
2351 rcutree.rcu_cpu_stall_timeout= [KNL,BOOT]
2352 Set timeout for RCU CPU stall warning messages.
2353
2354 rcutorture.fqs_duration= [KNL,BOOT]
2355 Set duration of force_quiescent_state bursts.
2356
2357 rcutorture.fqs_holdoff= [KNL,BOOT]
2358 Set holdoff time within force_quiescent_state bursts.
2359
2360 rcutorture.fqs_stutter= [KNL,BOOT]
2361 Set wait time between force_quiescent_state bursts.
2362
2363 rcutorture.irqreader= [KNL,BOOT]
2364 Test RCU readers from irq handlers.
2365
2366 rcutorture.n_barrier_cbs= [KNL,BOOT]
2367 Set callbacks/threads for rcu_barrier() testing.
2368
2369 rcutorture.nfakewriters= [KNL,BOOT]
2370 Set number of concurrent RCU writers. These just
 2371				stress RCU; they don't participate in the actual
2372 test, hence the "fake".
2373
2374 rcutorture.nreaders= [KNL,BOOT]
2375 Set number of RCU readers.
2376
2377 rcutorture.onoff_holdoff= [KNL,BOOT]
2378 Set time (s) after boot for CPU-hotplug testing.
2379
2380 rcutorture.onoff_interval= [KNL,BOOT]
2381 Set time (s) between CPU-hotplug operations, or
2382 zero to disable CPU-hotplug testing.
2383
2384 rcutorture.shuffle_interval= [KNL,BOOT]
2385 Set task-shuffle interval (s). Shuffling tasks
2386 allows some CPUs to go into dyntick-idle mode
2387 during the rcutorture test.
2388
2389 rcutorture.shutdown_secs= [KNL,BOOT]
 2390				Set time (s) after boot for system shutdown. This
2391 is useful for hands-off automated testing.
2392
2393 rcutorture.stall_cpu= [KNL,BOOT]
2394 Duration of CPU stall (s) to test RCU CPU stall
2395 warnings, zero to disable.
2396
2397 rcutorture.stall_cpu_holdoff= [KNL,BOOT]
2398 Time to wait (s) after boot before inducing stall.
2399
2400 rcutorture.stat_interval= [KNL,BOOT]
2401 Time (s) between statistics printk()s.
2402
2403 rcutorture.stutter= [KNL,BOOT]
2404 Time (s) to stutter testing, for example, specifying
2405 five seconds causes the test to run for five seconds,
2406 wait for five seconds, and so on. This tests RCU's
2407 ability to transition abruptly to and from idle.
2408
2409 rcutorture.test_boost= [KNL,BOOT]
2410 Test RCU priority boosting? 0=no, 1=maybe, 2=yes.
2411 "Maybe" means test if the RCU implementation
 2412				under test supports RCU priority boosting.
2413
2414 rcutorture.test_boost_duration= [KNL,BOOT]
2415 Duration (s) of each individual boost test.
2416
2417 rcutorture.test_boost_interval= [KNL,BOOT]
2418 Interval (s) between each boost test.
2419
2420 rcutorture.test_no_idle_hz= [KNL,BOOT]
2421 Test RCU's dyntick-idle handling. See also the
2422 rcutorture.shuffle_interval parameter.
2423
2424 rcutorture.torture_type= [KNL,BOOT]
2425 Specify the RCU implementation to test.
2426
2427 rcutorture.verbose= [KNL,BOOT]
2428 Enable additional printk() statements.
2429
2348 rdinit= [KNL] 2430 rdinit= [KNL]
2349 Format: <full_path> 2431 Format: <full_path>
2350 Run specified binary instead of /init from the ramdisk, 2432 Run specified binary instead of /init from the ramdisk,
diff --git a/MAINTAINERS b/MAINTAINERS
index 73a8b561414b..5ccca1ca0077 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5598,14 +5598,13 @@ F: net/rds/
5598READ-COPY UPDATE (RCU) 5598READ-COPY UPDATE (RCU)
5599M: Dipankar Sarma <dipankar@in.ibm.com> 5599M: Dipankar Sarma <dipankar@in.ibm.com>
5600M: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> 5600M: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
5601W: http://www.rdrop.com/users/paulmck/rclock/ 5601W: http://www.rdrop.com/users/paulmck/RCU/
5602S: Supported 5602S: Supported
5603T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git 5603T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git
5604F: Documentation/RCU/ 5604F: Documentation/RCU/
5605X: Documentation/RCU/torture.txt
5605F: include/linux/rcu* 5606F: include/linux/rcu*
5606F: include/linux/srcu*
5607F: kernel/rcu* 5607F: kernel/rcu*
5608F: kernel/srcu*
5609X: kernel/rcutorture.c 5608X: kernel/rcutorture.c
5610 5609
5611REAL TIME CLOCK (RTC) SUBSYSTEM 5610REAL TIME CLOCK (RTC) SUBSYSTEM
@@ -6122,6 +6121,15 @@ S: Maintained
6122F: include/linux/sl?b*.h 6121F: include/linux/sl?b*.h
6123F: mm/sl?b.c 6122F: mm/sl?b.c
6124 6123
6124SLEEPABLE READ-COPY UPDATE (SRCU)
6125M: Lai Jiangshan <laijs@cn.fujitsu.com>
6126M: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
6127W: http://www.rdrop.com/users/paulmck/RCU/
6128S: Supported
6129T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git
6130F: include/linux/srcu*
6131F: kernel/srcu*
6132
6125SMC91x ETHERNET DRIVER 6133SMC91x ETHERNET DRIVER
6126M: Nicolas Pitre <nico@fluxnic.net> 6134M: Nicolas Pitre <nico@fluxnic.net>
6127S: Odd Fixes 6135S: Odd Fixes
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 43b39d61b538..88e466b159dc 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -705,6 +705,7 @@ static void stack_proc(void *arg)
705 struct task_struct *from = current, *to = arg; 705 struct task_struct *from = current, *to = arg;
706 706
707 to->thread.saved_task = from; 707 to->thread.saved_task = from;
708 rcu_switch_from(from);
708 switch_to(from, to, from); 709 switch_to(from, to, from);
709} 710}
710 711
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index d079290843a9..e0f0fab20415 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -30,6 +30,7 @@
30 * This is only for internal list manipulation where we know 30 * This is only for internal list manipulation where we know
31 * the prev/next entries already! 31 * the prev/next entries already!
32 */ 32 */
33#ifndef CONFIG_DEBUG_LIST
33static inline void __list_add_rcu(struct list_head *new, 34static inline void __list_add_rcu(struct list_head *new,
34 struct list_head *prev, struct list_head *next) 35 struct list_head *prev, struct list_head *next)
35{ 36{
@@ -38,6 +39,10 @@ static inline void __list_add_rcu(struct list_head *new,
38 rcu_assign_pointer(list_next_rcu(prev), new); 39 rcu_assign_pointer(list_next_rcu(prev), new);
39 next->prev = new; 40 next->prev = new;
40} 41}
42#else
43extern void __list_add_rcu(struct list_head *new,
44 struct list_head *prev, struct list_head *next);
45#endif
41 46
42/** 47/**
43 * list_add_rcu - add a new entry to rcu-protected list 48 * list_add_rcu - add a new entry to rcu-protected list
@@ -108,7 +113,7 @@ static inline void list_add_tail_rcu(struct list_head *new,
108 */ 113 */
109static inline void list_del_rcu(struct list_head *entry) 114static inline void list_del_rcu(struct list_head *entry)
110{ 115{
111 __list_del(entry->prev, entry->next); 116 __list_del_entry(entry);
112 entry->prev = LIST_POISON2; 117 entry->prev = LIST_POISON2;
113} 118}
114 119
@@ -228,18 +233,43 @@ static inline void list_splice_init_rcu(struct list_head *list,
228 }) 233 })
229 234
230/** 235/**
231 * list_first_entry_rcu - get the first element from a list 236 * Where are list_empty_rcu() and list_first_entry_rcu()?
237 *
238 * Implementing those functions following their counterparts list_empty() and
239 * list_first_entry() is not advisable because they lead to subtle race
240 * conditions as the following snippet shows:
241 *
242 * if (!list_empty_rcu(mylist)) {
243 * struct foo *bar = list_first_entry_rcu(mylist, struct foo, list_member);
244 * do_something(bar);
245 * }
246 *
247 * The list may not be empty when list_empty_rcu checks it, but it may be when
248 * list_first_entry_rcu rereads the ->next pointer.
249 *
250 * Rereading the ->next pointer is not a problem for list_empty() and
251 * list_first_entry() because they would be protected by a lock that blocks
252 * writers.
253 *
254 * See list_first_or_null_rcu for an alternative.
255 */
256
257/**
258 * list_first_or_null_rcu - get the first element from a list
232 * @ptr: the list head to take the element from. 259 * @ptr: the list head to take the element from.
233 * @type: the type of the struct this is embedded in. 260 * @type: the type of the struct this is embedded in.
234 * @member: the name of the list_struct within the struct. 261 * @member: the name of the list_struct within the struct.
235 * 262 *
236 * Note, that list is expected to be not empty. 263 * Note that if the list is empty, it returns NULL.
237 * 264 *
238 * This primitive may safely run concurrently with the _rcu list-mutation 265 * This primitive may safely run concurrently with the _rcu list-mutation
239 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). 266 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
240 */ 267 */
241#define list_first_entry_rcu(ptr, type, member) \ 268#define list_first_or_null_rcu(ptr, type, member) \
242 list_entry_rcu((ptr)->next, type, member) 269 ({struct list_head *__ptr = (ptr); \
270 struct list_head __rcu *__next = list_next_rcu(__ptr); \
271 likely(__ptr != __next) ? container_of(__next, type, member) : NULL; \
272 })
243 273
244/** 274/**
245 * list_for_each_entry_rcu - iterate over rcu list of given type 275 * list_for_each_entry_rcu - iterate over rcu list of given type
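
The comment block above explains why list_first_entry_rcu() was dropped: a reader must obtain both the emptiness check and the first entry from a single fetch of ->next. A brief sketch of a reader using the new list_first_or_null_rcu() under rcu_read_lock() is shown below; struct foo and foo_list are illustrative names, not part of this patch.

#include <linux/rculist.h>
#include <linux/rcupdate.h>

struct foo {
	struct list_head list;
	int data;
};

static LIST_HEAD(foo_list);	/* updates serialized by the writer's lock */

static int foo_first_data(void)
{
	struct foo *first;
	int ret = -1;

	rcu_read_lock();
	/* One fetch of ->next: either a valid first entry or NULL. */
	first = list_first_or_null_rcu(&foo_list, struct foo, list);
	if (first)
		ret = first->data;
	rcu_read_unlock();
	return ret;
}
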
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 20fb776a1d4a..26d1a47591f1 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -184,12 +184,14 @@ static inline int rcu_preempt_depth(void)
184/* Internal to kernel */ 184/* Internal to kernel */
185extern void rcu_sched_qs(int cpu); 185extern void rcu_sched_qs(int cpu);
186extern void rcu_bh_qs(int cpu); 186extern void rcu_bh_qs(int cpu);
187extern void rcu_preempt_note_context_switch(void);
187extern void rcu_check_callbacks(int cpu, int user); 188extern void rcu_check_callbacks(int cpu, int user);
188struct notifier_block; 189struct notifier_block;
189extern void rcu_idle_enter(void); 190extern void rcu_idle_enter(void);
190extern void rcu_idle_exit(void); 191extern void rcu_idle_exit(void);
191extern void rcu_irq_enter(void); 192extern void rcu_irq_enter(void);
192extern void rcu_irq_exit(void); 193extern void rcu_irq_exit(void);
194extern void exit_rcu(void);
193 195
194/** 196/**
195 * RCU_NONIDLE - Indicate idle-loop code that needs RCU readers 197 * RCU_NONIDLE - Indicate idle-loop code that needs RCU readers
@@ -922,6 +924,21 @@ void __kfree_rcu(struct rcu_head *head, unsigned long offset)
922 kfree_call_rcu(head, (rcu_callback)offset); 924 kfree_call_rcu(head, (rcu_callback)offset);
923} 925}
924 926
927/*
928 * Does the specified offset indicate that the corresponding rcu_head
929 * structure can be handled by kfree_rcu()?
930 */
931#define __is_kfree_rcu_offset(offset) ((offset) < 4096)
932
933/*
934 * Helper macro for kfree_rcu() to prevent argument-expansion eyestrain.
935 */
936#define __kfree_rcu(head, offset) \
937 do { \
938 BUILD_BUG_ON(!__is_kfree_rcu_offset(offset)); \
939 call_rcu(head, (void (*)(struct rcu_head *))(unsigned long)(offset)); \
940 } while (0)
941
925/** 942/**
926 * kfree_rcu() - kfree an object after a grace period. 943 * kfree_rcu() - kfree an object after a grace period.
927 * @ptr: pointer to kfree 944 * @ptr: pointer to kfree
@@ -944,6 +961,9 @@ void __kfree_rcu(struct rcu_head *head, unsigned long offset)
944 * 961 *
945 * Note that the allowable offset might decrease in the future, for example, 962 * Note that the allowable offset might decrease in the future, for example,
946 * to allow something like kmem_cache_free_rcu(). 963 * to allow something like kmem_cache_free_rcu().
964 *
965 * The BUILD_BUG_ON check must not involve any function calls, hence the
966 * checks are done in macros here.
947 */ 967 */
948#define kfree_rcu(ptr, rcu_head) \ 968#define kfree_rcu(ptr, rcu_head) \
949 __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head)) 969 __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head))
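
As the added comment notes, __kfree_rcu() smuggles the rcu_head offset through the callback-pointer argument, and the BUILD_BUG_ON() rejects offsets of 4096 or more at compile time. A hedged sketch of the resulting caller-side idiom follows; struct foo and foo_replace() are illustrative, and the update-side locking is assumed rather than shown.

#include <linux/slab.h>
#include <linux/rcupdate.h>

struct foo {
	int data;
	struct rcu_head rcu;	/* must sit within the first 4096 bytes */
};

static struct foo __rcu *global_foo;	/* writers hold an update-side lock */

static void foo_replace(struct foo *newp)
{
	struct foo *oldp;

	oldp = rcu_dereference_protected(global_foo, 1);
	rcu_assign_pointer(global_foo, newp);
	if (oldp)
		kfree_rcu(oldp, rcu);	/* no hand-written reclaim callback needed */
}
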
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index e93df77176d1..adb5e5a38cae 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -87,14 +87,6 @@ static inline void kfree_call_rcu(struct rcu_head *head,
87 87
88#ifdef CONFIG_TINY_RCU 88#ifdef CONFIG_TINY_RCU
89 89
90static inline void rcu_preempt_note_context_switch(void)
91{
92}
93
94static inline void exit_rcu(void)
95{
96}
97
98static inline int rcu_needs_cpu(int cpu) 90static inline int rcu_needs_cpu(int cpu)
99{ 91{
100 return 0; 92 return 0;
@@ -102,8 +94,6 @@ static inline int rcu_needs_cpu(int cpu)
102 94
103#else /* #ifdef CONFIG_TINY_RCU */ 95#else /* #ifdef CONFIG_TINY_RCU */
104 96
105void rcu_preempt_note_context_switch(void);
106extern void exit_rcu(void);
107int rcu_preempt_needs_cpu(void); 97int rcu_preempt_needs_cpu(void);
108 98
109static inline int rcu_needs_cpu(int cpu) 99static inline int rcu_needs_cpu(int cpu)
@@ -116,7 +106,6 @@ static inline int rcu_needs_cpu(int cpu)
116static inline void rcu_note_context_switch(int cpu) 106static inline void rcu_note_context_switch(int cpu)
117{ 107{
118 rcu_sched_qs(cpu); 108 rcu_sched_qs(cpu);
119 rcu_preempt_note_context_switch();
120} 109}
121 110
122/* 111/*
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index e8ee5dd0854c..3c6083cde4fc 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -45,18 +45,6 @@ static inline void rcu_virt_note_context_switch(int cpu)
45 rcu_note_context_switch(cpu); 45 rcu_note_context_switch(cpu);
46} 46}
47 47
48#ifdef CONFIG_TREE_PREEMPT_RCU
49
50extern void exit_rcu(void);
51
52#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
53
54static inline void exit_rcu(void)
55{
56}
57
58#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
59
60extern void synchronize_rcu_bh(void); 48extern void synchronize_rcu_bh(void);
61extern void synchronize_sched_expedited(void); 49extern void synchronize_sched_expedited(void);
62extern void synchronize_rcu_expedited(void); 50extern void synchronize_rcu_expedited(void);
@@ -98,13 +86,6 @@ extern void rcu_force_quiescent_state(void);
98extern void rcu_bh_force_quiescent_state(void); 86extern void rcu_bh_force_quiescent_state(void);
99extern void rcu_sched_force_quiescent_state(void); 87extern void rcu_sched_force_quiescent_state(void);
100 88
101/* A context switch is a grace period for RCU-sched and RCU-bh. */
102static inline int rcu_blocking_is_gp(void)
103{
104 might_sleep(); /* Check for RCU read-side critical section. */
105 return num_online_cpus() == 1;
106}
107
108extern void rcu_scheduler_starting(void); 89extern void rcu_scheduler_starting(void);
109extern int rcu_scheduler_active __read_mostly; 90extern int rcu_scheduler_active __read_mostly;
110 91
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 81a173c0897d..8f3fd945070f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1905,12 +1905,22 @@ static inline void rcu_copy_process(struct task_struct *p)
1905 INIT_LIST_HEAD(&p->rcu_node_entry); 1905 INIT_LIST_HEAD(&p->rcu_node_entry);
1906} 1906}
1907 1907
1908static inline void rcu_switch_from(struct task_struct *prev)
1909{
1910 if (prev->rcu_read_lock_nesting != 0)
1911 rcu_preempt_note_context_switch();
1912}
1913
1908#else 1914#else
1909 1915
1910static inline void rcu_copy_process(struct task_struct *p) 1916static inline void rcu_copy_process(struct task_struct *p)
1911{ 1917{
1912} 1918}
1913 1919
1920static inline void rcu_switch_from(struct task_struct *prev)
1921{
1922}
1923
1914#endif 1924#endif
1915 1925
1916#ifdef CONFIG_SMP 1926#ifdef CONFIG_SMP
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index d3d5fa54f25e..55a5c52cbb25 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -29,26 +29,35 @@
29 29
30#include <linux/mutex.h> 30#include <linux/mutex.h>
31#include <linux/rcupdate.h> 31#include <linux/rcupdate.h>
32#include <linux/workqueue.h>
32 33
33struct srcu_struct_array { 34struct srcu_struct_array {
34 int c[2]; 35 unsigned long c[2];
36 unsigned long seq[2];
37};
38
39struct rcu_batch {
40 struct rcu_head *head, **tail;
35}; 41};
36 42
37struct srcu_struct { 43struct srcu_struct {
38 int completed; 44 unsigned completed;
39 struct srcu_struct_array __percpu *per_cpu_ref; 45 struct srcu_struct_array __percpu *per_cpu_ref;
40 struct mutex mutex; 46 spinlock_t queue_lock; /* protect ->batch_queue, ->running */
47 bool running;
48 /* callbacks just queued */
49 struct rcu_batch batch_queue;
50 /* callbacks try to do the first check_zero */
51 struct rcu_batch batch_check0;
52 /* callbacks done with the first check_zero and the flip */
53 struct rcu_batch batch_check1;
54 struct rcu_batch batch_done;
55 struct delayed_work work;
41#ifdef CONFIG_DEBUG_LOCK_ALLOC 56#ifdef CONFIG_DEBUG_LOCK_ALLOC
42 struct lockdep_map dep_map; 57 struct lockdep_map dep_map;
43#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 58#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
44}; 59};
45 60
46#ifndef CONFIG_PREEMPT
47#define srcu_barrier() barrier()
48#else /* #ifndef CONFIG_PREEMPT */
49#define srcu_barrier()
50#endif /* #else #ifndef CONFIG_PREEMPT */
51
52#ifdef CONFIG_DEBUG_LOCK_ALLOC 61#ifdef CONFIG_DEBUG_LOCK_ALLOC
53 62
54int __init_srcu_struct(struct srcu_struct *sp, const char *name, 63int __init_srcu_struct(struct srcu_struct *sp, const char *name,
@@ -67,12 +76,33 @@ int init_srcu_struct(struct srcu_struct *sp);
67 76
68#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 77#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
69 78
79/**
80 * call_srcu() - Queue a callback for invocation after an SRCU grace period
 81 * @sp: srcu_struct on which to queue the callback
82 * @head: structure to be used for queueing the SRCU callback.
83 * @func: function to be invoked after the SRCU grace period
84 *
85 * The callback function will be invoked some time after a full SRCU
86 * grace period elapses, in other words after all pre-existing SRCU
87 * read-side critical sections have completed. However, the callback
88 * function might well execute concurrently with other SRCU read-side
89 * critical sections that started after call_srcu() was invoked. SRCU
90 * read-side critical sections are delimited by srcu_read_lock() and
91 * srcu_read_unlock(), and may be nested.
92 *
93 * The callback will be invoked from process context, but must nevertheless
94 * be fast and must not block.
95 */
96void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
97 void (*func)(struct rcu_head *head));
98
70void cleanup_srcu_struct(struct srcu_struct *sp); 99void cleanup_srcu_struct(struct srcu_struct *sp);
71int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp); 100int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp);
72void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp); 101void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp);
73void synchronize_srcu(struct srcu_struct *sp); 102void synchronize_srcu(struct srcu_struct *sp);
74void synchronize_srcu_expedited(struct srcu_struct *sp); 103void synchronize_srcu_expedited(struct srcu_struct *sp);
75long srcu_batches_completed(struct srcu_struct *sp); 104long srcu_batches_completed(struct srcu_struct *sp);
105void srcu_barrier(struct srcu_struct *sp);
76 106
77#ifdef CONFIG_DEBUG_LOCK_ALLOC 107#ifdef CONFIG_DEBUG_LOCK_ALLOC
78 108
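
The newly exported call_srcu() and per-srcu_struct srcu_barrier() give SRCU the same asynchronous post-and-flush pattern that call_rcu()/rcu_barrier() provide. A minimal sketch of a user is below, assuming the srcu_struct was set up earlier with init_srcu_struct(); the foo_* names are illustrative.

#include <linux/slab.h>
#include <linux/srcu.h>

struct foo {
	struct rcu_head rcu;
	int data;
};

static struct srcu_struct foo_srcu;	/* init_srcu_struct(&foo_srcu) at init time */

static void foo_reclaim(struct rcu_head *head)
{
	/* Runs from process context once pre-existing SRCU readers are done. */
	kfree(container_of(head, struct foo, rcu));
}

static void foo_retire(struct foo *fp)
{
	call_srcu(&foo_srcu, &fp->rcu, foo_reclaim);
}

static void foo_teardown(void)
{
	srcu_barrier(&foo_srcu);	/* wait for all queued foo_reclaim() callbacks */
	cleanup_srcu_struct(&foo_srcu);
}
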
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 337099783f37..1480900c511c 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -292,6 +292,8 @@ TRACE_EVENT(rcu_dyntick,
292 * "More callbacks": Still more callbacks, try again to clear them out. 292 * "More callbacks": Still more callbacks, try again to clear them out.
293 * "Callbacks drained": All callbacks processed, off to dyntick idle! 293 * "Callbacks drained": All callbacks processed, off to dyntick idle!
294 * "Timer": Timer fired to cause CPU to continue processing callbacks. 294 * "Timer": Timer fired to cause CPU to continue processing callbacks.
295 * "Demigrate": Timer fired on wrong CPU, woke up correct CPU.
296 * "Cleanup after idle": Idle exited, timer canceled.
295 */ 297 */
296TRACE_EVENT(rcu_prep_idle, 298TRACE_EVENT(rcu_prep_idle,
297 299
diff --git a/init/Kconfig b/init/Kconfig
index 6cfd71d06463..6d18ef8071b5 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -458,6 +458,33 @@ config RCU_FANOUT
458 Select a specific number if testing RCU itself. 458 Select a specific number if testing RCU itself.
459 Take the default if unsure. 459 Take the default if unsure.
460 460
461config RCU_FANOUT_LEAF
462 int "Tree-based hierarchical RCU leaf-level fanout value"
463 range 2 RCU_FANOUT if 64BIT
464 range 2 RCU_FANOUT if !64BIT
465 depends on TREE_RCU || TREE_PREEMPT_RCU
466 default 16
467 help
468 This option controls the leaf-level fanout of hierarchical
469 implementations of RCU, and allows trading off cache misses
470 against lock contention. Systems that synchronize their
471 scheduling-clock interrupts for energy-efficiency reasons will
472 want the default because the smaller leaf-level fanout keeps
473 lock contention levels acceptably low. Very large systems
474 (hundreds or thousands of CPUs) will instead want to set this
475 value to the maximum value possible in order to reduce the
476 number of cache misses incurred during RCU's grace-period
 477	  initialization. These systems tend to run CPU-bound, so they
 478	  are not helped by synchronized interrupts and therefore tend
 479	  to skew them; this reduces lock contention enough that large
 480	  leaf-level fanouts work well.
481
482 Select a specific number if testing RCU itself.
483
484 Select the maximum permissible value for large systems.
485
486 Take the default if unsure.
487
461config RCU_FANOUT_EXACT 488config RCU_FANOUT_EXACT
462 bool "Disable tree-based hierarchical RCU auto-balancing" 489 bool "Disable tree-based hierarchical RCU auto-balancing"
463 depends on TREE_RCU || TREE_PREEMPT_RCU 490 depends on TREE_RCU || TREE_PREEMPT_RCU
@@ -515,10 +542,25 @@ config RCU_BOOST_PRIO
515 depends on RCU_BOOST 542 depends on RCU_BOOST
516 default 1 543 default 1
517 help 544 help
518 This option specifies the real-time priority to which preempted 545 This option specifies the real-time priority to which long-term
519 RCU readers are to be boosted. If you are working with CPU-bound 546 preempted RCU readers are to be boosted. If you are working
520 real-time applications, you should specify a priority higher then 547 with a real-time application that has one or more CPU-bound
521 the highest-priority CPU-bound application. 548 threads running at a real-time priority level, you should set
549 RCU_BOOST_PRIO to a priority higher then the highest-priority
550 real-time CPU-bound thread. The default RCU_BOOST_PRIO value
551 of 1 is appropriate in the common case, which is real-time
552 applications that do not have any CPU-bound threads.
553
554 Some real-time applications might not have a single real-time
555 thread that saturates a given CPU, but instead might have
556 multiple real-time threads that, taken together, fully utilize
557 that CPU. In this case, you should set RCU_BOOST_PRIO to
558 a priority higher than the lowest-priority thread that is
559 conspiring to prevent the CPU from running any non-real-time
560 tasks. For example, if one thread at priority 10 and another
561 thread at priority 5 are between themselves fully consuming
562 the CPU time on a given CPU, then RCU_BOOST_PRIO should be
563 set to priority 6 or higher.
522 564
523 Specify the real-time priority, or take the default if unsure. 565 Specify the real-time priority, or take the default if unsure.
524 566
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a86f1741cc27..95cba41ce1e9 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -51,6 +51,34 @@
51 51
52#include "rcu.h" 52#include "rcu.h"
53 53
54#ifdef CONFIG_PREEMPT_RCU
55
56/*
57 * Check for a task exiting while in a preemptible-RCU read-side
58 * critical section, clean up if so. No need to issue warnings,
59 * as debug_check_no_locks_held() already does this if lockdep
60 * is enabled.
61 */
62void exit_rcu(void)
63{
64 struct task_struct *t = current;
65
66 if (likely(list_empty(&current->rcu_node_entry)))
67 return;
68 t->rcu_read_lock_nesting = 1;
69 barrier();
70 t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
71 __rcu_read_unlock();
72}
73
74#else /* #ifdef CONFIG_PREEMPT_RCU */
75
76void exit_rcu(void)
77{
78}
79
80#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
81
54#ifdef CONFIG_DEBUG_LOCK_ALLOC 82#ifdef CONFIG_DEBUG_LOCK_ALLOC
55static struct lock_class_key rcu_lock_key; 83static struct lock_class_key rcu_lock_key;
56struct lockdep_map rcu_lock_map = 84struct lockdep_map rcu_lock_map =
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 22ecea0dfb62..fc31a2d65100 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -851,22 +851,6 @@ int rcu_preempt_needs_cpu(void)
851 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; 851 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
852} 852}
853 853
854/*
855 * Check for a task exiting while in a preemptible -RCU read-side
856 * critical section, clean up if so. No need to issue warnings,
857 * as debug_check_no_locks_held() already does this if lockdep
858 * is enabled.
859 */
860void exit_rcu(void)
861{
862 struct task_struct *t = current;
863
864 if (t->rcu_read_lock_nesting == 0)
865 return;
866 t->rcu_read_lock_nesting = 1;
867 __rcu_read_unlock();
868}
869
870#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 854#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
871 855
872#ifdef CONFIG_RCU_TRACE 856#ifdef CONFIG_RCU_TRACE
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index a89b381a8c6e..e66b34ab7555 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -64,6 +64,7 @@ static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ 64static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff; /* Hold time within burst (us). */ 65static int fqs_holdoff; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 66static int fqs_stutter = 3; /* Wait time between bursts (s). */
67static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */
67static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ 68static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
68static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ 69static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */
69static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ 70static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
@@ -96,6 +97,8 @@ module_param(fqs_holdoff, int, 0444);
96MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 97MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
97module_param(fqs_stutter, int, 0444); 98module_param(fqs_stutter, int, 0444);
98MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 99MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
100module_param(n_barrier_cbs, int, 0444);
101MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing");
99module_param(onoff_interval, int, 0444); 102module_param(onoff_interval, int, 0444);
100MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); 103MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
101module_param(onoff_holdoff, int, 0444); 104module_param(onoff_holdoff, int, 0444);
@@ -139,6 +142,8 @@ static struct task_struct *shutdown_task;
139static struct task_struct *onoff_task; 142static struct task_struct *onoff_task;
140#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 143#endif /* #ifdef CONFIG_HOTPLUG_CPU */
141static struct task_struct *stall_task; 144static struct task_struct *stall_task;
145static struct task_struct **barrier_cbs_tasks;
146static struct task_struct *barrier_task;
142 147
143#define RCU_TORTURE_PIPE_LEN 10 148#define RCU_TORTURE_PIPE_LEN 10
144 149
@@ -164,6 +169,7 @@ static atomic_t n_rcu_torture_alloc_fail;
164static atomic_t n_rcu_torture_free; 169static atomic_t n_rcu_torture_free;
165static atomic_t n_rcu_torture_mberror; 170static atomic_t n_rcu_torture_mberror;
166static atomic_t n_rcu_torture_error; 171static atomic_t n_rcu_torture_error;
172static long n_rcu_torture_barrier_error;
167static long n_rcu_torture_boost_ktrerror; 173static long n_rcu_torture_boost_ktrerror;
168static long n_rcu_torture_boost_rterror; 174static long n_rcu_torture_boost_rterror;
169static long n_rcu_torture_boost_failure; 175static long n_rcu_torture_boost_failure;
@@ -173,6 +179,8 @@ static long n_offline_attempts;
173static long n_offline_successes; 179static long n_offline_successes;
174static long n_online_attempts; 180static long n_online_attempts;
175static long n_online_successes; 181static long n_online_successes;
182static long n_barrier_attempts;
183static long n_barrier_successes;
176static struct list_head rcu_torture_removed; 184static struct list_head rcu_torture_removed;
177static cpumask_var_t shuffle_tmp_mask; 185static cpumask_var_t shuffle_tmp_mask;
178 186
@@ -197,6 +205,10 @@ static unsigned long shutdown_time; /* jiffies to system shutdown. */
197static unsigned long boost_starttime; /* jiffies of next boost test start. */ 205static unsigned long boost_starttime; /* jiffies of next boost test start. */
198DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 206DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
199 /* and boost task create/destroy. */ 207 /* and boost task create/destroy. */
208static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */
209static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
210static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
211static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
200 212
201/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ 213/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
202 214
@@ -327,6 +339,7 @@ struct rcu_torture_ops {
327 int (*completed)(void); 339 int (*completed)(void);
328 void (*deferred_free)(struct rcu_torture *p); 340 void (*deferred_free)(struct rcu_torture *p);
329 void (*sync)(void); 341 void (*sync)(void);
342 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
330 void (*cb_barrier)(void); 343 void (*cb_barrier)(void);
331 void (*fqs)(void); 344 void (*fqs)(void);
332 int (*stats)(char *page); 345 int (*stats)(char *page);
@@ -417,6 +430,7 @@ static struct rcu_torture_ops rcu_ops = {
417 .completed = rcu_torture_completed, 430 .completed = rcu_torture_completed,
418 .deferred_free = rcu_torture_deferred_free, 431 .deferred_free = rcu_torture_deferred_free,
419 .sync = synchronize_rcu, 432 .sync = synchronize_rcu,
433 .call = call_rcu,
420 .cb_barrier = rcu_barrier, 434 .cb_barrier = rcu_barrier,
421 .fqs = rcu_force_quiescent_state, 435 .fqs = rcu_force_quiescent_state,
422 .stats = NULL, 436 .stats = NULL,
@@ -460,6 +474,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
460 .completed = rcu_torture_completed, 474 .completed = rcu_torture_completed,
461 .deferred_free = rcu_sync_torture_deferred_free, 475 .deferred_free = rcu_sync_torture_deferred_free,
462 .sync = synchronize_rcu, 476 .sync = synchronize_rcu,
477 .call = NULL,
463 .cb_barrier = NULL, 478 .cb_barrier = NULL,
464 .fqs = rcu_force_quiescent_state, 479 .fqs = rcu_force_quiescent_state,
465 .stats = NULL, 480 .stats = NULL,
@@ -477,6 +492,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
477 .completed = rcu_no_completed, 492 .completed = rcu_no_completed,
478 .deferred_free = rcu_sync_torture_deferred_free, 493 .deferred_free = rcu_sync_torture_deferred_free,
479 .sync = synchronize_rcu_expedited, 494 .sync = synchronize_rcu_expedited,
495 .call = NULL,
480 .cb_barrier = NULL, 496 .cb_barrier = NULL,
481 .fqs = rcu_force_quiescent_state, 497 .fqs = rcu_force_quiescent_state,
482 .stats = NULL, 498 .stats = NULL,
@@ -519,6 +535,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
519 .completed = rcu_bh_torture_completed, 535 .completed = rcu_bh_torture_completed,
520 .deferred_free = rcu_bh_torture_deferred_free, 536 .deferred_free = rcu_bh_torture_deferred_free,
521 .sync = synchronize_rcu_bh, 537 .sync = synchronize_rcu_bh,
538 .call = call_rcu_bh,
522 .cb_barrier = rcu_barrier_bh, 539 .cb_barrier = rcu_barrier_bh,
523 .fqs = rcu_bh_force_quiescent_state, 540 .fqs = rcu_bh_force_quiescent_state,
524 .stats = NULL, 541 .stats = NULL,
@@ -535,6 +552,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
535 .completed = rcu_bh_torture_completed, 552 .completed = rcu_bh_torture_completed,
536 .deferred_free = rcu_sync_torture_deferred_free, 553 .deferred_free = rcu_sync_torture_deferred_free,
537 .sync = synchronize_rcu_bh, 554 .sync = synchronize_rcu_bh,
555 .call = NULL,
538 .cb_barrier = NULL, 556 .cb_barrier = NULL,
539 .fqs = rcu_bh_force_quiescent_state, 557 .fqs = rcu_bh_force_quiescent_state,
540 .stats = NULL, 558 .stats = NULL,
@@ -551,6 +569,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = {
551 .completed = rcu_bh_torture_completed, 569 .completed = rcu_bh_torture_completed,
552 .deferred_free = rcu_sync_torture_deferred_free, 570 .deferred_free = rcu_sync_torture_deferred_free,
553 .sync = synchronize_rcu_bh_expedited, 571 .sync = synchronize_rcu_bh_expedited,
572 .call = NULL,
554 .cb_barrier = NULL, 573 .cb_barrier = NULL,
555 .fqs = rcu_bh_force_quiescent_state, 574 .fqs = rcu_bh_force_quiescent_state,
556 .stats = NULL, 575 .stats = NULL,
@@ -606,6 +625,11 @@ static int srcu_torture_completed(void)
606 return srcu_batches_completed(&srcu_ctl); 625 return srcu_batches_completed(&srcu_ctl);
607} 626}
608 627
628static void srcu_torture_deferred_free(struct rcu_torture *rp)
629{
630 call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb);
631}
632
609static void srcu_torture_synchronize(void) 633static void srcu_torture_synchronize(void)
610{ 634{
611 synchronize_srcu(&srcu_ctl); 635 synchronize_srcu(&srcu_ctl);
@@ -620,7 +644,7 @@ static int srcu_torture_stats(char *page)
620 cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", 644 cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):",
621 torture_type, TORTURE_FLAG, idx); 645 torture_type, TORTURE_FLAG, idx);
622 for_each_possible_cpu(cpu) { 646 for_each_possible_cpu(cpu) {
623 cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, 647 cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu,
624 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], 648 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx],
625 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); 649 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]);
626 } 650 }
@@ -635,13 +659,29 @@ static struct rcu_torture_ops srcu_ops = {
635 .read_delay = srcu_read_delay, 659 .read_delay = srcu_read_delay,
636 .readunlock = srcu_torture_read_unlock, 660 .readunlock = srcu_torture_read_unlock,
637 .completed = srcu_torture_completed, 661 .completed = srcu_torture_completed,
638 .deferred_free = rcu_sync_torture_deferred_free, 662 .deferred_free = srcu_torture_deferred_free,
639 .sync = srcu_torture_synchronize, 663 .sync = srcu_torture_synchronize,
664 .call = NULL,
640 .cb_barrier = NULL, 665 .cb_barrier = NULL,
641 .stats = srcu_torture_stats, 666 .stats = srcu_torture_stats,
642 .name = "srcu" 667 .name = "srcu"
643}; 668};
644 669
670static struct rcu_torture_ops srcu_sync_ops = {
671 .init = srcu_torture_init,
672 .cleanup = srcu_torture_cleanup,
673 .readlock = srcu_torture_read_lock,
674 .read_delay = srcu_read_delay,
675 .readunlock = srcu_torture_read_unlock,
676 .completed = srcu_torture_completed,
677 .deferred_free = rcu_sync_torture_deferred_free,
678 .sync = srcu_torture_synchronize,
679 .call = NULL,
680 .cb_barrier = NULL,
681 .stats = srcu_torture_stats,
682 .name = "srcu_sync"
683};
684
645static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) 685static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
646{ 686{
647 return srcu_read_lock_raw(&srcu_ctl); 687 return srcu_read_lock_raw(&srcu_ctl);
@@ -659,13 +699,29 @@ static struct rcu_torture_ops srcu_raw_ops = {
659 .read_delay = srcu_read_delay, 699 .read_delay = srcu_read_delay,
660 .readunlock = srcu_torture_read_unlock_raw, 700 .readunlock = srcu_torture_read_unlock_raw,
661 .completed = srcu_torture_completed, 701 .completed = srcu_torture_completed,
662 .deferred_free = rcu_sync_torture_deferred_free, 702 .deferred_free = srcu_torture_deferred_free,
663 .sync = srcu_torture_synchronize, 703 .sync = srcu_torture_synchronize,
704 .call = NULL,
664 .cb_barrier = NULL, 705 .cb_barrier = NULL,
665 .stats = srcu_torture_stats, 706 .stats = srcu_torture_stats,
666 .name = "srcu_raw" 707 .name = "srcu_raw"
667}; 708};
668 709
710static struct rcu_torture_ops srcu_raw_sync_ops = {
711 .init = srcu_torture_init,
712 .cleanup = srcu_torture_cleanup,
713 .readlock = srcu_torture_read_lock_raw,
714 .read_delay = srcu_read_delay,
715 .readunlock = srcu_torture_read_unlock_raw,
716 .completed = srcu_torture_completed,
717 .deferred_free = rcu_sync_torture_deferred_free,
718 .sync = srcu_torture_synchronize,
719 .call = NULL,
720 .cb_barrier = NULL,
721 .stats = srcu_torture_stats,
722 .name = "srcu_raw_sync"
723};
724
669static void srcu_torture_synchronize_expedited(void) 725static void srcu_torture_synchronize_expedited(void)
670{ 726{
671 synchronize_srcu_expedited(&srcu_ctl); 727 synchronize_srcu_expedited(&srcu_ctl);
@@ -680,6 +736,7 @@ static struct rcu_torture_ops srcu_expedited_ops = {
680 .completed = srcu_torture_completed, 736 .completed = srcu_torture_completed,
681 .deferred_free = rcu_sync_torture_deferred_free, 737 .deferred_free = rcu_sync_torture_deferred_free,
682 .sync = srcu_torture_synchronize_expedited, 738 .sync = srcu_torture_synchronize_expedited,
739 .call = NULL,
683 .cb_barrier = NULL, 740 .cb_barrier = NULL,
684 .stats = srcu_torture_stats, 741 .stats = srcu_torture_stats,
685 .name = "srcu_expedited" 742 .name = "srcu_expedited"
@@ -1129,7 +1186,8 @@ rcu_torture_printk(char *page)
1129 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " 1186 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
1130 "rtmbe: %d rtbke: %ld rtbre: %ld " 1187 "rtmbe: %d rtbke: %ld rtbre: %ld "
1131 "rtbf: %ld rtb: %ld nt: %ld " 1188 "rtbf: %ld rtb: %ld nt: %ld "
1132 "onoff: %ld/%ld:%ld/%ld", 1189 "onoff: %ld/%ld:%ld/%ld "
1190 "barrier: %ld/%ld:%ld",
1133 rcu_torture_current, 1191 rcu_torture_current,
1134 rcu_torture_current_version, 1192 rcu_torture_current_version,
1135 list_empty(&rcu_torture_freelist), 1193 list_empty(&rcu_torture_freelist),
@@ -1145,14 +1203,17 @@ rcu_torture_printk(char *page)
1145 n_online_successes, 1203 n_online_successes,
1146 n_online_attempts, 1204 n_online_attempts,
1147 n_offline_successes, 1205 n_offline_successes,
1148 n_offline_attempts); 1206 n_offline_attempts,
1207 n_barrier_successes,
1208 n_barrier_attempts,
1209 n_rcu_torture_barrier_error);
1210 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
1149 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1211 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1212 n_rcu_torture_barrier_error != 0 ||
1150 n_rcu_torture_boost_ktrerror != 0 || 1213 n_rcu_torture_boost_ktrerror != 0 ||
1151 n_rcu_torture_boost_rterror != 0 || 1214 n_rcu_torture_boost_rterror != 0 ||
1152 n_rcu_torture_boost_failure != 0) 1215 n_rcu_torture_boost_failure != 0 ||
1153 cnt += sprintf(&page[cnt], " !!!"); 1216 i > 1) {
1154 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
1155 if (i > 1) {
1156 cnt += sprintf(&page[cnt], "!!! "); 1217 cnt += sprintf(&page[cnt], "!!! ");
1157 atomic_inc(&n_rcu_torture_error); 1218 atomic_inc(&n_rcu_torture_error);
1158 WARN_ON_ONCE(1); 1219 WARN_ON_ONCE(1);
@@ -1337,6 +1398,7 @@ static void rcutorture_booster_cleanup(int cpu)
1337 1398
1338 /* This must be outside of the mutex, otherwise deadlock! */ 1399 /* This must be outside of the mutex, otherwise deadlock! */
1339 kthread_stop(t); 1400 kthread_stop(t);
1401 boost_tasks[cpu] = NULL;
1340} 1402}
1341 1403
1342static int rcutorture_booster_init(int cpu) 1404static int rcutorture_booster_init(int cpu)
@@ -1484,13 +1546,15 @@ static void rcu_torture_onoff_cleanup(void)
1484 return; 1546 return;
1485 VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); 1547 VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
1486 kthread_stop(onoff_task); 1548 kthread_stop(onoff_task);
1549 onoff_task = NULL;
1487} 1550}
1488 1551
1489#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1552#else /* #ifdef CONFIG_HOTPLUG_CPU */
1490 1553
1491static void 1554static int
1492rcu_torture_onoff_init(void) 1555rcu_torture_onoff_init(void)
1493{ 1556{
1557 return 0;
1494} 1558}
1495 1559
1496static void rcu_torture_onoff_cleanup(void) 1560static void rcu_torture_onoff_cleanup(void)
@@ -1554,6 +1618,152 @@ static void rcu_torture_stall_cleanup(void)
1554 return; 1618 return;
1555 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); 1619 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task.");
1556 kthread_stop(stall_task); 1620 kthread_stop(stall_task);
1621 stall_task = NULL;
1622}
1623
1624/* Callback function for RCU barrier testing. */
1625void rcu_torture_barrier_cbf(struct rcu_head *rcu)
1626{
1627 atomic_inc(&barrier_cbs_invoked);
1628}
1629
1630/* kthread function to register callbacks used to test RCU barriers. */
1631static int rcu_torture_barrier_cbs(void *arg)
1632{
1633 long myid = (long)arg;
1634 struct rcu_head rcu;
1635
1636 init_rcu_head_on_stack(&rcu);
1637 VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started");
1638 set_user_nice(current, 19);
1639 do {
1640 wait_event(barrier_cbs_wq[myid],
1641 atomic_read(&barrier_cbs_count) == n_barrier_cbs ||
1642 kthread_should_stop() ||
1643 fullstop != FULLSTOP_DONTSTOP);
1644 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1645 break;
1646 cur_ops->call(&rcu, rcu_torture_barrier_cbf);
1647 if (atomic_dec_and_test(&barrier_cbs_count))
1648 wake_up(&barrier_wq);
1649 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1650 VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping");
1651 rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
1652 while (!kthread_should_stop())
1653 schedule_timeout_interruptible(1);
1654 cur_ops->cb_barrier();
1655 destroy_rcu_head_on_stack(&rcu);
1656 return 0;
1657}
1658
1659/* kthread function to drive and coordinate RCU barrier testing. */
1660static int rcu_torture_barrier(void *arg)
1661{
1662 int i;
1663
1664 VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting");
1665 do {
1666 atomic_set(&barrier_cbs_invoked, 0);
1667 atomic_set(&barrier_cbs_count, n_barrier_cbs);
1668 /* wake_up() path contains the required barriers. */
1669 for (i = 0; i < n_barrier_cbs; i++)
1670 wake_up(&barrier_cbs_wq[i]);
1671 wait_event(barrier_wq,
1672 atomic_read(&barrier_cbs_count) == 0 ||
1673 kthread_should_stop() ||
1674 fullstop != FULLSTOP_DONTSTOP);
1675 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1676 break;
1677 n_barrier_attempts++;
1678 cur_ops->cb_barrier();
1679 if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
1680 n_rcu_torture_barrier_error++;
1681 WARN_ON_ONCE(1);
1682 }
1683 n_barrier_successes++;
1684 schedule_timeout_interruptible(HZ / 10);
1685 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1686 VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping");
1687 rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
1688 while (!kthread_should_stop())
1689 schedule_timeout_interruptible(1);
1690 return 0;
1691}
1692
1693/* Initialize RCU barrier testing. */
1694static int rcu_torture_barrier_init(void)
1695{
1696 int i;
1697 int ret;
1698
1699 if (n_barrier_cbs == 0)
1700 return 0;
1701 if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) {
1702 printk(KERN_ALERT "%s" TORTURE_FLAG
1703 " Call or barrier ops missing for %s,\n",
1704 torture_type, cur_ops->name);
1705 printk(KERN_ALERT "%s" TORTURE_FLAG
1706 " RCU barrier testing omitted from run.\n",
1707 torture_type);
1708 return 0;
1709 }
1710 atomic_set(&barrier_cbs_count, 0);
1711 atomic_set(&barrier_cbs_invoked, 0);
1712 barrier_cbs_tasks =
1713 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]),
1714 GFP_KERNEL);
1715 barrier_cbs_wq =
1716 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
1717 GFP_KERNEL);
1718 if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0)
1719 return -ENOMEM;
1720 for (i = 0; i < n_barrier_cbs; i++) {
1721 init_waitqueue_head(&barrier_cbs_wq[i]);
1722 barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs,
1723 (void *)(long)i,
1724 "rcu_torture_barrier_cbs");
1725 if (IS_ERR(barrier_cbs_tasks[i])) {
1726 ret = PTR_ERR(barrier_cbs_tasks[i]);
1727 VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs");
1728 barrier_cbs_tasks[i] = NULL;
1729 return ret;
1730 }
1731 }
1732 barrier_task = kthread_run(rcu_torture_barrier, NULL,
1733 "rcu_torture_barrier");
1734 if (IS_ERR(barrier_task)) {
1735 ret = PTR_ERR(barrier_task);
1736 VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier");
1737 barrier_task = NULL;
1738 }
1739 return 0;
1740}
1741
1742/* Clean up after RCU barrier testing. */
1743static void rcu_torture_barrier_cleanup(void)
1744{
1745 int i;
1746
1747 if (barrier_task != NULL) {
1748 VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task");
1749 kthread_stop(barrier_task);
1750 barrier_task = NULL;
1751 }
1752 if (barrier_cbs_tasks != NULL) {
1753 for (i = 0; i < n_barrier_cbs; i++) {
1754 if (barrier_cbs_tasks[i] != NULL) {
1755 VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task");
1756 kthread_stop(barrier_cbs_tasks[i]);
1757 barrier_cbs_tasks[i] = NULL;
1758 }
1759 }
1760 kfree(barrier_cbs_tasks);
1761 barrier_cbs_tasks = NULL;
1762 }
1763 if (barrier_cbs_wq != NULL) {
1764 kfree(barrier_cbs_wq);
1765 barrier_cbs_wq = NULL;
1766 }
1557} 1767}
1558 1768
1559static int rcutorture_cpu_notify(struct notifier_block *self, 1769static int rcutorture_cpu_notify(struct notifier_block *self,
@@ -1598,6 +1808,7 @@ rcu_torture_cleanup(void)
1598 fullstop = FULLSTOP_RMMOD; 1808 fullstop = FULLSTOP_RMMOD;
1599 mutex_unlock(&fullstop_mutex); 1809 mutex_unlock(&fullstop_mutex);
1600 unregister_reboot_notifier(&rcutorture_shutdown_nb); 1810 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1811 rcu_torture_barrier_cleanup();
1601 rcu_torture_stall_cleanup(); 1812 rcu_torture_stall_cleanup();
1602 if (stutter_task) { 1813 if (stutter_task) {
1603 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 1814 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
@@ -1665,6 +1876,7 @@ rcu_torture_cleanup(void)
1665 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); 1876 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
1666 kthread_stop(shutdown_task); 1877 kthread_stop(shutdown_task);
1667 } 1878 }
1879 shutdown_task = NULL;
1668 rcu_torture_onoff_cleanup(); 1880 rcu_torture_onoff_cleanup();
1669 1881
1670 /* Wait for all RCU callbacks to fire. */ 1882 /* Wait for all RCU callbacks to fire. */
@@ -1676,7 +1888,7 @@ rcu_torture_cleanup(void)
1676 1888
1677 if (cur_ops->cleanup) 1889 if (cur_ops->cleanup)
1678 cur_ops->cleanup(); 1890 cur_ops->cleanup();
1679 if (atomic_read(&n_rcu_torture_error)) 1891 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
1680 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); 1892 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1681 else if (n_online_successes != n_online_attempts || 1893 else if (n_online_successes != n_online_attempts ||
1682 n_offline_successes != n_offline_attempts) 1894 n_offline_successes != n_offline_attempts)
@@ -1692,10 +1904,12 @@ rcu_torture_init(void)
1692 int i; 1904 int i;
1693 int cpu; 1905 int cpu;
1694 int firsterr = 0; 1906 int firsterr = 0;
1907 int retval;
1695 static struct rcu_torture_ops *torture_ops[] = 1908 static struct rcu_torture_ops *torture_ops[] =
1696 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1909 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1697 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1910 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
1698 &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops, 1911 &srcu_ops, &srcu_sync_ops, &srcu_raw_ops,
1912 &srcu_raw_sync_ops, &srcu_expedited_ops,
1699 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1913 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1700 1914
1701 mutex_lock(&fullstop_mutex); 1915 mutex_lock(&fullstop_mutex);
@@ -1749,6 +1963,7 @@ rcu_torture_init(void)
1749 atomic_set(&n_rcu_torture_free, 0); 1963 atomic_set(&n_rcu_torture_free, 0);
1750 atomic_set(&n_rcu_torture_mberror, 0); 1964 atomic_set(&n_rcu_torture_mberror, 0);
1751 atomic_set(&n_rcu_torture_error, 0); 1965 atomic_set(&n_rcu_torture_error, 0);
1966 n_rcu_torture_barrier_error = 0;
1752 n_rcu_torture_boost_ktrerror = 0; 1967 n_rcu_torture_boost_ktrerror = 0;
1753 n_rcu_torture_boost_rterror = 0; 1968 n_rcu_torture_boost_rterror = 0;
1754 n_rcu_torture_boost_failure = 0; 1969 n_rcu_torture_boost_failure = 0;
@@ -1872,7 +2087,6 @@ rcu_torture_init(void)
1872 test_boost_duration = 2; 2087 test_boost_duration = 2;
1873 if ((test_boost == 1 && cur_ops->can_boost) || 2088 if ((test_boost == 1 && cur_ops->can_boost) ||
1874 test_boost == 2) { 2089 test_boost == 2) {
1875 int retval;
1876 2090
1877 boost_starttime = jiffies + test_boost_interval * HZ; 2091 boost_starttime = jiffies + test_boost_interval * HZ;
1878 register_cpu_notifier(&rcutorture_cpu_nb); 2092 register_cpu_notifier(&rcutorture_cpu_nb);
@@ -1897,9 +2111,22 @@ rcu_torture_init(void)
1897 goto unwind; 2111 goto unwind;
1898 } 2112 }
1899 } 2113 }
1900 rcu_torture_onoff_init(); 2114 i = rcu_torture_onoff_init();
2115 if (i != 0) {
2116 firsterr = i;
2117 goto unwind;
2118 }
1901 register_reboot_notifier(&rcutorture_shutdown_nb); 2119 register_reboot_notifier(&rcutorture_shutdown_nb);
1902 rcu_torture_stall_init(); 2120 i = rcu_torture_stall_init();
2121 if (i != 0) {
2122 firsterr = i;
2123 goto unwind;
2124 }
2125 retval = rcu_torture_barrier_init();
2126 if (retval != 0) {
2127 firsterr = retval;
2128 goto unwind;
2129 }
1903 rcutorture_record_test_transition(); 2130 rcutorture_record_test_transition();
1904 mutex_unlock(&fullstop_mutex); 2131 mutex_unlock(&fullstop_mutex);
1905 return 0; 2132 return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d0c5baf1ab18..0da7b88d92d0 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -75,6 +75,8 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
75 .gpnum = -300, \ 75 .gpnum = -300, \
76 .completed = -300, \ 76 .completed = -300, \
77 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ 77 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
78 .orphan_nxttail = &structname##_state.orphan_nxtlist, \
79 .orphan_donetail = &structname##_state.orphan_donelist, \
78 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ 80 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
79 .n_force_qs = 0, \ 81 .n_force_qs = 0, \
80 .n_force_qs_ngp = 0, \ 82 .n_force_qs_ngp = 0, \
@@ -145,6 +147,13 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
145unsigned long rcutorture_testseq; 147unsigned long rcutorture_testseq;
146unsigned long rcutorture_vernum; 148unsigned long rcutorture_vernum;
147 149
150/* State information for rcu_barrier() and friends. */
151
152static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
153static atomic_t rcu_barrier_cpu_count;
154static DEFINE_MUTEX(rcu_barrier_mutex);
155static struct completion rcu_barrier_completion;
156
148/* 157/*
149 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 158 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
150 * permit this function to be invoked without holding the root rcu_node 159 * permit this function to be invoked without holding the root rcu_node
@@ -192,7 +201,6 @@ void rcu_note_context_switch(int cpu)
192{ 201{
193 trace_rcu_utilization("Start context switch"); 202 trace_rcu_utilization("Start context switch");
194 rcu_sched_qs(cpu); 203 rcu_sched_qs(cpu);
195 rcu_preempt_note_context_switch(cpu);
196 trace_rcu_utilization("End context switch"); 204 trace_rcu_utilization("End context switch");
197} 205}
198EXPORT_SYMBOL_GPL(rcu_note_context_switch); 206EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -1311,95 +1319,133 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1311#ifdef CONFIG_HOTPLUG_CPU 1319#ifdef CONFIG_HOTPLUG_CPU
1312 1320
1313/* 1321/*
1314 * Move a dying CPU's RCU callbacks to online CPU's callback list. 1322 * Send the specified CPU's RCU callbacks to the orphanage. The
1315 * Also record a quiescent state for this CPU for the current grace period. 1323 * specified CPU must be offline, and the caller must hold the
1316 * Synchronization and interrupt disabling are not required because 1324 * ->onofflock.
1317 * this function executes in stop_machine() context. Therefore, cleanup
1318 * operations that might block must be done later from the CPU_DEAD
1319 * notifier.
1320 *
1321 * Note that the outgoing CPU's bit has already been cleared in the
1322 * cpu_online_mask. This allows us to randomly pick a callback
1323 * destination from the bits set in that mask.
1324 */ 1325 */
1325static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 1326static void
1327rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1328 struct rcu_node *rnp, struct rcu_data *rdp)
1326{ 1329{
1327 int i; 1330 int i;
1328 unsigned long mask;
1329 int receive_cpu = cpumask_any(cpu_online_mask);
1330 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1331 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
1332 RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */
1333 1331
1334 /* First, adjust the counts. */ 1332 /*
1333 * Orphan the callbacks. First adjust the counts. This is safe
1334 * because ->onofflock excludes _rcu_barrier()'s adoption of
1335 * the callbacks, thus no memory barrier is required.
1336 */
1335 if (rdp->nxtlist != NULL) { 1337 if (rdp->nxtlist != NULL) {
1336 receive_rdp->qlen_lazy += rdp->qlen_lazy; 1338 rsp->qlen_lazy += rdp->qlen_lazy;
1337 receive_rdp->qlen += rdp->qlen; 1339 rsp->qlen += rdp->qlen;
1340 rdp->n_cbs_orphaned += rdp->qlen;
1338 rdp->qlen_lazy = 0; 1341 rdp->qlen_lazy = 0;
1339 rdp->qlen = 0; 1342 rdp->qlen = 0;
1340 } 1343 }
1341 1344
1342 /* 1345 /*
1343 * Next, move ready-to-invoke callbacks to be invoked on some 1346 * Next, move those callbacks still needing a grace period to
1344 * other CPU. These will not be required to pass through another 1347 * the orphanage, where some other CPU will pick them up.
1345 * grace period: They are done, regardless of CPU. 1348 * Some of the callbacks might have gone partway through a grace
1349 * period, but that is too bad. They get to start over because we
1350 * cannot assume that grace periods are synchronized across CPUs.
1351 * We don't bother updating the ->nxttail[] array yet, instead
1352 * we just reset the whole thing later on.
1346 */ 1353 */
1347 if (rdp->nxtlist != NULL && 1354 if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) {
1348 rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) { 1355 *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL];
1349 struct rcu_head *oldhead; 1356 rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL];
1350 struct rcu_head **oldtail; 1357 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
1351 struct rcu_head **newtail;
1352
1353 oldhead = rdp->nxtlist;
1354 oldtail = receive_rdp->nxttail[RCU_DONE_TAIL];
1355 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1356 *rdp->nxttail[RCU_DONE_TAIL] = *oldtail;
1357 *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead;
1358 newtail = rdp->nxttail[RCU_DONE_TAIL];
1359 for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) {
1360 if (receive_rdp->nxttail[i] == oldtail)
1361 receive_rdp->nxttail[i] = newtail;
1362 if (rdp->nxttail[i] == newtail)
1363 rdp->nxttail[i] = &rdp->nxtlist;
1364 }
1365 } 1358 }
1366 1359
1367 /* 1360 /*
1368 * Finally, put the rest of the callbacks at the end of the list. 1361 * Then move the ready-to-invoke callbacks to the orphanage,
1369 * The ones that made it partway through get to start over: We 1362 * where some other CPU will pick them up. These will not be
1370 * cannot assume that grace periods are synchronized across CPUs. 1363 * required to pass through another grace period: They are done.
1371 * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but
1372 * this does not seem compelling. Not yet, anyway.)
1373 */ 1364 */
1374 if (rdp->nxtlist != NULL) { 1365 if (rdp->nxtlist != NULL) {
1375 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; 1366 *rsp->orphan_donetail = rdp->nxtlist;
1376 receive_rdp->nxttail[RCU_NEXT_TAIL] = 1367 rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
1377 rdp->nxttail[RCU_NEXT_TAIL];
1378 receive_rdp->n_cbs_adopted += rdp->qlen;
1379 rdp->n_cbs_orphaned += rdp->qlen;
1380
1381 rdp->nxtlist = NULL;
1382 for (i = 0; i < RCU_NEXT_SIZE; i++)
1383 rdp->nxttail[i] = &rdp->nxtlist;
1384 } 1368 }
1385 1369
1370 /* Finally, initialize the rcu_data structure's list to empty. */
1371 rdp->nxtlist = NULL;
1372 for (i = 0; i < RCU_NEXT_SIZE; i++)
1373 rdp->nxttail[i] = &rdp->nxtlist;
1374}
1375
1376/*
1377 * Adopt the RCU callbacks from the specified rcu_state structure's
1378 * orphanage. The caller must hold the ->onofflock.
1379 */
1380static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1381{
1382 int i;
1383 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
1384
1386 /* 1385 /*
1387 * Record a quiescent state for the dying CPU. This is safe 1386 * If there is an rcu_barrier() operation in progress, then
1388 * only because we have already cleared out the callbacks. 1387 * only the task doing that operation is permitted to adopt
1389 * (Otherwise, the RCU core might try to schedule the invocation 1388 * callbacks. To do otherwise breaks rcu_barrier() and friends
1390 * of callbacks on this now-offline CPU, which would be bad.) 1389 * by causing them to fail to wait for the callbacks in the
1390 * orphanage.
1391 */ 1391 */
1392 mask = rdp->grpmask; /* rnp->grplo is constant. */ 1392 if (rsp->rcu_barrier_in_progress &&
1393 rsp->rcu_barrier_in_progress != current)
1394 return;
1395
1396 /* Do the accounting first. */
1397 rdp->qlen_lazy += rsp->qlen_lazy;
1398 rdp->qlen += rsp->qlen;
1399 rdp->n_cbs_adopted += rsp->qlen;
1400 rsp->qlen_lazy = 0;
1401 rsp->qlen = 0;
1402
1403 /*
1404 * We do not need a memory barrier here because the only way we
1405 * can get here if there is an rcu_barrier() in flight is if
1406 * we are the task doing the rcu_barrier().
1407 */
1408
1409 /* First adopt the ready-to-invoke callbacks. */
1410 if (rsp->orphan_donelist != NULL) {
1411 *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL];
1412 *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist;
1413 for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--)
1414 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
1415 rdp->nxttail[i] = rsp->orphan_donetail;
1416 rsp->orphan_donelist = NULL;
1417 rsp->orphan_donetail = &rsp->orphan_donelist;
1418 }
1419
1420 /* And then adopt the callbacks that still need a grace period. */
1421 if (rsp->orphan_nxtlist != NULL) {
1422 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist;
1423 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail;
1424 rsp->orphan_nxtlist = NULL;
1425 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
1426 }
1427}
1428
1429/*
1430 * Trace the fact that this CPU is going offline.
1431 */
1432static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1433{
1434 RCU_TRACE(unsigned long mask);
1435 RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
1436 RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
1437
1438 RCU_TRACE(mask = rdp->grpmask);
1393 trace_rcu_grace_period(rsp->name, 1439 trace_rcu_grace_period(rsp->name,
1394 rnp->gpnum + 1 - !!(rnp->qsmask & mask), 1440 rnp->gpnum + 1 - !!(rnp->qsmask & mask),
1395 "cpuofl"); 1441 "cpuofl");
1396 rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum);
1397 /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */
1398} 1442}
1399 1443
1400/* 1444/*
1401 * The CPU has been completely removed, and some other CPU is reporting 1445 * The CPU has been completely removed, and some other CPU is reporting
1402 * this fact from process context. Do the remainder of the cleanup. 1446 * this fact from process context. Do the remainder of the cleanup,
1447 * including orphaning the outgoing CPU's RCU callbacks, and also
1448 * adopting them, if there is no _rcu_barrier() instance running.
1403 * There can only be one CPU hotplug operation at a time, so no other 1449 * There can only be one CPU hotplug operation at a time, so no other
1404 * CPU can be attempting to update rcu_cpu_kthread_task. 1450 * CPU can be attempting to update rcu_cpu_kthread_task.
1405 */ 1451 */
@@ -1409,17 +1455,21 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1409 unsigned long mask; 1455 unsigned long mask;
1410 int need_report = 0; 1456 int need_report = 0;
1411 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1457 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1412 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */ 1458 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
1413 1459
1414 /* Adjust any no-longer-needed kthreads. */ 1460 /* Adjust any no-longer-needed kthreads. */
1415 rcu_stop_cpu_kthread(cpu); 1461 rcu_stop_cpu_kthread(cpu);
1416 rcu_node_kthread_setaffinity(rnp, -1); 1462 rcu_node_kthread_setaffinity(rnp, -1);
1417 1463
1418 /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */ 1464 /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
1419 1465
1420 /* Exclude any attempts to start a new grace period. */ 1466 /* Exclude any attempts to start a new grace period. */
1421 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1467 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1422 1468
1469 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
1470 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
1471 rcu_adopt_orphan_cbs(rsp);
1472
1423 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 1473 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
1424 mask = rdp->grpmask; /* rnp->grplo is constant. */ 1474 mask = rdp->grpmask; /* rnp->grplo is constant. */
1425 do { 1475 do {
@@ -1456,6 +1506,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1456 1506
1457#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1507#else /* #ifdef CONFIG_HOTPLUG_CPU */
1458 1508
1509static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1510{
1511}
1512
1459static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 1513static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1460{ 1514{
1461} 1515}
@@ -1524,9 +1578,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1524 rcu_is_callbacks_kthread()); 1578 rcu_is_callbacks_kthread());
1525 1579
1526 /* Update count, and requeue any remaining callbacks. */ 1580 /* Update count, and requeue any remaining callbacks. */
1527 rdp->qlen_lazy -= count_lazy;
1528 rdp->qlen -= count;
1529 rdp->n_cbs_invoked += count;
1530 if (list != NULL) { 1581 if (list != NULL) {
1531 *tail = rdp->nxtlist; 1582 *tail = rdp->nxtlist;
1532 rdp->nxtlist = list; 1583 rdp->nxtlist = list;
@@ -1536,6 +1587,10 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1536 else 1587 else
1537 break; 1588 break;
1538 } 1589 }
1590 smp_mb(); /* List handling before counting for rcu_barrier(). */
1591 rdp->qlen_lazy -= count_lazy;
1592 rdp->qlen -= count;
1593 rdp->n_cbs_invoked += count;
1539 1594
1540 /* Reinstate batch limit if we have worked down the excess. */ 1595 /* Reinstate batch limit if we have worked down the excess. */
1541 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) 1596 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
@@ -1823,11 +1878,14 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1823 rdp = this_cpu_ptr(rsp->rda); 1878 rdp = this_cpu_ptr(rsp->rda);
1824 1879
1825 /* Add the callback to our list. */ 1880 /* Add the callback to our list. */
1826 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1827 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1828 rdp->qlen++; 1881 rdp->qlen++;
1829 if (lazy) 1882 if (lazy)
1830 rdp->qlen_lazy++; 1883 rdp->qlen_lazy++;
1884 else
1885 rcu_idle_count_callbacks_posted();
1886 smp_mb(); /* Count before adding callback for rcu_barrier(). */
1887 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1888 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1831 1889
1832 if (__is_kfree_rcu_offset((unsigned long)func)) 1890 if (__is_kfree_rcu_offset((unsigned long)func))
1833 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, 1891 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
@@ -1893,6 +1951,38 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1893} 1951}
1894EXPORT_SYMBOL_GPL(call_rcu_bh); 1952EXPORT_SYMBOL_GPL(call_rcu_bh);
1895 1953
1954/*
1955 * Because a context switch is a grace period for RCU-sched and RCU-bh,
1956 * any blocking grace-period wait automatically implies a grace period
1957 * if there is only one CPU online at any point in time during execution
1958 * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to
1959 * occasionally incorrectly indicate that there are multiple CPUs online
1960 * when there was in fact only one the whole time, as this just adds
1961 * some overhead: RCU still operates correctly.
1962 *
1963 * Of course, sampling num_online_cpus() with preemption enabled can
1964 * give erroneous results if there are concurrent CPU-hotplug operations.
1965 * For example, given a demonic sequence of preemptions in num_online_cpus()
1966 * and CPU-hotplug operations, there could be two or more CPUs online at
1967 * all times, but num_online_cpus() might well return one (or even zero).
1968 *
1969 * However, all such demonic sequences require at least one CPU-offline
1970 * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer
1971 * is only a problem if there is an RCU read-side critical section executing
1972 * throughout. But RCU-sched and RCU-bh read-side critical sections
1973 * disable either preemption or bh, which prevents a CPU from going offline.
1974 * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return
1975 * that there is only one CPU when in fact there was more than one throughout
1976 * is when there were no RCU readers in the system. If there are no
1977 * RCU readers, the grace period by definition can be of zero length,
1978 * regardless of the number of online CPUs.
1979 */
1980static inline int rcu_blocking_is_gp(void)
1981{
1982 might_sleep(); /* Check for RCU read-side critical section. */
1983 return num_online_cpus() <= 1;
1984}
1985
1896/** 1986/**
1897 * synchronize_sched - wait until an rcu-sched grace period has elapsed. 1987 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
1898 * 1988 *
@@ -2166,11 +2256,10 @@ static int rcu_cpu_has_callbacks(int cpu)
2166 rcu_preempt_cpu_has_callbacks(cpu); 2256 rcu_preempt_cpu_has_callbacks(cpu);
2167} 2257}
2168 2258
2169static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 2259/*
2170static atomic_t rcu_barrier_cpu_count; 2260 * RCU callback function for _rcu_barrier(). If we are last, wake
2171static DEFINE_MUTEX(rcu_barrier_mutex); 2261 * up the task executing _rcu_barrier().
2172static struct completion rcu_barrier_completion; 2262 */
2173
2174static void rcu_barrier_callback(struct rcu_head *notused) 2263static void rcu_barrier_callback(struct rcu_head *notused)
2175{ 2264{
2176 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 2265 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -2200,27 +2289,94 @@ static void _rcu_barrier(struct rcu_state *rsp,
2200 void (*call_rcu_func)(struct rcu_head *head, 2289 void (*call_rcu_func)(struct rcu_head *head,
2201 void (*func)(struct rcu_head *head))) 2290 void (*func)(struct rcu_head *head)))
2202{ 2291{
2203 BUG_ON(in_interrupt()); 2292 int cpu;
2293 unsigned long flags;
2294 struct rcu_data *rdp;
2295 struct rcu_head rh;
2296
2297 init_rcu_head_on_stack(&rh);
2298
2204 /* Take mutex to serialize concurrent rcu_barrier() requests. */ 2299 /* Take mutex to serialize concurrent rcu_barrier() requests. */
2205 mutex_lock(&rcu_barrier_mutex); 2300 mutex_lock(&rcu_barrier_mutex);
2206 init_completion(&rcu_barrier_completion); 2301
2302 smp_mb(); /* Prevent any prior operations from leaking in. */
2303
2207 /* 2304 /*
2208 * Initialize rcu_barrier_cpu_count to 1, then invoke 2305 * Initialize the count to one rather than to zero in order to
2209 * rcu_barrier_func() on each CPU, so that each CPU also has 2306 * avoid a too-soon return to zero in case of a short grace period
2210 * incremented rcu_barrier_cpu_count. Only then is it safe to 2307 * (or preemption of this task). Also flag this task as doing
2211 * decrement rcu_barrier_cpu_count -- otherwise the first CPU 2308 * an rcu_barrier(). This will prevent anyone else from adopting
2212 * might complete its grace period before all of the other CPUs 2309 * orphaned callbacks, which could cause otherwise failure if a
2213 * did their increment, causing this function to return too 2310 * CPU went offline and quickly came back online. To see this,
2214 * early. Note that on_each_cpu() disables irqs, which prevents 2311 * consider the following sequence of events:
2215 * any CPUs from coming online or going offline until each online 2312 *
2216 * CPU has queued its RCU-barrier callback. 2313 * 1. We cause CPU 0 to post an rcu_barrier_callback() callback.
2314 * 2. CPU 1 goes offline, orphaning its callbacks.
2315 * 3. CPU 0 adopts CPU 1's orphaned callbacks.
2316 * 4. CPU 1 comes back online.
2317 * 5. We cause CPU 1 to post an rcu_barrier_callback() callback.
2318 * 6. Both rcu_barrier_callback() callbacks are invoked, awakening
2319 * us -- but before CPU 1's orphaned callbacks are invoked!!!
2217 */ 2320 */
2321 init_completion(&rcu_barrier_completion);
2218 atomic_set(&rcu_barrier_cpu_count, 1); 2322 atomic_set(&rcu_barrier_cpu_count, 1);
2219 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); 2323 raw_spin_lock_irqsave(&rsp->onofflock, flags);
2324 rsp->rcu_barrier_in_progress = current;
2325 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2326
2327 /*
2328 * Force every CPU with callbacks to register a new callback
2329 * that will tell us when all the preceding callbacks have
2330 * been invoked. If an offline CPU has callbacks, wait for
2331 * it to either come back online or to finish orphaning those
2332 * callbacks.
2333 */
2334 for_each_possible_cpu(cpu) {
2335 preempt_disable();
2336 rdp = per_cpu_ptr(rsp->rda, cpu);
2337 if (cpu_is_offline(cpu)) {
2338 preempt_enable();
2339 while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen))
2340 schedule_timeout_interruptible(1);
2341 } else if (ACCESS_ONCE(rdp->qlen)) {
2342 smp_call_function_single(cpu, rcu_barrier_func,
2343 (void *)call_rcu_func, 1);
2344 preempt_enable();
2345 } else {
2346 preempt_enable();
2347 }
2348 }
2349
2350 /*
2351 * Now that all online CPUs have rcu_barrier_callback() callbacks
2352 * posted, we can adopt all of the orphaned callbacks and place
2353 * an rcu_barrier_callback() callback after them. When that is done,
2354 * we are guaranteed to have an rcu_barrier_callback() callback
2355 * following every callback that could possibly have been
2356 * registered before _rcu_barrier() was called.
2357 */
2358 raw_spin_lock_irqsave(&rsp->onofflock, flags);
2359 rcu_adopt_orphan_cbs(rsp);
2360 rsp->rcu_barrier_in_progress = NULL;
2361 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2362 atomic_inc(&rcu_barrier_cpu_count);
2363 smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */
2364 call_rcu_func(&rh, rcu_barrier_callback);
2365
2366 /*
2367 * Now that we have an rcu_barrier_callback() callback on each
2368 * CPU, and thus each has been counted, remove the initial count.
2369 */
2220 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 2370 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
2221 complete(&rcu_barrier_completion); 2371 complete(&rcu_barrier_completion);
2372
2373 /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
2222 wait_for_completion(&rcu_barrier_completion); 2374 wait_for_completion(&rcu_barrier_completion);
2375
2376 /* Other rcu_barrier() invocations can now safely proceed. */
2223 mutex_unlock(&rcu_barrier_mutex); 2377 mutex_unlock(&rcu_barrier_mutex);
2378
2379 destroy_rcu_head_on_stack(&rh);
2224} 2380}
2225 2381
2226/** 2382/**
@@ -2417,7 +2573,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
2417 2573
2418 for (i = NUM_RCU_LVLS - 1; i > 0; i--) 2574 for (i = NUM_RCU_LVLS - 1; i > 0; i--)
2419 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 2575 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
2420 rsp->levelspread[0] = RCU_FANOUT_LEAF; 2576 rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF;
2421} 2577}
2422#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 2578#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
2423static void __init rcu_init_levelspread(struct rcu_state *rsp) 2579static void __init rcu_init_levelspread(struct rcu_state *rsp)
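
The rewritten _rcu_barrier() above depends on a reference-count discipline: rcu_barrier_cpu_count starts at one so that a short grace period (or preemption of the barrier task) cannot drive it to zero before every CPU's barrier callback has been posted, and the initial reference is dropped only after the orphaned callbacks have been adopted. A single-threaded toy model of just that counting arithmetic (no real concurrency, callbacks invoked inline):

#include <stdatomic.h>
#include <stdio.h>

/* Toy model of the _rcu_barrier() count: it starts at one so it cannot
 * reach zero before all barrier callbacks have been posted. */
static atomic_int barrier_count;
static int barrier_done;

static void barrier_callback(void)	/* stands in for rcu_barrier_callback() */
{
	if (atomic_fetch_sub(&barrier_count, 1) == 1)
		barrier_done = 1;	/* last one: complete(&rcu_barrier_completion) */
}

int main(void)
{
	int cpu, ncpus_with_cbs = 3;

	atomic_store(&barrier_count, 1);	/* the initial self-reference */

	/* Post one barrier callback per CPU that has callbacks queued. */
	for (cpu = 0; cpu < ncpus_with_cbs; cpu++)
		atomic_fetch_add(&barrier_count, 1);

	/* Each barrier callback runs after that CPU's earlier callbacks. */
	for (cpu = 0; cpu < ncpus_with_cbs; cpu++)
		barrier_callback();

	/* Drop the initial reference; only now can the count reach zero. */
	barrier_callback();

	printf("barrier complete: %d\n", barrier_done);
	return 0;
}
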
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index cdd1be0a4072..7f5d138dedf5 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -29,18 +29,14 @@
29#include <linux/seqlock.h> 29#include <linux/seqlock.h>
30 30
31/* 31/*
32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. 32 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
33 * CONFIG_RCU_FANOUT_LEAF.
33 * In theory, it should be possible to add more levels straightforwardly. 34 * In theory, it should be possible to add more levels straightforwardly.
34 * In practice, this did work well going from three levels to four. 35 * In practice, this did work well going from three levels to four.
35 * Of course, your mileage may vary. 36 * Of course, your mileage may vary.
36 */ 37 */
37#define MAX_RCU_LVLS 4 38#define MAX_RCU_LVLS 4
38#if CONFIG_RCU_FANOUT > 16 39#define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF)
39#define RCU_FANOUT_LEAF 16
40#else /* #if CONFIG_RCU_FANOUT > 16 */
41#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
42#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
43#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
44#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) 40#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
45#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) 41#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
46#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) 42#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
@@ -371,6 +367,17 @@ struct rcu_state {
371 367
372 raw_spinlock_t onofflock; /* exclude on/offline and */ 368 raw_spinlock_t onofflock; /* exclude on/offline and */
373 /* starting new GP. */ 369 /* starting new GP. */
370 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */
371 /* need a grace period. */
372 struct rcu_head **orphan_nxttail; /* Tail of above. */
373 struct rcu_head *orphan_donelist; /* Orphaned callbacks that */
374 /* are ready to invoke. */
375 struct rcu_head **orphan_donetail; /* Tail of above. */
376 long qlen_lazy; /* Number of lazy callbacks. */
377 long qlen; /* Total number of callbacks. */
378 struct task_struct *rcu_barrier_in_progress;
379 /* Task doing rcu_barrier(), */
380 /* or NULL if no barrier. */
374 raw_spinlock_t fqslock; /* Only one task forcing */ 381 raw_spinlock_t fqslock; /* Only one task forcing */
375 /* quiescent states. */ 382 /* quiescent states. */
376 unsigned long jiffies_force_qs; /* Time at which to invoke */ 383 unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -423,7 +430,6 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
423/* Forward declarations for rcutree_plugin.h */ 430/* Forward declarations for rcutree_plugin.h */
424static void rcu_bootup_announce(void); 431static void rcu_bootup_announce(void);
425long rcu_batches_completed(void); 432long rcu_batches_completed(void);
426static void rcu_preempt_note_context_switch(int cpu);
427static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); 433static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
428#ifdef CONFIG_HOTPLUG_CPU 434#ifdef CONFIG_HOTPLUG_CPU
429static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 435static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
@@ -471,6 +477,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu);
471static void rcu_prepare_for_idle_init(int cpu); 477static void rcu_prepare_for_idle_init(int cpu);
472static void rcu_cleanup_after_idle(int cpu); 478static void rcu_cleanup_after_idle(int cpu);
473static void rcu_prepare_for_idle(int cpu); 479static void rcu_prepare_for_idle(int cpu);
480static void rcu_idle_count_callbacks_posted(void);
474static void print_cpu_stall_info_begin(void); 481static void print_cpu_stall_info_begin(void);
475static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); 482static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
476static void print_cpu_stall_info_end(void); 483static void print_cpu_stall_info_end(void);
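
The rcutree.h hunk above replaces the hard-wired 16-CPU leaf cap with CONFIG_RCU_FANOUT_LEAF, so the leaf fanout can be tuned independently of the interior fanout. A rough userspace sketch of how the RCU_FANOUT_1..RCU_FANOUT_4 capacities decide how many rcu_node levels a given CPU count needs; the FANOUT and FANOUT_LEAF values below are illustrative, not taken from any particular Kconfig:

#include <stdio.h>

#define FANOUT      64	/* illustrative stand-in for CONFIG_RCU_FANOUT */
#define FANOUT_LEAF 16	/* illustrative stand-in for CONFIG_RCU_FANOUT_LEAF */

/* Capacity of a 1..4 level tree, mirroring RCU_FANOUT_1..RCU_FANOUT_4. */
static const long fanout_cap[] = {
	FANOUT_LEAF,
	FANOUT_LEAF * FANOUT,
	FANOUT_LEAF * FANOUT * FANOUT,
	FANOUT_LEAF * FANOUT * FANOUT * FANOUT,
};

/* How many rcu_node levels are needed to cover nr_cpus? */
static int levels_needed(long nr_cpus)
{
	int i;

	for (i = 0; i < 4; i++)
		if (nr_cpus <= fanout_cap[i])
			return i + 1;
	return -1;	/* would exceed MAX_RCU_LVLS */
}

int main(void)
{
	long nr_cpus[] = { 4, 16, 64, 1024, 4096 };
	unsigned int i;

	for (i = 0; i < sizeof(nr_cpus) / sizeof(nr_cpus[0]); i++)
		printf("NR_CPUS=%ld -> %d level(s)\n",
		       nr_cpus[i], levels_needed(nr_cpus[i]));
	return 0;
}
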
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c023464816be..2411000d9869 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu)
153 * 153 *
154 * Caller must disable preemption. 154 * Caller must disable preemption.
155 */ 155 */
156static void rcu_preempt_note_context_switch(int cpu) 156void rcu_preempt_note_context_switch(void)
157{ 157{
158 struct task_struct *t = current; 158 struct task_struct *t = current;
159 unsigned long flags; 159 unsigned long flags;
@@ -164,7 +164,7 @@ static void rcu_preempt_note_context_switch(int cpu)
164 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 164 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
165 165
166 /* Possibly blocking in an RCU read-side critical section. */ 166 /* Possibly blocking in an RCU read-side critical section. */
167 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); 167 rdp = __this_cpu_ptr(rcu_preempt_state.rda);
168 rnp = rdp->mynode; 168 rnp = rdp->mynode;
169 raw_spin_lock_irqsave(&rnp->lock, flags); 169 raw_spin_lock_irqsave(&rnp->lock, flags);
170 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 170 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -228,7 +228,7 @@ static void rcu_preempt_note_context_switch(int cpu)
228 * means that we continue to block the current grace period. 228 * means that we continue to block the current grace period.
229 */ 229 */
230 local_irq_save(flags); 230 local_irq_save(flags);
231 rcu_preempt_qs(cpu); 231 rcu_preempt_qs(smp_processor_id());
232 local_irq_restore(flags); 232 local_irq_restore(flags);
233} 233}
234 234
@@ -969,22 +969,6 @@ static void __init __rcu_init_preempt(void)
969 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); 969 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
970} 970}
971 971
972/*
973 * Check for a task exiting while in a preemptible-RCU read-side
974 * critical section, clean up if so. No need to issue warnings,
975 * as debug_check_no_locks_held() already does this if lockdep
976 * is enabled.
977 */
978void exit_rcu(void)
979{
980 struct task_struct *t = current;
981
982 if (t->rcu_read_lock_nesting == 0)
983 return;
984 t->rcu_read_lock_nesting = 1;
985 __rcu_read_unlock();
986}
987
988#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 972#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
989 973
990static struct rcu_state *rcu_state = &rcu_sched_state; 974static struct rcu_state *rcu_state = &rcu_sched_state;
@@ -1018,14 +1002,6 @@ void rcu_force_quiescent_state(void)
1018EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 1002EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
1019 1003
1020/* 1004/*
1021 * Because preemptible RCU does not exist, we never have to check for
1022 * CPUs being in quiescent states.
1023 */
1024static void rcu_preempt_note_context_switch(int cpu)
1025{
1026}
1027
1028/*
1029 * Because preemptible RCU does not exist, there are never any preempted 1005 * Because preemptible RCU does not exist, there are never any preempted
1030 * RCU readers. 1006 * RCU readers.
1031 */ 1007 */
@@ -1938,6 +1914,14 @@ static void rcu_prepare_for_idle(int cpu)
1938{ 1914{
1939} 1915}
1940 1916
1917/*
1918 * Don't bother keeping a running count of the number of RCU callbacks
1919 * posted because CONFIG_RCU_FAST_NO_HZ=n.
1920 */
1921static void rcu_idle_count_callbacks_posted(void)
1922{
1923}
1924
1941#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1925#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1942 1926
1943/* 1927/*
@@ -1978,11 +1962,20 @@ static void rcu_prepare_for_idle(int cpu)
1978#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ 1962#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
1979#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ 1963#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
1980 1964
1965/* Loop counter for rcu_prepare_for_idle(). */
1981static DEFINE_PER_CPU(int, rcu_dyntick_drain); 1966static DEFINE_PER_CPU(int, rcu_dyntick_drain);
1967/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */
1982static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); 1968static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1983static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); 1969/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */
1984static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */ 1970static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer);
1985static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */ 1971/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */
1972static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires);
1973/* Enable special processing on first attempt to enter dyntick-idle mode. */
1974static DEFINE_PER_CPU(bool, rcu_idle_first_pass);
1975/* Running count of non-lazy callbacks posted, never decremented. */
1976static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted);
1977/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */
1978static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap);
1986 1979
1987/* 1980/*
1988 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no 1981 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
@@ -1995,6 +1988,8 @@ static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */
1995 */ 1988 */
1996int rcu_needs_cpu(int cpu) 1989int rcu_needs_cpu(int cpu)
1997{ 1990{
1991 /* Flag a new idle sojourn to the idle-entry state machine. */
1992 per_cpu(rcu_idle_first_pass, cpu) = 1;
1998 /* If no callbacks, RCU doesn't need the CPU. */ 1993 /* If no callbacks, RCU doesn't need the CPU. */
1999 if (!rcu_cpu_has_callbacks(cpu)) 1994 if (!rcu_cpu_has_callbacks(cpu))
2000 return 0; 1995 return 0;
@@ -2045,16 +2040,34 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
2045} 2040}
2046 2041
2047/* 2042/*
2043 * Handler for smp_call_function_single(). The only point of this
2044 * handler is to wake the CPU up, so the handler does only tracing.
2045 */
2046void rcu_idle_demigrate(void *unused)
2047{
2048 trace_rcu_prep_idle("Demigrate");
2049}
2050
2051/*
2048 * Timer handler used to force CPU to start pushing its remaining RCU 2052 * Timer handler used to force CPU to start pushing its remaining RCU
2049 * callbacks in the case where it entered dyntick-idle mode with callbacks 2053 * callbacks in the case where it entered dyntick-idle mode with callbacks
2050 * pending. The handler doesn't really need to do anything because the 2054
2051 * real work is done upon re-entry to idle, or by the next scheduling-clock 2055 * real work is done upon re-entry to idle, or by the next scheduling-clock
2052 * interrupt should idle not be re-entered. 2056 * interrupt should idle not be re-entered.
2057 *
2058 * One special case: the timer gets migrated without awakening the CPU
2059 * on which the timer was scheduled. In this case, we must wake up
2060 * that CPU. We do so with smp_call_function_single().
2053 */ 2061 */
2054static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) 2062static void rcu_idle_gp_timer_func(unsigned long cpu_in)
2055{ 2063{
2064 int cpu = (int)cpu_in;
2065
2056 trace_rcu_prep_idle("Timer"); 2066 trace_rcu_prep_idle("Timer");
2057 return HRTIMER_NORESTART; 2067 if (cpu != smp_processor_id())
2068 smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0);
2069 else
2070 WARN_ON_ONCE(1); /* Getting here can hang the system... */
2058} 2071}
2059 2072
2060/* 2073/*
@@ -2062,19 +2075,11 @@ static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp)
2062 */ 2075 */
2063static void rcu_prepare_for_idle_init(int cpu) 2076static void rcu_prepare_for_idle_init(int cpu)
2064{ 2077{
2065 static int firsttime = 1; 2078 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
2066 struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); 2079 setup_timer(&per_cpu(rcu_idle_gp_timer, cpu),
2067 2080 rcu_idle_gp_timer_func, cpu);
2068 hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 2081 per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1;
2069 hrtp->function = rcu_idle_gp_timer_func; 2082 per_cpu(rcu_idle_first_pass, cpu) = 1;
2070 if (firsttime) {
2071 unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
2072
2073 rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000);
2074 upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY);
2075 rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000);
2076 firsttime = 0;
2077 }
2078} 2083}
2079 2084
2080/* 2085/*
@@ -2084,7 +2089,8 @@ static void rcu_prepare_for_idle_init(int cpu)
2084 */ 2089 */
2085static void rcu_cleanup_after_idle(int cpu) 2090static void rcu_cleanup_after_idle(int cpu)
2086{ 2091{
2087 hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); 2092 del_timer(&per_cpu(rcu_idle_gp_timer, cpu));
2093 trace_rcu_prep_idle("Cleanup after idle");
2088} 2094}
2089 2095
2090/* 2096/*
@@ -2108,6 +2114,29 @@ static void rcu_cleanup_after_idle(int cpu)
2108 */ 2114 */
2109static void rcu_prepare_for_idle(int cpu) 2115static void rcu_prepare_for_idle(int cpu)
2110{ 2116{
2117 struct timer_list *tp;
2118
2119 /*
2120 * If this is an idle re-entry, for example, due to use of
2121 * RCU_NONIDLE() or the new idle-loop tracing API within the idle
2122 * loop, then don't take any state-machine actions, unless the
2123 * momentary exit from idle queued additional non-lazy callbacks.
2124 * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks
2125 * pending.
2126 */
2127 if (!per_cpu(rcu_idle_first_pass, cpu) &&
2128 (per_cpu(rcu_nonlazy_posted, cpu) ==
2129 per_cpu(rcu_nonlazy_posted_snap, cpu))) {
2130 if (rcu_cpu_has_callbacks(cpu)) {
2131 tp = &per_cpu(rcu_idle_gp_timer, cpu);
2132 mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu));
2133 }
2134 return;
2135 }
2136 per_cpu(rcu_idle_first_pass, cpu) = 0;
2137 per_cpu(rcu_nonlazy_posted_snap, cpu) =
2138 per_cpu(rcu_nonlazy_posted, cpu) - 1;
2139
2111 /* 2140 /*
2112 * If there are no callbacks on this CPU, enter dyntick-idle mode. 2141 * If there are no callbacks on this CPU, enter dyntick-idle mode.
2113 * Also reset state to avoid prejudicing later attempts. 2142 * Also reset state to avoid prejudicing later attempts.
@@ -2140,11 +2169,15 @@ static void rcu_prepare_for_idle(int cpu)
2140 per_cpu(rcu_dyntick_drain, cpu) = 0; 2169 per_cpu(rcu_dyntick_drain, cpu) = 0;
2141 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2170 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
2142 if (rcu_cpu_has_nonlazy_callbacks(cpu)) 2171 if (rcu_cpu_has_nonlazy_callbacks(cpu))
2143 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), 2172 per_cpu(rcu_idle_gp_timer_expires, cpu) =
2144 rcu_idle_gp_wait, HRTIMER_MODE_REL); 2173 jiffies + RCU_IDLE_GP_DELAY;
2145 else 2174 else
2146 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), 2175 per_cpu(rcu_idle_gp_timer_expires, cpu) =
2147 rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL); 2176 jiffies + RCU_IDLE_LAZY_GP_DELAY;
2177 tp = &per_cpu(rcu_idle_gp_timer, cpu);
2178 mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu));
2179 per_cpu(rcu_nonlazy_posted_snap, cpu) =
2180 per_cpu(rcu_nonlazy_posted, cpu);
2148 return; /* Nothing more to do immediately. */ 2181 return; /* Nothing more to do immediately. */
2149 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2182 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
2150 /* We have hit the limit, so time to give up. */ 2183 /* We have hit the limit, so time to give up. */
@@ -2184,6 +2217,19 @@ static void rcu_prepare_for_idle(int cpu)
2184 trace_rcu_prep_idle("Callbacks drained"); 2217 trace_rcu_prep_idle("Callbacks drained");
2185} 2218}
2186 2219
2220/*
2221 * Keep a running count of the number of non-lazy callbacks posted
2222 * on this CPU. This running counter (which is never decremented) allows
2223 * rcu_prepare_for_idle() to detect when something out of the idle loop
2224 * posts a callback, even if an equal number of callbacks are invoked.
2225 * Of course, callbacks should only be posted from within a trace event
2226 * designed to be called from idle or from within RCU_NONIDLE().
2227 */
2228static void rcu_idle_count_callbacks_posted(void)
2229{
2230 __this_cpu_add(rcu_nonlazy_posted, 1);
2231}
2232
2187#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2233#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
2188 2234
2189#ifdef CONFIG_RCU_CPU_STALL_INFO 2235#ifdef CONFIG_RCU_CPU_STALL_INFO
@@ -2192,14 +2238,12 @@ static void rcu_prepare_for_idle(int cpu)
2192 2238
2193static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 2239static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2194{ 2240{
2195 struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); 2241 struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu);
2196 2242
2197 sprintf(cp, "drain=%d %c timer=%lld", 2243 sprintf(cp, "drain=%d %c timer=%lu",
2198 per_cpu(rcu_dyntick_drain, cpu), 2244 per_cpu(rcu_dyntick_drain, cpu),
2199 per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', 2245 per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.',
2200 hrtimer_active(hrtp) 2246 timer_pending(tltp) ? tltp->expires - jiffies : -1);
2201 ? ktime_to_us(hrtimer_get_remaining(hrtp))
2202 : -1);
2203} 2247}
2204 2248
2205#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 2249#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
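
The CONFIG_RCU_FAST_NO_HZ rework above gates the dyntick-idle state machine on two per-CPU facts: whether this is the first pass into idle, and whether any non-lazy callbacks were posted since the last snapshot; otherwise it simply reposts rcu_idle_gp_timer at the saved expiry. A loose single-CPU sketch of just that gating decision (field names and snapshot handling are simplified, not the kernel's exact per-CPU variables):

#include <stdbool.h>
#include <stdio.h>

/* Simplified per-CPU idle-entry state. */
struct idle_state {
	bool first_pass;		/* ~ rcu_idle_first_pass */
	unsigned long nonlazy_posted;	/* ~ rcu_nonlazy_posted, never decremented */
	unsigned long nonlazy_snap;	/* ~ rcu_nonlazy_posted_snap */
};

/* Return true if this idle entry should run the full state machine,
 * false if it should only repost the idle timer. */
static bool needs_state_machine(struct idle_state *s)
{
	if (!s->first_pass && s->nonlazy_posted == s->nonlazy_snap)
		return false;	/* momentary idle exit, nothing new was posted */
	s->first_pass = false;
	s->nonlazy_snap = s->nonlazy_posted;
	return true;
}

int main(void)
{
	struct idle_state s = { .first_pass = true };

	printf("first entry:    run state machine? %d\n", needs_state_machine(&s));
	printf("quick re-entry: run state machine? %d\n", needs_state_machine(&s));
	s.nonlazy_posted++;	/* a non-lazy callback arrived meanwhile */
	printf("after posting:  run state machine? %d\n", needs_state_machine(&s));
	return 0;
}
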
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index ed459edeff43..d4bc16ddd1d4 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -271,13 +271,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
271 271
272 gpnum = rsp->gpnum; 272 gpnum = rsp->gpnum;
273 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 273 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
274 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", 274 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
275 rsp->completed, gpnum, rsp->fqs_state, 275 rsp->completed, gpnum, rsp->fqs_state,
276 (long)(rsp->jiffies_force_qs - jiffies), 276 (long)(rsp->jiffies_force_qs - jiffies),
277 (int)(jiffies & 0xffff), 277 (int)(jiffies & 0xffff),
278 rsp->n_force_qs, rsp->n_force_qs_ngp, 278 rsp->n_force_qs, rsp->n_force_qs_ngp,
279 rsp->n_force_qs - rsp->n_force_qs_ngp, 279 rsp->n_force_qs - rsp->n_force_qs_ngp,
280 rsp->n_force_qs_lh); 280 rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen);
281 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { 281 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
282 if (rnp->level != level) { 282 if (rnp->level != level) {
283 seq_puts(m, "\n"); 283 seq_puts(m, "\n");
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e5212ae294f6..eb4131b8ad60 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2083,6 +2083,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2083#endif 2083#endif
2084 2084
2085 /* Here we just switch the register state and the stack. */ 2085 /* Here we just switch the register state and the stack. */
2086 rcu_switch_from(prev);
2086 switch_to(prev, next, prev); 2087 switch_to(prev, next, prev);
2087 2088
2088 barrier(); 2089 barrier();
diff --git a/kernel/srcu.c b/kernel/srcu.c
index ba35f3a4a1f4..2095be3318d5 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -34,10 +34,77 @@
34#include <linux/delay.h> 34#include <linux/delay.h>
35#include <linux/srcu.h> 35#include <linux/srcu.h>
36 36
37/*
38 * Initialize an rcu_batch structure to empty.
39 */
40static inline void rcu_batch_init(struct rcu_batch *b)
41{
42 b->head = NULL;
43 b->tail = &b->head;
44}
45
46/*
47 * Enqueue a callback onto the tail of the specified rcu_batch structure.
48 */
49static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head)
50{
51 *b->tail = head;
52 b->tail = &head->next;
53}
54
55/*
56 * Is the specified rcu_batch structure empty?
57 */
58static inline bool rcu_batch_empty(struct rcu_batch *b)
59{
60 return b->tail == &b->head;
61}
62
63/*
64 * Remove the callback at the head of the specified rcu_batch structure
65 * and return a pointer to it, or return NULL if the structure is empty.
66 */
67static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b)
68{
69 struct rcu_head *head;
70
71 if (rcu_batch_empty(b))
72 return NULL;
73
74 head = b->head;
75 b->head = head->next;
76 if (b->tail == &head->next)
77 rcu_batch_init(b);
78
79 return head;
80}
81
82/*
83 * Move all callbacks from the rcu_batch structure specified by "from" to
84 * the structure specified by "to".
85 */
86static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
87{
88 if (!rcu_batch_empty(from)) {
89 *to->tail = from->head;
90 to->tail = from->tail;
91 rcu_batch_init(from);
92 }
93}
94
95/* single-thread state-machine */
96static void process_srcu(struct work_struct *work);
97
37static int init_srcu_struct_fields(struct srcu_struct *sp) 98static int init_srcu_struct_fields(struct srcu_struct *sp)
38{ 99{
39 sp->completed = 0; 100 sp->completed = 0;
40 mutex_init(&sp->mutex); 101 spin_lock_init(&sp->queue_lock);
102 sp->running = false;
103 rcu_batch_init(&sp->batch_queue);
104 rcu_batch_init(&sp->batch_check0);
105 rcu_batch_init(&sp->batch_check1);
106 rcu_batch_init(&sp->batch_done);
107 INIT_DELAYED_WORK(&sp->work, process_srcu);
41 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); 108 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
42 return sp->per_cpu_ref ? 0 : -ENOMEM; 109 return sp->per_cpu_ref ? 0 : -ENOMEM;
43} 110}
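
The rcu_batch helpers added above implement a singly-linked queue whose ->tail member points at the last element's ->next field (or at ->head when the batch is empty), which makes both enqueue and whole-batch moves O(1). A standalone sketch of the same operations, using local types in place of the kernel's rcu_head and rcu_batch definitions:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node {			/* stand-in for struct rcu_head */
	struct node *next;
	int id;
};

struct batch {			/* stand-in for struct rcu_batch */
	struct node *head;
	struct node **tail;
};

static void batch_init(struct batch *b)
{
	b->head = NULL;
	b->tail = &b->head;
}

static bool batch_empty(struct batch *b)
{
	return b->tail == &b->head;
}

static void batch_queue(struct batch *b, struct node *n)
{
	n->next = NULL;
	*b->tail = n;		/* link after the current last element */
	b->tail = &n->next;	/* tail now addresses the new ->next */
}

static struct node *batch_dequeue(struct batch *b)
{
	struct node *n;

	if (batch_empty(b))
		return NULL;
	n = b->head;
	b->head = n->next;
	if (b->tail == &n->next)	/* removed the last element */
		batch_init(b);
	return n;
}

static void batch_move(struct batch *to, struct batch *from)
{
	if (!batch_empty(from)) {
		*to->tail = from->head;
		to->tail = from->tail;
		batch_init(from);
	}
}

int main(void)
{
	struct batch queue, done;
	struct node a = { .id = 1 }, b = { .id = 2 };
	struct node *n;

	batch_init(&queue);
	batch_init(&done);
	batch_queue(&queue, &a);
	batch_queue(&queue, &b);
	batch_move(&done, &queue);	/* advance the whole batch at once */

	while ((n = batch_dequeue(&done)) != NULL)
		printf("invoke callback %d\n", n->id);
	return 0;
}
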
@@ -73,21 +140,116 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
73#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 140#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
74 141
75/* 142/*
76 * srcu_readers_active_idx -- returns approximate number of readers 143 * Returns approximate total of the readers' ->seq[] values for the
77 * active on the specified rank of per-CPU counters. 144 * rank of per-CPU counters specified by idx.
78 */ 145 */
146static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
147{
148 int cpu;
149 unsigned long sum = 0;
150 unsigned long t;
79 151
80static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) 152 for_each_possible_cpu(cpu) {
153 t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
154 sum += t;
155 }
156 return sum;
157}
158
159/*
160 * Returns approximate number of readers active on the specified rank
161 * of the per-CPU ->c[] counters.
162 */
163static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
81{ 164{
82 int cpu; 165 int cpu;
83 int sum; 166 unsigned long sum = 0;
167 unsigned long t;
84 168
85 sum = 0; 169 for_each_possible_cpu(cpu) {
86 for_each_possible_cpu(cpu) 170 t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
87 sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]; 171 sum += t;
172 }
88 return sum; 173 return sum;
89} 174}
90 175
176/*
177 * Return true if the number of pre-existing readers is determined to
178 * be stably zero. An example unstable zero can occur if the call
179 * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
180 * but due to task migration, sees the corresponding __srcu_read_unlock()
181 * decrement. This can happen because srcu_readers_active_idx() takes
182 * time to sum the array, and might in fact be interrupted or preempted
183 * partway through the summation.
184 */
185static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
186{
187 unsigned long seq;
188
189 seq = srcu_readers_seq_idx(sp, idx);
190
191 /*
192 * The following smp_mb() A pairs with the smp_mb() B located in
193 * __srcu_read_lock(). This pairing ensures that if an
194 * __srcu_read_lock() increments its counter after the summation
195 * in srcu_readers_active_idx(), then the corresponding SRCU read-side
196 * critical section will see any changes made prior to the start
197 * of the current SRCU grace period.
198 *
199 * Also, if the above call to srcu_readers_seq_idx() saw the
200 * increment of ->seq[], then the call to srcu_readers_active_idx()
201 * must see the increment of ->c[].
202 */
203 smp_mb(); /* A */
204
205 /*
206 * Note that srcu_readers_active_idx() can incorrectly return
207 * zero even though there is a pre-existing reader throughout.
208 * To see this, suppose that task A is in a very long SRCU
209 * read-side critical section that started on CPU 0, and that
210 * no other reader exists, so that the sum of the counters
211 * is equal to one. Then suppose that task B starts executing
212 * srcu_readers_active_idx(), summing up to CPU 1, and then that
213 * task C starts reading on CPU 0, so that its increment is not
214 * summed, but finishes reading on CPU 2, so that its decrement
215 * -is- summed. Then when task B completes its sum, it will
216 * incorrectly get zero, despite the fact that task A has been
217 * in its SRCU read-side critical section the whole time.
218 *
219 * We therefore do a validation step should srcu_readers_active_idx()
220 * return zero.
221 */
222 if (srcu_readers_active_idx(sp, idx) != 0)
223 return false;
224
225 /*
226 * The remainder of this function is the validation step.
227 * The following smp_mb() D pairs with the smp_mb() C in
228 * __srcu_read_unlock(). If the __srcu_read_unlock() was seen
229 * by srcu_readers_active_idx() above, then any destructive
230 * operation performed after the grace period will happen after
231 * the corresponding SRCU read-side critical section.
232 *
233 * Note that there can be at most NR_CPUS worth of readers using
234 * the old index, which is not enough to overflow even a 32-bit
235 * integer. (Yes, this does mean that systems having more than
236 * a billion or so CPUs need to be 64-bit systems.) Therefore,
237 * the sum of the ->seq[] counters cannot possibly overflow.
238 * Therefore, the only way that the return values of the two
239 * calls to srcu_readers_seq_idx() can be equal is if there were
240 * no increments of the corresponding rank of ->seq[] counts
241 * in the interim. But the missed-increment scenario laid out
242 * above includes an increment of the ->seq[] counter by
243 * the corresponding __srcu_read_lock(). Therefore, if this
244 * scenario occurs, the return values from the two calls to
245 * srcu_readers_seq_idx() will differ, and thus the validation
246 * step below suffices.
247 */
248 smp_mb(); /* D */
249
250 return srcu_readers_seq_idx(sp, idx) == seq;
251}
252
91/** 253/**
92 * srcu_readers_active - returns approximate number of readers. 254 * srcu_readers_active - returns approximate number of readers.
93 * @sp: which srcu_struct to count active readers (holding srcu_read_lock). 255 * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
@@ -98,7 +260,14 @@ static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
98 */ 260 */
99static int srcu_readers_active(struct srcu_struct *sp) 261static int srcu_readers_active(struct srcu_struct *sp)
100{ 262{
101 return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1); 263 int cpu;
264 unsigned long sum = 0;
265
266 for_each_possible_cpu(cpu) {
267 sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
268 sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
269 }
270 return sum;
102} 271}
103 272
104/** 273/**
@@ -131,10 +300,11 @@ int __srcu_read_lock(struct srcu_struct *sp)
131 int idx; 300 int idx;
132 301
133 preempt_disable(); 302 preempt_disable();
134 idx = sp->completed & 0x1; 303 idx = rcu_dereference_index_check(sp->completed,
135 barrier(); /* ensure compiler looks -once- at sp->completed. */ 304 rcu_read_lock_sched_held()) & 0x1;
136 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++; 305 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
137 srcu_barrier(); /* ensure compiler won't misorder critical section. */ 306 smp_mb(); /* B */ /* Avoid leaking the critical section. */
307 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
138 preempt_enable(); 308 preempt_enable();
139 return idx; 309 return idx;
140} 310}
@@ -149,8 +319,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
149void __srcu_read_unlock(struct srcu_struct *sp, int idx) 319void __srcu_read_unlock(struct srcu_struct *sp, int idx)
150{ 320{
151 preempt_disable(); 321 preempt_disable();
152 srcu_barrier(); /* ensure compiler won't misorder critical section. */ 322 smp_mb(); /* C */ /* Avoid leaking the critical section. */
153 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; 323 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1;
154 preempt_enable(); 324 preempt_enable();
155} 325}
156EXPORT_SYMBOL_GPL(__srcu_read_unlock); 326EXPORT_SYMBOL_GPL(__srcu_read_unlock);
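
The reworked readers above pair the per-CPU ->c[] counts with ->seq[] counters that only ever increase: a zero sum of ->c[] is believed only if the ->seq[] sum is unchanged across the check, which rules out the missed-increment/seen-decrement race described in the srcu_readers_active_idx_check() comments earlier in this patch. A single-threaded sketch of that check; the memory barriers appear only as comments here, since the real ordering argument is inherently multi-CPU:

#include <stdbool.h>
#include <stdio.h>

#define NCPU 4

/* Per-"CPU" counters, mirroring srcu_struct_array's ->c[] and ->seq[]. */
static unsigned long c[NCPU][2];	/* readers in flight, per index */
static unsigned long seq[NCPU][2];	/* lock attempts, never decremented */
static unsigned long completed;		/* selects the active index */

static int read_lock(int cpu)			/* ~ __srcu_read_lock() */
{
	int idx = completed & 0x1;

	c[cpu][idx]++;
	/* smp_mb() B goes here in the kernel */
	seq[cpu][idx]++;
	return idx;
}

static void read_unlock(int cpu, int idx)	/* ~ __srcu_read_unlock() */
{
	/* smp_mb() C goes here in the kernel */
	c[cpu][idx]--;
}

static unsigned long sum(unsigned long a[NCPU][2], int idx)
{
	unsigned long s = 0;
	int cpu;

	for (cpu = 0; cpu < NCPU; cpu++)
		s += a[cpu][idx];
	return s;
}

/* ~ srcu_readers_active_idx_check(): trust a zero ->c[] sum only if the
 * ->seq[] sum did not change while we were looking. */
static bool readers_gone(int idx)
{
	unsigned long snap = sum(seq, idx);

	if (sum(c, idx) != 0)
		return false;
	return sum(seq, idx) == snap;
}

int main(void)
{
	int idx = read_lock(0);

	printf("reader active, gone? %d\n", readers_gone(idx));
	read_unlock(0, idx);
	printf("reader done,   gone? %d\n", readers_gone(idx));
	return 0;
}
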
@@ -163,106 +333,119 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
163 * we repeatedly block for 1-millisecond time periods. This approach 333 * we repeatedly block for 1-millisecond time periods. This approach
164 * has done well in testing, so there is no need for a config parameter. 334 * has done well in testing, so there is no need for a config parameter.
165 */ 335 */
166#define SYNCHRONIZE_SRCU_READER_DELAY 10 336#define SRCU_RETRY_CHECK_DELAY 5
337#define SYNCHRONIZE_SRCU_TRYCOUNT 2
338#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12
167 339
168/* 340/*
169 * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 341 * @@@ Wait until all pre-existing readers complete. Such readers
342 * will have used the index specified by "idx".
343 * The caller must ensure that ->completed is not changed while checking,
344 * and that idx = (->completed & 1) ^ 1.
170 */ 345 */
171static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) 346static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
172{ 347{
173 int idx; 348 for (;;) {
174 349 if (srcu_readers_active_idx_check(sp, idx))
175 rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && 350 return true;
176 !lock_is_held(&rcu_bh_lock_map) && 351 if (--trycount <= 0)
177 !lock_is_held(&rcu_lock_map) && 352 return false;
178 !lock_is_held(&rcu_sched_lock_map), 353 udelay(SRCU_RETRY_CHECK_DELAY);
179 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); 354 }
180 355}
181 idx = sp->completed;
182 mutex_lock(&sp->mutex);
183 356
184 /* 357/*
185 * Check to see if someone else did the work for us while we were 358 * Increment the ->completed counter so that future SRCU readers will
186 * waiting to acquire the lock. We need -two- advances of 359 * use the other rank of the ->c[] and ->seq[] arrays. This allows
187 * the counter, not just one. If there was but one, we might have 360 * us to wait for pre-existing readers in a starvation-free manner.
188 * shown up -after- our helper's first synchronize_sched(), thus 361 */
189 * having failed to prevent CPU-reordering races with concurrent 362static void srcu_flip(struct srcu_struct *sp)
190 * srcu_read_unlock()s on other CPUs (see comment below). So we 363{
191 * either (1) wait for two or (2) supply the second ourselves. 364 sp->completed++;
192 */ 365}
193 366
194 if ((sp->completed - idx) >= 2) { 367/*
195 mutex_unlock(&sp->mutex); 368 * Enqueue an SRCU callback on the specified srcu_struct structure,
196 return; 369 * initiating grace-period processing if it is not already running.
370 */
371void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
372 void (*func)(struct rcu_head *head))
373{
374 unsigned long flags;
375
376 head->next = NULL;
377 head->func = func;
378 spin_lock_irqsave(&sp->queue_lock, flags);
379 rcu_batch_queue(&sp->batch_queue, head);
380 if (!sp->running) {
381 sp->running = true;
382 queue_delayed_work(system_nrt_wq, &sp->work, 0);
197 } 383 }
384 spin_unlock_irqrestore(&sp->queue_lock, flags);
385}
386EXPORT_SYMBOL_GPL(call_srcu);
198 387
199 sync_func(); /* Force memory barrier on all CPUs. */ 388struct rcu_synchronize {
389 struct rcu_head head;
390 struct completion completion;
391};
200 392
201 /* 393/*
202 * The preceding synchronize_sched() ensures that any CPU that 394 * Awaken the corresponding synchronize_srcu() instance now that a
203 * sees the new value of sp->completed will also see any preceding 395 * grace period has elapsed.
204 * changes to data structures made by this CPU. This prevents 396 */
205 * some other CPU from reordering the accesses in its SRCU 397static void wakeme_after_rcu(struct rcu_head *head)
206 * read-side critical section to precede the corresponding 398{
207 * srcu_read_lock() -- ensuring that such references will in 399 struct rcu_synchronize *rcu;
208 * fact be protected.
209 *
210 * So it is now safe to do the flip.
211 */
212 400
213 idx = sp->completed & 0x1; 401 rcu = container_of(head, struct rcu_synchronize, head);
214 sp->completed++; 402 complete(&rcu->completion);
403}
215 404
216 sync_func(); /* Force memory barrier on all CPUs. */ 405static void srcu_advance_batches(struct srcu_struct *sp, int trycount);
406static void srcu_reschedule(struct srcu_struct *sp);
217 407
218 /* 408/*
219 * At this point, because of the preceding synchronize_sched(), 409 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
220 * all srcu_read_lock() calls using the old counters have completed. 410 */
221 * Their corresponding critical sections might well be still 411static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
222 * executing, but the srcu_read_lock() primitives themselves 412{
223 * will have finished executing. We initially give readers 413 struct rcu_synchronize rcu;
224 * an arbitrarily chosen 10 microseconds to get out of their 414 struct rcu_head *head = &rcu.head;
225 * SRCU read-side critical sections, then loop waiting 1/HZ 415 bool done = false;
226 * seconds per iteration. The 10-microsecond value has done
227 * very well in testing.
228 */
229
230 if (srcu_readers_active_idx(sp, idx))
231 udelay(SYNCHRONIZE_SRCU_READER_DELAY);
232 while (srcu_readers_active_idx(sp, idx))
233 schedule_timeout_interruptible(1);
234 416
235 sync_func(); /* Force memory barrier on all CPUs. */ 417 rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
418 !lock_is_held(&rcu_bh_lock_map) &&
419 !lock_is_held(&rcu_lock_map) &&
420 !lock_is_held(&rcu_sched_lock_map),
421 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
236 422
237 /* 423 init_completion(&rcu.completion);
238 * The preceding synchronize_sched() forces all srcu_read_unlock() 424
239 * primitives that were executing concurrently with the preceding 425 head->next = NULL;
240 * for_each_possible_cpu() loop to have completed by this point. 426 head->func = wakeme_after_rcu;
241 * More importantly, it also forces the corresponding SRCU read-side 427 spin_lock_irq(&sp->queue_lock);
242 * critical sections to have also completed, and the corresponding 429 /* steal the processing ownership */
243 * references to SRCU-protected data items to be dropped. 429 /* steal the processing owner */
244 * 430 sp->running = true;
245 * Note: 431 rcu_batch_queue(&sp->batch_check0, head);
246 * 432 spin_unlock_irq(&sp->queue_lock);
247 * Despite what you might think at first glance, the 433
248 * preceding synchronize_sched() -must- be within the 434 srcu_advance_batches(sp, trycount);
249 * critical section ended by the following mutex_unlock(). 435 if (!rcu_batch_empty(&sp->batch_done)) {
250 * Otherwise, a task taking the early exit can race 436 BUG_ON(sp->batch_done.head != head);
251 * with a srcu_read_unlock(), which might have executed 437 rcu_batch_dequeue(&sp->batch_done);
252 * just before the preceding srcu_readers_active() check, 438 done = true;
253 * and whose CPU might have reordered the srcu_read_unlock() 439 }
254 * with the preceding critical section. In this case, there 440 /* give the processing ownership to the work_struct */
255 * is nothing preventing the synchronize_sched() task that is 441 srcu_reschedule(sp);
256 * taking the early exit from freeing a data structure that 442 } else {
257 * is still being referenced (out of order) by the task 443 rcu_batch_queue(&sp->batch_queue, head);
258 * doing the srcu_read_unlock(). 444 spin_unlock_irq(&sp->queue_lock);
259 * 445 }
260 * Alternatively, the comparison with "2" on the early exit
261 * could be changed to "3", but this increases synchronize_srcu()
262 * latency for bulk loads. So the current code is preferred.
263 */
264 446
265 mutex_unlock(&sp->mutex); 447 if (!done)
448 wait_for_completion(&rcu.completion);
266} 449}
267 450
268/** 451/**
@@ -281,7 +464,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
281 */ 464 */
282void synchronize_srcu(struct srcu_struct *sp) 465void synchronize_srcu(struct srcu_struct *sp)
283{ 466{
284 __synchronize_srcu(sp, synchronize_sched); 467 __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT);
285} 468}
286EXPORT_SYMBOL_GPL(synchronize_srcu); 469EXPORT_SYMBOL_GPL(synchronize_srcu);
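
For comparison with the callback-based form above, a hedged sketch of the classic blocking-updater pattern built on synchronize_srcu(), reusing the illustrative my_srcu and struct foo from the call_srcu() sketch; gbl_foo, gbl_lock, and foo_update() are likewise illustrative names.

static struct foo __rcu *gbl_foo;	/* published pointer */
static DEFINE_SPINLOCK(gbl_lock);	/* serializes updaters */

/*
 * Replace the published structure and free the old one only after
 * all pre-existing SRCU readers have finished with it.
 */
static void foo_update(int new_data)
{
	struct foo *newp, *oldp;

	newp = kmalloc(sizeof(*newp), GFP_KERNEL);
	if (!newp)
		return;
	newp->data = new_data;

	spin_lock(&gbl_lock);
	oldp = rcu_dereference_protected(gbl_foo, lockdep_is_held(&gbl_lock));
	rcu_assign_pointer(gbl_foo, newp);
	spin_unlock(&gbl_lock);

	synchronize_srcu(&my_srcu);	/* wait for pre-existing SRCU readers */
	kfree(oldp);
}
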
287 470
@@ -289,18 +472,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
289 * synchronize_srcu_expedited - Brute-force SRCU grace period 472 * synchronize_srcu_expedited - Brute-force SRCU grace period
290 * @sp: srcu_struct with which to synchronize. 473 * @sp: srcu_struct with which to synchronize.
291 * 474 *
292 * Wait for an SRCU grace period to elapse, but use a "big hammer" 475 * Wait for an SRCU grace period to elapse, but be more aggressive about
293 * approach to force the grace period to end quickly. This consumes 476 * spinning rather than blocking when waiting.
294 * significant time on all CPUs and is unfriendly to real-time workloads,
295 * so is thus not recommended for any sort of common-case code. In fact,
296 * if you are using synchronize_srcu_expedited() in a loop, please
297 * restructure your code to batch your updates, and then use a single
298 * synchronize_srcu() instead.
299 * 477 *
300 * Note that it is illegal to call this function while holding any lock 478 * Note that it is illegal to call this function while holding any lock
301 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal 479 * that is acquired by a CPU-hotplug notifier. It is also illegal to call
302 * to call this function from a CPU-hotplug notifier. Failing to observe
303 * these restriction will result in deadlock. It is also illegal to call
304 * synchronize_srcu_expedited() from the corresponding SRCU read-side 480 * synchronize_srcu_expedited() from the corresponding SRCU read-side
305 * critical section; doing so will result in deadlock. However, it is 481 * critical section; doing so will result in deadlock. However, it is
306 * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct 482 * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct
@@ -309,20 +485,166 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
309 */ 485 */
310void synchronize_srcu_expedited(struct srcu_struct *sp) 486void synchronize_srcu_expedited(struct srcu_struct *sp)
311{ 487{
312 __synchronize_srcu(sp, synchronize_sched_expedited); 488 __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT);
313} 489}
314EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); 490EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
315 491
316/** 492/**
493 * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
494 */
495void srcu_barrier(struct srcu_struct *sp)
496{
497 synchronize_srcu(sp);
498}
499EXPORT_SYMBOL_GPL(srcu_barrier);
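
A hedged teardown sketch: a user of call_srcu() would presumably need to drain outstanding callbacks before tearing the srcu_struct down, which is what srcu_barrier() is for. Here my_exit() is an illustrative name, and my_srcu is assumed to be quiescent (no further call_srcu() invocations).

static void my_exit(void)
{
	srcu_barrier(&my_srcu);		/* wait out in-flight call_srcu() callbacks */
	cleanup_srcu_struct(&my_srcu);	/* now safe to tear down the srcu_struct */
}
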
500
501/**
317 * srcu_batches_completed - return batches completed. 502 * srcu_batches_completed - return batches completed.
318 * @sp: srcu_struct on which to report batch completion. 503 * @sp: srcu_struct on which to report batch completion.
319 * 504 *
320 * Report the number of batches, correlated with, but not necessarily 505 * Report the number of batches, correlated with, but not necessarily
321 * precisely the same as, the number of grace periods that have elapsed. 506 * precisely the same as, the number of grace periods that have elapsed.
322 */ 507 */
323
324long srcu_batches_completed(struct srcu_struct *sp) 508long srcu_batches_completed(struct srcu_struct *sp)
325{ 509{
326 return sp->completed; 510 return sp->completed;
327} 511}
328EXPORT_SYMBOL_GPL(srcu_batches_completed); 512EXPORT_SYMBOL_GPL(srcu_batches_completed);
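
A trivial, hedged sketch of the sort of ad-hoc instrumentation this accessor permits; my_srcu_report() is an illustrative name only.

static void my_srcu_report(struct srcu_struct *sp)
{
	pr_info("srcu: %ld batches completed\n", srcu_batches_completed(sp));
}
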
513
514#define SRCU_CALLBACK_BATCH 10
515#define SRCU_INTERVAL 1
516
517/*
518 * Move any new SRCU callbacks to the first stage of the SRCU grace
519 * period pipeline.
520 */
521static void srcu_collect_new(struct srcu_struct *sp)
522{
523 if (!rcu_batch_empty(&sp->batch_queue)) {
524 spin_lock_irq(&sp->queue_lock);
525 rcu_batch_move(&sp->batch_check0, &sp->batch_queue);
526 spin_unlock_irq(&sp->queue_lock);
527 }
528}
529
530/*
531 * Core SRCU state machine. Advance callbacks from ->batch_check0 to
532 * ->batch_check1 and then to ->batch_done as readers drain.
533 */
534static void srcu_advance_batches(struct srcu_struct *sp, int trycount)
535{
536 int idx = 1 ^ (sp->completed & 1);
537
538 /*
539 * Because readers might be delayed for an extended period after
540 * fetching ->completed for their index, at any point in time there
541 * might well be readers using both idx=0 and idx=1. We therefore
542 * need to wait for readers to clear from both index values before
543 * invoking a callback.
544 */
545
546 if (rcu_batch_empty(&sp->batch_check0) &&
547 rcu_batch_empty(&sp->batch_check1))
548 return; /* no callbacks need to be advanced */
549
550 if (!try_check_zero(sp, idx, trycount))
551 return; /* failed to advance, will try after SRCU_INTERVAL */
552
553 /*
554 * The callbacks in ->batch_check1 have already been through their
555 * first zero check and flip, back when they were enqueued on
556 * ->batch_check0 in a previous invocation of srcu_advance_batches().
557 * (Presumably try_check_zero() returned false during that
558 * invocation, leaving the callbacks stranded on ->batch_check1.)
559 * They are therefore ready to invoke, so move them to ->batch_done.
560 */
561 rcu_batch_move(&sp->batch_done, &sp->batch_check1);
562
563 if (rcu_batch_empty(&sp->batch_check0))
564 return; /* no callbacks need to be advanced */
565 srcu_flip(sp);
566
567 /*
568 * The callbacks in ->batch_check0 have just finished their
569 * first zero check and flip, so move them to ->batch_check1
570 * for a future check against the other idx.
571 */
572 rcu_batch_move(&sp->batch_check1, &sp->batch_check0);
573
574 /*
575 * SRCU read-side critical sections are normally short, so check
576 * at least twice in quick succession after a flip.
577 */
578 trycount = trycount < 2 ? 2 : trycount;
579 if (!try_check_zero(sp, idx^1, trycount))
580 return; /* failed to advance, will try after SRCU_INTERVAL */
581
582 /*
583 * The callbacks in ->batch_check1 have now waited for all
584 * pre-existing readers using both idx values. They are therefore
585 * ready to invoke, so move them to ->batch_done.
586 */
587 rcu_batch_move(&sp->batch_done, &sp->batch_check1);
588}
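
To connect the above with the reader side: a reader samples an index on entry and releases that same index on exit, and it may be preempted for a long time in between, which is why the state machine waits for both index values to drain before invoking callbacks. A minimal reader sketch follows, reusing the illustrative my_srcu and gbl_foo from the earlier sketches; do_something_with() is a placeholder.

static void foo_reader(void)
{
	int idx;
	struct foo *fp;

	idx = srcu_read_lock(&my_srcu);		/* samples the current index */
	fp = srcu_dereference(gbl_foo, &my_srcu);
	if (fp)
		do_something_with(fp->data);	/* placeholder consumer */
	srcu_read_unlock(&my_srcu, idx);	/* releases the index it sampled */
}
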
589
590/*
591 * Invoke a limited number of SRCU callbacks that have passed through
592 * their grace period. If more callbacks remain, srcu_reschedule() will
593 * requeue the work.
594 */
595static void srcu_invoke_callbacks(struct srcu_struct *sp)
596{
597 int i;
598 struct rcu_head *head;
599
600 for (i = 0; i < SRCU_CALLBACK_BATCH; i++) {
601 head = rcu_batch_dequeue(&sp->batch_done);
602 if (!head)
603 break;
604 local_bh_disable();
605 head->func(head);
606 local_bh_enable();
607 }
608}
609
610/*
611 * Finished one round of SRCU grace-period processing. Start another
612 * round if more SRCU callbacks are queued, otherwise mark SRCU not running.
613 */
614static void srcu_reschedule(struct srcu_struct *sp)
615{
616 bool pending = true;
617
618 if (rcu_batch_empty(&sp->batch_done) &&
619 rcu_batch_empty(&sp->batch_check1) &&
620 rcu_batch_empty(&sp->batch_check0) &&
621 rcu_batch_empty(&sp->batch_queue)) {
622 spin_lock_irq(&sp->queue_lock);
623 if (rcu_batch_empty(&sp->batch_done) &&
624 rcu_batch_empty(&sp->batch_check1) &&
625 rcu_batch_empty(&sp->batch_check0) &&
626 rcu_batch_empty(&sp->batch_queue)) {
627 sp->running = false;
628 pending = false;
629 }
630 spin_unlock_irq(&sp->queue_lock);
631 }
632
633 if (pending)
634 queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL);
635}
636
637/*
638 * This is the work-queue function that handles SRCU grace periods.
639 */
640static void process_srcu(struct work_struct *work)
641{
642 struct srcu_struct *sp;
643
644 sp = container_of(work, struct srcu_struct, work.work);
645
646 srcu_collect_new(sp);
647 srcu_advance_batches(sp, 1);
648 srcu_invoke_callbacks(sp);
649 srcu_reschedule(sp);
650}
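
Since call_srcu() above queues &sp->work, the srcu_struct's delayed work has to be bound to process_srcu() somewhere in the initialization path, which is not part of this hunk. A minimal wiring sketch under that assumption; srcu_setup_work() is a purely illustrative name.

static void srcu_setup_work(struct srcu_struct *sp)
{
	sp->running = false;			/* grace-period machinery starts idle */
	spin_lock_init(&sp->queue_lock);	/* protects the callback batches */
	INIT_DELAYED_WORK(&sp->work, process_srcu);
}
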
diff --git a/kernel/timer.c b/kernel/timer.c
index a297ffcf888e..837c552fe838 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -861,7 +861,13 @@ EXPORT_SYMBOL(mod_timer);
861 * 861 *
862 * mod_timer_pinned() is a way to update the expire field of an 862 * mod_timer_pinned() is a way to update the expire field of an
863 * active timer (if the timer is inactive it will be activated) 863 * active timer (if the timer is inactive it will be activated)
864 * and not allow the timer to be migrated to a different CPU. 864 * and to ensure that the timer is scheduled on the current CPU.
865 *
866 * Note that this does not prevent the timer from being migrated
867 * when the current CPU goes offline. If this is a problem for
868 * you, use CPU-hotplug notifiers to handle it correctly, for
869 * example, cancelling the timer when the corresponding CPU goes
870 * offline.
865 * 871 *
866 * mod_timer_pinned(timer, expires) is equivalent to: 872 * mod_timer_pinned(timer, expires) is equivalent to:
867 * 873 *
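
A hedged sketch of the workaround the new comment recommends: pin a timer with mod_timer_pinned() and cancel it from a CPU-hotplug notifier when that CPU goes down. All of my_timer, my_timer_cpu, my_arm_timer(), and my_cpu_callback() are illustrative names.

#include <linux/cpu.h>
#include <linux/timer.h>

static struct timer_list my_timer;	/* assumed set up with setup_timer() elsewhere */
static int my_timer_cpu = -1;

/* Call with preemption disabled so the timer is pinned to this CPU. */
static void my_arm_timer(unsigned long delay)
{
	my_timer_cpu = smp_processor_id();
	mod_timer_pinned(&my_timer, jiffies + delay);
}

static int my_cpu_callback(struct notifier_block *nb,
			   unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;

	if ((action & ~CPU_TASKS_FROZEN) == CPU_DOWN_PREPARE &&
	    cpu == my_timer_cpu)
		del_timer_sync(&my_timer);	/* keep it from migrating away */
	return NOTIFY_OK;
}

static struct notifier_block my_cpu_nb = {
	.notifier_call = my_cpu_callback,
};
/* register_cpu_notifier(&my_cpu_nb) would be invoked at init time. */
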
diff --git a/lib/list_debug.c b/lib/list_debug.c
index 982b850d4e7a..3810b481f940 100644
--- a/lib/list_debug.c
+++ b/lib/list_debug.c
@@ -10,6 +10,7 @@
10#include <linux/list.h> 10#include <linux/list.h>
11#include <linux/bug.h> 11#include <linux/bug.h>
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/rculist.h>
13 14
14/* 15/*
15 * Insert a new entry between two known consecutive entries. 16 * Insert a new entry between two known consecutive entries.
@@ -75,3 +76,24 @@ void list_del(struct list_head *entry)
75 entry->prev = LIST_POISON2; 76 entry->prev = LIST_POISON2;
76} 77}
77EXPORT_SYMBOL(list_del); 78EXPORT_SYMBOL(list_del);
79
80/*
81 * RCU variants.
82 */
83void __list_add_rcu(struct list_head *new,
84 struct list_head *prev, struct list_head *next)
85{
86 WARN(next->prev != prev,
87 "list_add_rcu corruption. next->prev should be "
88 "prev (%p), but was %p. (next=%p).\n",
89 prev, next->prev, next);
90 WARN(prev->next != next,
91 "list_add_rcu corruption. prev->next should be "
92 "next (%p), but was %p. (prev=%p).\n",
93 next, prev->next, prev);
94 new->next = next;
95 new->prev = prev;
96 rcu_assign_pointer(list_next_rcu(prev), new);
97 next->prev = new;
98}
99EXPORT_SYMBOL(__list_add_rcu);
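
For context, a hedged sketch of the pattern these checks guard: a writer publishing entries with list_add_rcu() (which now funnels through the checked __list_add_rcu() above) under a lock, and a reader traversing locklessly under rcu_read_lock(). struct foo, my_list, my_lock, foo_add(), and foo_present() are illustrative names.

#include <linux/rculist.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct foo {
	int data;
	struct list_head list;
};

static LIST_HEAD(my_list);
static DEFINE_SPINLOCK(my_lock);

/* Writer: publish a new entry. */
static int foo_add(int data)
{
	struct foo *fp = kmalloc(sizeof(*fp), GFP_KERNEL);

	if (!fp)
		return -ENOMEM;
	fp->data = data;
	spin_lock(&my_lock);
	list_add_rcu(&fp->list, &my_list);
	spin_unlock(&my_lock);
	return 0;
}

/* Reader: lockless traversal under rcu_read_lock(). */
static bool foo_present(int data)
{
	struct foo *fp;
	bool found = false;

	rcu_read_lock();
	list_for_each_entry_rcu(fp, &my_list, list) {
		if (fp->data == data) {
			found = true;
			break;
		}
	}
	rcu_read_unlock();
	return found;
}
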