author     Linus Torvalds <torvalds@linux-foundation.org>  2012-05-21 22:26:51 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2012-05-21 22:26:51 -0400
commit     226da0dbc84ed97f448523e2a4cb91c27fa68ed9 (patch)
tree       3969a9f612cd5596747ecde2066e65eacbab7d2e /kernel
parent     5ec29e3149d800e6db83c1b6ff441daf319cbbe2 (diff)
parent     2d84e023cb5ec00403ff5d447533c6fd58fcc7ff (diff)
Merge branch 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull RCU changes from Ingo Molnar:
 "This is the v3.5 RCU tree from Paul E. McKenney:

  1) A set of improvements and fixes to the RCU_FAST_NO_HZ feature (with
     more on the way for 3.6).  Posted to LKML:
     https://lkml.org/lkml/2012/4/23/324 (commits 1-3 and 5),
     https://lkml.org/lkml/2012/4/16/611 (commit 4),
     https://lkml.org/lkml/2012/4/30/390 (commit 6), and
     https://lkml.org/lkml/2012/5/4/410 (commit 7, combined with the
     other commits for the convenience of the tester).

  2) Changes to make rcu_barrier() avoid disrupting execution of CPUs
     that have no RCU callbacks.  Posted to LKML:
     https://lkml.org/lkml/2012/4/23/322.

  3) A couple of commits that improve the efficiency of the interaction
     between preemptible RCU and the scheduler, these two being all that
     survived an abortive attempt to allow preemptible RCU's
     __rcu_read_lock() to be inlined.  The full set was posted to LKML at
     https://lkml.org/lkml/2012/4/14/143, and the first and third patches
     of that set remain.

  4) Lai Jiangshan's algorithmic implementation of SRCU, which includes
     call_srcu() and srcu_barrier().  A major feature of this new
     implementation is that synchronize_srcu() no longer disturbs the
     execution of other CPUs.  This work is based on earlier
     implementations by Peter Zijlstra and Paul E. McKenney.  Posted to
     LKML: https://lkml.org/lkml/2012/2/22/82.

  5) A number of miscellaneous bug fixes and improvements which were
     posted to LKML at: https://lkml.org/lkml/2012/4/23/353 with
     subsequent updates posted to LKML."

* 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits)
  rcu: Make rcu_barrier() less disruptive
  rcu: Explicitly initialize RCU_FAST_NO_HZ per-CPU variables
  rcu: Make RCU_FAST_NO_HZ handle timer migration
  rcu: Update RCU maintainership
  rcu: Make exit_rcu() more precise and consolidate
  rcu: Move PREEMPT_RCU preemption to switch_to() invocation
  rcu: Ensure that RCU_FAST_NO_HZ timers expire on correct CPU
  rcu: Add rcutorture test for call_srcu()
  rcu: Implement per-domain single-threaded call_srcu() state machine
  rcu: Use single value to handle expedited SRCU grace periods
  rcu: Improve srcu_readers_active_idx()'s cache locality
  rcu: Remove unused srcu_barrier()
  rcu: Implement a variant of Peter's SRCU algorithm
  rcu: Improve SRCU's wait_idx() comments
  rcu: Flip ->completed only once per SRCU grace period
  rcu: Increment upper bit only for srcu_read_lock()
  rcu: Remove fast check path from __synchronize_srcu()
  rcu: Direct algorithmic SRCU implementation
  rcu: Introduce rcutorture testing for rcu_barrier()
  timer: Fix mod_timer_pinned() header comment
  ...
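To make item 4 above concrete, here is a minimal sketch of how a subsystem might use the new asynchronous SRCU interface pulled in by this merge: call_srcu() to post a callback without blocking, and srcu_barrier() to flush all posted callbacks before the SRCU domain is torn down. This is not code from the merge itself; my_srcu, struct foo, foo_free_cb(), foo_retire(), and the module boilerplate are hypothetical names used only for illustration.

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/srcu.h>

static struct srcu_struct my_srcu;      /* hypothetical SRCU domain */

struct foo {                            /* hypothetical element type */
        struct rcu_head rh;
};

static void foo_free_cb(struct rcu_head *rh)
{
        kfree(container_of(rh, struct foo, rh));
}

/*
 * Free @p once all pre-existing SRCU readers of my_srcu have finished.
 * Unlike synchronize_srcu(), this does not block the caller.  (Called
 * from elsewhere in the hypothetical subsystem.)
 */
static void foo_retire(struct foo *p)
{
        call_srcu(&my_srcu, &p->rh, foo_free_cb);
}

static int __init foo_init(void)
{
        return init_srcu_struct(&my_srcu);
}

static void __exit foo_exit(void)
{
        srcu_barrier(&my_srcu);         /* wait for all call_srcu() callbacks */
        cleanup_srcu_struct(&my_srcu);
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");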
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/rcupdate.c        |  28
-rw-r--r--  kernel/rcutiny_plugin.h  |  16
-rw-r--r--  kernel/rcutorture.c      | 257
-rw-r--r--  kernel/rcutree.c         | 332
-rw-r--r--  kernel/rcutree.h         |  23
-rw-r--r--  kernel/rcutree_plugin.h  | 154
-rw-r--r--  kernel/rcutree_trace.c   |   4
-rw-r--r--  kernel/sched/core.c      |   1
-rw-r--r--  kernel/srcu.c            | 548
-rw-r--r--  kernel/timer.c           |   8
10 files changed, 1073 insertions, 298 deletions
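Item 2 of the commit message and the new n_barrier_cbs rcutorture option in the diff below both revolve around rcu_barrier(), whose canonical use is waiting out pending call_rcu() callbacks before a module is unloaded; the rcutree.c changes in this merge make that wait avoid disturbing CPUs with no callbacks queued. A generic sketch of the pattern follows — not code from this merge; struct bar, bar_free_cb(), bar_retire(), and bar_exit() are hypothetical names.

#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct bar {                            /* hypothetical element type */
        struct rcu_head rh;
};

static void bar_free_cb(struct rcu_head *rh)
{
        kfree(container_of(rh, struct bar, rh));
}

/* Called from elsewhere in the hypothetical module; the callback may
 * still be queued, but not yet invoked, when rmmod runs. */
static void bar_retire(struct bar *p)
{
        call_rcu(&p->rh, bar_free_cb);
}

static void __exit bar_exit(void)
{
        /*
         * Wait for every callback posted by bar_retire() to be invoked;
         * otherwise one could run after bar_free_cb()'s text is unloaded.
         */
        rcu_barrier();
}

module_exit(bar_exit);
MODULE_LICENSE("GPL");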
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a86f1741cc27..95cba41ce1e9 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -51,6 +51,34 @@
51 51
52#include "rcu.h" 52#include "rcu.h"
53 53
54#ifdef CONFIG_PREEMPT_RCU
55
56/*
57 * Check for a task exiting while in a preemptible-RCU read-side
58 * critical section, clean up if so. No need to issue warnings,
59 * as debug_check_no_locks_held() already does this if lockdep
60 * is enabled.
61 */
62void exit_rcu(void)
63{
64 struct task_struct *t = current;
65
66 if (likely(list_empty(&current->rcu_node_entry)))
67 return;
68 t->rcu_read_lock_nesting = 1;
69 barrier();
70 t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
71 __rcu_read_unlock();
72}
73
74#else /* #ifdef CONFIG_PREEMPT_RCU */
75
76void exit_rcu(void)
77{
78}
79
80#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
81
54#ifdef CONFIG_DEBUG_LOCK_ALLOC 82#ifdef CONFIG_DEBUG_LOCK_ALLOC
55static struct lock_class_key rcu_lock_key; 83static struct lock_class_key rcu_lock_key;
56struct lockdep_map rcu_lock_map = 84struct lockdep_map rcu_lock_map =
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 22ecea0dfb62..fc31a2d65100 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -851,22 +851,6 @@ int rcu_preempt_needs_cpu(void)
851 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; 851 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
852} 852}
853 853
854/*
855 * Check for a task exiting while in a preemptible -RCU read-side
856 * critical section, clean up if so. No need to issue warnings,
857 * as debug_check_no_locks_held() already does this if lockdep
858 * is enabled.
859 */
860void exit_rcu(void)
861{
862 struct task_struct *t = current;
863
864 if (t->rcu_read_lock_nesting == 0)
865 return;
866 t->rcu_read_lock_nesting = 1;
867 __rcu_read_unlock();
868}
869
870#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 854#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
871 855
872#ifdef CONFIG_RCU_TRACE 856#ifdef CONFIG_RCU_TRACE
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index a89b381a8c6e..e66b34ab7555 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -64,6 +64,7 @@ static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ 64static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff; /* Hold time within burst (us). */ 65static int fqs_holdoff; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 66static int fqs_stutter = 3; /* Wait time between bursts (s). */
67static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */
67static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ 68static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
68static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ 69static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */
69static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ 70static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
@@ -96,6 +97,8 @@ module_param(fqs_holdoff, int, 0444);
96MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 97MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
97module_param(fqs_stutter, int, 0444); 98module_param(fqs_stutter, int, 0444);
98MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 99MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
100module_param(n_barrier_cbs, int, 0444);
101MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing");
99module_param(onoff_interval, int, 0444); 102module_param(onoff_interval, int, 0444);
100MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); 103MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
101module_param(onoff_holdoff, int, 0444); 104module_param(onoff_holdoff, int, 0444);
@@ -139,6 +142,8 @@ static struct task_struct *shutdown_task;
139static struct task_struct *onoff_task; 142static struct task_struct *onoff_task;
140#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 143#endif /* #ifdef CONFIG_HOTPLUG_CPU */
141static struct task_struct *stall_task; 144static struct task_struct *stall_task;
145static struct task_struct **barrier_cbs_tasks;
146static struct task_struct *barrier_task;
142 147
143#define RCU_TORTURE_PIPE_LEN 10 148#define RCU_TORTURE_PIPE_LEN 10
144 149
@@ -164,6 +169,7 @@ static atomic_t n_rcu_torture_alloc_fail;
164static atomic_t n_rcu_torture_free; 169static atomic_t n_rcu_torture_free;
165static atomic_t n_rcu_torture_mberror; 170static atomic_t n_rcu_torture_mberror;
166static atomic_t n_rcu_torture_error; 171static atomic_t n_rcu_torture_error;
172static long n_rcu_torture_barrier_error;
167static long n_rcu_torture_boost_ktrerror; 173static long n_rcu_torture_boost_ktrerror;
168static long n_rcu_torture_boost_rterror; 174static long n_rcu_torture_boost_rterror;
169static long n_rcu_torture_boost_failure; 175static long n_rcu_torture_boost_failure;
@@ -173,6 +179,8 @@ static long n_offline_attempts;
173static long n_offline_successes; 179static long n_offline_successes;
174static long n_online_attempts; 180static long n_online_attempts;
175static long n_online_successes; 181static long n_online_successes;
182static long n_barrier_attempts;
183static long n_barrier_successes;
176static struct list_head rcu_torture_removed; 184static struct list_head rcu_torture_removed;
177static cpumask_var_t shuffle_tmp_mask; 185static cpumask_var_t shuffle_tmp_mask;
178 186
@@ -197,6 +205,10 @@ static unsigned long shutdown_time; /* jiffies to system shutdown. */
197static unsigned long boost_starttime; /* jiffies of next boost test start. */ 205static unsigned long boost_starttime; /* jiffies of next boost test start. */
198DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 206DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
199 /* and boost task create/destroy. */ 207 /* and boost task create/destroy. */
208static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */
209static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
210static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
211static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
200 212
201/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ 213/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
202 214
@@ -327,6 +339,7 @@ struct rcu_torture_ops {
327 int (*completed)(void); 339 int (*completed)(void);
328 void (*deferred_free)(struct rcu_torture *p); 340 void (*deferred_free)(struct rcu_torture *p);
329 void (*sync)(void); 341 void (*sync)(void);
342 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
330 void (*cb_barrier)(void); 343 void (*cb_barrier)(void);
331 void (*fqs)(void); 344 void (*fqs)(void);
332 int (*stats)(char *page); 345 int (*stats)(char *page);
@@ -417,6 +430,7 @@ static struct rcu_torture_ops rcu_ops = {
417 .completed = rcu_torture_completed, 430 .completed = rcu_torture_completed,
418 .deferred_free = rcu_torture_deferred_free, 431 .deferred_free = rcu_torture_deferred_free,
419 .sync = synchronize_rcu, 432 .sync = synchronize_rcu,
433 .call = call_rcu,
420 .cb_barrier = rcu_barrier, 434 .cb_barrier = rcu_barrier,
421 .fqs = rcu_force_quiescent_state, 435 .fqs = rcu_force_quiescent_state,
422 .stats = NULL, 436 .stats = NULL,
@@ -460,6 +474,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
460 .completed = rcu_torture_completed, 474 .completed = rcu_torture_completed,
461 .deferred_free = rcu_sync_torture_deferred_free, 475 .deferred_free = rcu_sync_torture_deferred_free,
462 .sync = synchronize_rcu, 476 .sync = synchronize_rcu,
477 .call = NULL,
463 .cb_barrier = NULL, 478 .cb_barrier = NULL,
464 .fqs = rcu_force_quiescent_state, 479 .fqs = rcu_force_quiescent_state,
465 .stats = NULL, 480 .stats = NULL,
@@ -477,6 +492,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
477 .completed = rcu_no_completed, 492 .completed = rcu_no_completed,
478 .deferred_free = rcu_sync_torture_deferred_free, 493 .deferred_free = rcu_sync_torture_deferred_free,
479 .sync = synchronize_rcu_expedited, 494 .sync = synchronize_rcu_expedited,
495 .call = NULL,
480 .cb_barrier = NULL, 496 .cb_barrier = NULL,
481 .fqs = rcu_force_quiescent_state, 497 .fqs = rcu_force_quiescent_state,
482 .stats = NULL, 498 .stats = NULL,
@@ -519,6 +535,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
519 .completed = rcu_bh_torture_completed, 535 .completed = rcu_bh_torture_completed,
520 .deferred_free = rcu_bh_torture_deferred_free, 536 .deferred_free = rcu_bh_torture_deferred_free,
521 .sync = synchronize_rcu_bh, 537 .sync = synchronize_rcu_bh,
538 .call = call_rcu_bh,
522 .cb_barrier = rcu_barrier_bh, 539 .cb_barrier = rcu_barrier_bh,
523 .fqs = rcu_bh_force_quiescent_state, 540 .fqs = rcu_bh_force_quiescent_state,
524 .stats = NULL, 541 .stats = NULL,
@@ -535,6 +552,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
535 .completed = rcu_bh_torture_completed, 552 .completed = rcu_bh_torture_completed,
536 .deferred_free = rcu_sync_torture_deferred_free, 553 .deferred_free = rcu_sync_torture_deferred_free,
537 .sync = synchronize_rcu_bh, 554 .sync = synchronize_rcu_bh,
555 .call = NULL,
538 .cb_barrier = NULL, 556 .cb_barrier = NULL,
539 .fqs = rcu_bh_force_quiescent_state, 557 .fqs = rcu_bh_force_quiescent_state,
540 .stats = NULL, 558 .stats = NULL,
@@ -551,6 +569,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = {
551 .completed = rcu_bh_torture_completed, 569 .completed = rcu_bh_torture_completed,
552 .deferred_free = rcu_sync_torture_deferred_free, 570 .deferred_free = rcu_sync_torture_deferred_free,
553 .sync = synchronize_rcu_bh_expedited, 571 .sync = synchronize_rcu_bh_expedited,
572 .call = NULL,
554 .cb_barrier = NULL, 573 .cb_barrier = NULL,
555 .fqs = rcu_bh_force_quiescent_state, 574 .fqs = rcu_bh_force_quiescent_state,
556 .stats = NULL, 575 .stats = NULL,
@@ -606,6 +625,11 @@ static int srcu_torture_completed(void)
606 return srcu_batches_completed(&srcu_ctl); 625 return srcu_batches_completed(&srcu_ctl);
607} 626}
608 627
628static void srcu_torture_deferred_free(struct rcu_torture *rp)
629{
630 call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb);
631}
632
609static void srcu_torture_synchronize(void) 633static void srcu_torture_synchronize(void)
610{ 634{
611 synchronize_srcu(&srcu_ctl); 635 synchronize_srcu(&srcu_ctl);
@@ -620,7 +644,7 @@ static int srcu_torture_stats(char *page)
620 cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", 644 cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):",
621 torture_type, TORTURE_FLAG, idx); 645 torture_type, TORTURE_FLAG, idx);
622 for_each_possible_cpu(cpu) { 646 for_each_possible_cpu(cpu) {
623 cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, 647 cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu,
624 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], 648 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx],
625 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); 649 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]);
626 } 650 }
@@ -635,13 +659,29 @@ static struct rcu_torture_ops srcu_ops = {
635 .read_delay = srcu_read_delay, 659 .read_delay = srcu_read_delay,
636 .readunlock = srcu_torture_read_unlock, 660 .readunlock = srcu_torture_read_unlock,
637 .completed = srcu_torture_completed, 661 .completed = srcu_torture_completed,
638 .deferred_free = rcu_sync_torture_deferred_free, 662 .deferred_free = srcu_torture_deferred_free,
639 .sync = srcu_torture_synchronize, 663 .sync = srcu_torture_synchronize,
664 .call = NULL,
640 .cb_barrier = NULL, 665 .cb_barrier = NULL,
641 .stats = srcu_torture_stats, 666 .stats = srcu_torture_stats,
642 .name = "srcu" 667 .name = "srcu"
643}; 668};
644 669
670static struct rcu_torture_ops srcu_sync_ops = {
671 .init = srcu_torture_init,
672 .cleanup = srcu_torture_cleanup,
673 .readlock = srcu_torture_read_lock,
674 .read_delay = srcu_read_delay,
675 .readunlock = srcu_torture_read_unlock,
676 .completed = srcu_torture_completed,
677 .deferred_free = rcu_sync_torture_deferred_free,
678 .sync = srcu_torture_synchronize,
679 .call = NULL,
680 .cb_barrier = NULL,
681 .stats = srcu_torture_stats,
682 .name = "srcu_sync"
683};
684
645static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) 685static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
646{ 686{
647 return srcu_read_lock_raw(&srcu_ctl); 687 return srcu_read_lock_raw(&srcu_ctl);
@@ -659,13 +699,29 @@ static struct rcu_torture_ops srcu_raw_ops = {
659 .read_delay = srcu_read_delay, 699 .read_delay = srcu_read_delay,
660 .readunlock = srcu_torture_read_unlock_raw, 700 .readunlock = srcu_torture_read_unlock_raw,
661 .completed = srcu_torture_completed, 701 .completed = srcu_torture_completed,
662 .deferred_free = rcu_sync_torture_deferred_free, 702 .deferred_free = srcu_torture_deferred_free,
663 .sync = srcu_torture_synchronize, 703 .sync = srcu_torture_synchronize,
704 .call = NULL,
664 .cb_barrier = NULL, 705 .cb_barrier = NULL,
665 .stats = srcu_torture_stats, 706 .stats = srcu_torture_stats,
666 .name = "srcu_raw" 707 .name = "srcu_raw"
667}; 708};
668 709
710static struct rcu_torture_ops srcu_raw_sync_ops = {
711 .init = srcu_torture_init,
712 .cleanup = srcu_torture_cleanup,
713 .readlock = srcu_torture_read_lock_raw,
714 .read_delay = srcu_read_delay,
715 .readunlock = srcu_torture_read_unlock_raw,
716 .completed = srcu_torture_completed,
717 .deferred_free = rcu_sync_torture_deferred_free,
718 .sync = srcu_torture_synchronize,
719 .call = NULL,
720 .cb_barrier = NULL,
721 .stats = srcu_torture_stats,
722 .name = "srcu_raw_sync"
723};
724
669static void srcu_torture_synchronize_expedited(void) 725static void srcu_torture_synchronize_expedited(void)
670{ 726{
671 synchronize_srcu_expedited(&srcu_ctl); 727 synchronize_srcu_expedited(&srcu_ctl);
@@ -680,6 +736,7 @@ static struct rcu_torture_ops srcu_expedited_ops = {
680 .completed = srcu_torture_completed, 736 .completed = srcu_torture_completed,
681 .deferred_free = rcu_sync_torture_deferred_free, 737 .deferred_free = rcu_sync_torture_deferred_free,
682 .sync = srcu_torture_synchronize_expedited, 738 .sync = srcu_torture_synchronize_expedited,
739 .call = NULL,
683 .cb_barrier = NULL, 740 .cb_barrier = NULL,
684 .stats = srcu_torture_stats, 741 .stats = srcu_torture_stats,
685 .name = "srcu_expedited" 742 .name = "srcu_expedited"
@@ -1129,7 +1186,8 @@ rcu_torture_printk(char *page)
1129 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " 1186 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
1130 "rtmbe: %d rtbke: %ld rtbre: %ld " 1187 "rtmbe: %d rtbke: %ld rtbre: %ld "
1131 "rtbf: %ld rtb: %ld nt: %ld " 1188 "rtbf: %ld rtb: %ld nt: %ld "
1132 "onoff: %ld/%ld:%ld/%ld", 1189 "onoff: %ld/%ld:%ld/%ld "
1190 "barrier: %ld/%ld:%ld",
1133 rcu_torture_current, 1191 rcu_torture_current,
1134 rcu_torture_current_version, 1192 rcu_torture_current_version,
1135 list_empty(&rcu_torture_freelist), 1193 list_empty(&rcu_torture_freelist),
@@ -1145,14 +1203,17 @@ rcu_torture_printk(char *page)
1145 n_online_successes, 1203 n_online_successes,
1146 n_online_attempts, 1204 n_online_attempts,
1147 n_offline_successes, 1205 n_offline_successes,
1148 n_offline_attempts); 1206 n_offline_attempts,
1207 n_barrier_successes,
1208 n_barrier_attempts,
1209 n_rcu_torture_barrier_error);
1210 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
1149 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1211 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1212 n_rcu_torture_barrier_error != 0 ||
1150 n_rcu_torture_boost_ktrerror != 0 || 1213 n_rcu_torture_boost_ktrerror != 0 ||
1151 n_rcu_torture_boost_rterror != 0 || 1214 n_rcu_torture_boost_rterror != 0 ||
1152 n_rcu_torture_boost_failure != 0) 1215 n_rcu_torture_boost_failure != 0 ||
1153 cnt += sprintf(&page[cnt], " !!!"); 1216 i > 1) {
1154 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
1155 if (i > 1) {
1156 cnt += sprintf(&page[cnt], "!!! "); 1217 cnt += sprintf(&page[cnt], "!!! ");
1157 atomic_inc(&n_rcu_torture_error); 1218 atomic_inc(&n_rcu_torture_error);
1158 WARN_ON_ONCE(1); 1219 WARN_ON_ONCE(1);
@@ -1337,6 +1398,7 @@ static void rcutorture_booster_cleanup(int cpu)
1337 1398
1338 /* This must be outside of the mutex, otherwise deadlock! */ 1399 /* This must be outside of the mutex, otherwise deadlock! */
1339 kthread_stop(t); 1400 kthread_stop(t);
1401 boost_tasks[cpu] = NULL;
1340} 1402}
1341 1403
1342static int rcutorture_booster_init(int cpu) 1404static int rcutorture_booster_init(int cpu)
@@ -1484,13 +1546,15 @@ static void rcu_torture_onoff_cleanup(void)
1484 return; 1546 return;
1485 VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); 1547 VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
1486 kthread_stop(onoff_task); 1548 kthread_stop(onoff_task);
1549 onoff_task = NULL;
1487} 1550}
1488 1551
1489#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1552#else /* #ifdef CONFIG_HOTPLUG_CPU */
1490 1553
1491static void 1554static int
1492rcu_torture_onoff_init(void) 1555rcu_torture_onoff_init(void)
1493{ 1556{
1557 return 0;
1494} 1558}
1495 1559
1496static void rcu_torture_onoff_cleanup(void) 1560static void rcu_torture_onoff_cleanup(void)
@@ -1554,6 +1618,152 @@ static void rcu_torture_stall_cleanup(void)
1554 return; 1618 return;
1555 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); 1619 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task.");
1556 kthread_stop(stall_task); 1620 kthread_stop(stall_task);
1621 stall_task = NULL;
1622}
1623
1624/* Callback function for RCU barrier testing. */
1625void rcu_torture_barrier_cbf(struct rcu_head *rcu)
1626{
1627 atomic_inc(&barrier_cbs_invoked);
1628}
1629
1630/* kthread function to register callbacks used to test RCU barriers. */
1631static int rcu_torture_barrier_cbs(void *arg)
1632{
1633 long myid = (long)arg;
1634 struct rcu_head rcu;
1635
1636 init_rcu_head_on_stack(&rcu);
1637 VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started");
1638 set_user_nice(current, 19);
1639 do {
1640 wait_event(barrier_cbs_wq[myid],
1641 atomic_read(&barrier_cbs_count) == n_barrier_cbs ||
1642 kthread_should_stop() ||
1643 fullstop != FULLSTOP_DONTSTOP);
1644 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1645 break;
1646 cur_ops->call(&rcu, rcu_torture_barrier_cbf);
1647 if (atomic_dec_and_test(&barrier_cbs_count))
1648 wake_up(&barrier_wq);
1649 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1650 VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping");
1651 rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
1652 while (!kthread_should_stop())
1653 schedule_timeout_interruptible(1);
1654 cur_ops->cb_barrier();
1655 destroy_rcu_head_on_stack(&rcu);
1656 return 0;
1657}
1658
1659/* kthread function to drive and coordinate RCU barrier testing. */
1660static int rcu_torture_barrier(void *arg)
1661{
1662 int i;
1663
1664 VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting");
1665 do {
1666 atomic_set(&barrier_cbs_invoked, 0);
1667 atomic_set(&barrier_cbs_count, n_barrier_cbs);
1668 /* wake_up() path contains the required barriers. */
1669 for (i = 0; i < n_barrier_cbs; i++)
1670 wake_up(&barrier_cbs_wq[i]);
1671 wait_event(barrier_wq,
1672 atomic_read(&barrier_cbs_count) == 0 ||
1673 kthread_should_stop() ||
1674 fullstop != FULLSTOP_DONTSTOP);
1675 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1676 break;
1677 n_barrier_attempts++;
1678 cur_ops->cb_barrier();
1679 if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
1680 n_rcu_torture_barrier_error++;
1681 WARN_ON_ONCE(1);
1682 }
1683 n_barrier_successes++;
1684 schedule_timeout_interruptible(HZ / 10);
1685 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1686 VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping");
1687 rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
1688 while (!kthread_should_stop())
1689 schedule_timeout_interruptible(1);
1690 return 0;
1691}
1692
1693/* Initialize RCU barrier testing. */
1694static int rcu_torture_barrier_init(void)
1695{
1696 int i;
1697 int ret;
1698
1699 if (n_barrier_cbs == 0)
1700 return 0;
1701 if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) {
1702 printk(KERN_ALERT "%s" TORTURE_FLAG
1703 " Call or barrier ops missing for %s,\n",
1704 torture_type, cur_ops->name);
1705 printk(KERN_ALERT "%s" TORTURE_FLAG
1706 " RCU barrier testing omitted from run.\n",
1707 torture_type);
1708 return 0;
1709 }
1710 atomic_set(&barrier_cbs_count, 0);
1711 atomic_set(&barrier_cbs_invoked, 0);
1712 barrier_cbs_tasks =
1713 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]),
1714 GFP_KERNEL);
1715 barrier_cbs_wq =
1716 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
1717 GFP_KERNEL);
1718 if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0)
1719 return -ENOMEM;
1720 for (i = 0; i < n_barrier_cbs; i++) {
1721 init_waitqueue_head(&barrier_cbs_wq[i]);
1722 barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs,
1723 (void *)(long)i,
1724 "rcu_torture_barrier_cbs");
1725 if (IS_ERR(barrier_cbs_tasks[i])) {
1726 ret = PTR_ERR(barrier_cbs_tasks[i]);
1727 VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs");
1728 barrier_cbs_tasks[i] = NULL;
1729 return ret;
1730 }
1731 }
1732 barrier_task = kthread_run(rcu_torture_barrier, NULL,
1733 "rcu_torture_barrier");
1734 if (IS_ERR(barrier_task)) {
1735 ret = PTR_ERR(barrier_task);
1736 VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier");
1737 barrier_task = NULL;
1738 }
1739 return 0;
1740}
1741
1742/* Clean up after RCU barrier testing. */
1743static void rcu_torture_barrier_cleanup(void)
1744{
1745 int i;
1746
1747 if (barrier_task != NULL) {
1748 VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task");
1749 kthread_stop(barrier_task);
1750 barrier_task = NULL;
1751 }
1752 if (barrier_cbs_tasks != NULL) {
1753 for (i = 0; i < n_barrier_cbs; i++) {
1754 if (barrier_cbs_tasks[i] != NULL) {
1755 VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task");
1756 kthread_stop(barrier_cbs_tasks[i]);
1757 barrier_cbs_tasks[i] = NULL;
1758 }
1759 }
1760 kfree(barrier_cbs_tasks);
1761 barrier_cbs_tasks = NULL;
1762 }
1763 if (barrier_cbs_wq != NULL) {
1764 kfree(barrier_cbs_wq);
1765 barrier_cbs_wq = NULL;
1766 }
1557} 1767}
1558 1768
1559static int rcutorture_cpu_notify(struct notifier_block *self, 1769static int rcutorture_cpu_notify(struct notifier_block *self,
@@ -1598,6 +1808,7 @@ rcu_torture_cleanup(void)
1598 fullstop = FULLSTOP_RMMOD; 1808 fullstop = FULLSTOP_RMMOD;
1599 mutex_unlock(&fullstop_mutex); 1809 mutex_unlock(&fullstop_mutex);
1600 unregister_reboot_notifier(&rcutorture_shutdown_nb); 1810 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1811 rcu_torture_barrier_cleanup();
1601 rcu_torture_stall_cleanup(); 1812 rcu_torture_stall_cleanup();
1602 if (stutter_task) { 1813 if (stutter_task) {
1603 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 1814 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
@@ -1665,6 +1876,7 @@ rcu_torture_cleanup(void)
1665 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); 1876 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
1666 kthread_stop(shutdown_task); 1877 kthread_stop(shutdown_task);
1667 } 1878 }
1879 shutdown_task = NULL;
1668 rcu_torture_onoff_cleanup(); 1880 rcu_torture_onoff_cleanup();
1669 1881
1670 /* Wait for all RCU callbacks to fire. */ 1882 /* Wait for all RCU callbacks to fire. */
@@ -1676,7 +1888,7 @@ rcu_torture_cleanup(void)
1676 1888
1677 if (cur_ops->cleanup) 1889 if (cur_ops->cleanup)
1678 cur_ops->cleanup(); 1890 cur_ops->cleanup();
1679 if (atomic_read(&n_rcu_torture_error)) 1891 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
1680 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); 1892 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1681 else if (n_online_successes != n_online_attempts || 1893 else if (n_online_successes != n_online_attempts ||
1682 n_offline_successes != n_offline_attempts) 1894 n_offline_successes != n_offline_attempts)
@@ -1692,10 +1904,12 @@ rcu_torture_init(void)
1692 int i; 1904 int i;
1693 int cpu; 1905 int cpu;
1694 int firsterr = 0; 1906 int firsterr = 0;
1907 int retval;
1695 static struct rcu_torture_ops *torture_ops[] = 1908 static struct rcu_torture_ops *torture_ops[] =
1696 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1909 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1697 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1910 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
1698 &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops, 1911 &srcu_ops, &srcu_sync_ops, &srcu_raw_ops,
1912 &srcu_raw_sync_ops, &srcu_expedited_ops,
1699 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1913 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1700 1914
1701 mutex_lock(&fullstop_mutex); 1915 mutex_lock(&fullstop_mutex);
@@ -1749,6 +1963,7 @@ rcu_torture_init(void)
1749 atomic_set(&n_rcu_torture_free, 0); 1963 atomic_set(&n_rcu_torture_free, 0);
1750 atomic_set(&n_rcu_torture_mberror, 0); 1964 atomic_set(&n_rcu_torture_mberror, 0);
1751 atomic_set(&n_rcu_torture_error, 0); 1965 atomic_set(&n_rcu_torture_error, 0);
1966 n_rcu_torture_barrier_error = 0;
1752 n_rcu_torture_boost_ktrerror = 0; 1967 n_rcu_torture_boost_ktrerror = 0;
1753 n_rcu_torture_boost_rterror = 0; 1968 n_rcu_torture_boost_rterror = 0;
1754 n_rcu_torture_boost_failure = 0; 1969 n_rcu_torture_boost_failure = 0;
@@ -1872,7 +2087,6 @@ rcu_torture_init(void)
1872 test_boost_duration = 2; 2087 test_boost_duration = 2;
1873 if ((test_boost == 1 && cur_ops->can_boost) || 2088 if ((test_boost == 1 && cur_ops->can_boost) ||
1874 test_boost == 2) { 2089 test_boost == 2) {
1875 int retval;
1876 2090
1877 boost_starttime = jiffies + test_boost_interval * HZ; 2091 boost_starttime = jiffies + test_boost_interval * HZ;
1878 register_cpu_notifier(&rcutorture_cpu_nb); 2092 register_cpu_notifier(&rcutorture_cpu_nb);
@@ -1897,9 +2111,22 @@ rcu_torture_init(void)
1897 goto unwind; 2111 goto unwind;
1898 } 2112 }
1899 } 2113 }
1900 rcu_torture_onoff_init(); 2114 i = rcu_torture_onoff_init();
2115 if (i != 0) {
2116 firsterr = i;
2117 goto unwind;
2118 }
1901 register_reboot_notifier(&rcutorture_shutdown_nb); 2119 register_reboot_notifier(&rcutorture_shutdown_nb);
1902 rcu_torture_stall_init(); 2120 i = rcu_torture_stall_init();
2121 if (i != 0) {
2122 firsterr = i;
2123 goto unwind;
2124 }
2125 retval = rcu_torture_barrier_init();
2126 if (retval != 0) {
2127 firsterr = retval;
2128 goto unwind;
2129 }
1903 rcutorture_record_test_transition(); 2130 rcutorture_record_test_transition();
1904 mutex_unlock(&fullstop_mutex); 2131 mutex_unlock(&fullstop_mutex);
1905 return 0; 2132 return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d0c5baf1ab18..0da7b88d92d0 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -75,6 +75,8 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
75 .gpnum = -300, \ 75 .gpnum = -300, \
76 .completed = -300, \ 76 .completed = -300, \
77 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ 77 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
78 .orphan_nxttail = &structname##_state.orphan_nxtlist, \
79 .orphan_donetail = &structname##_state.orphan_donelist, \
78 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ 80 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
79 .n_force_qs = 0, \ 81 .n_force_qs = 0, \
80 .n_force_qs_ngp = 0, \ 82 .n_force_qs_ngp = 0, \
@@ -145,6 +147,13 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
145unsigned long rcutorture_testseq; 147unsigned long rcutorture_testseq;
146unsigned long rcutorture_vernum; 148unsigned long rcutorture_vernum;
147 149
150/* State information for rcu_barrier() and friends. */
151
152static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
153static atomic_t rcu_barrier_cpu_count;
154static DEFINE_MUTEX(rcu_barrier_mutex);
155static struct completion rcu_barrier_completion;
156
148/* 157/*
149 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 158 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
150 * permit this function to be invoked without holding the root rcu_node 159 * permit this function to be invoked without holding the root rcu_node
@@ -192,7 +201,6 @@ void rcu_note_context_switch(int cpu)
192{ 201{
193 trace_rcu_utilization("Start context switch"); 202 trace_rcu_utilization("Start context switch");
194 rcu_sched_qs(cpu); 203 rcu_sched_qs(cpu);
195 rcu_preempt_note_context_switch(cpu);
196 trace_rcu_utilization("End context switch"); 204 trace_rcu_utilization("End context switch");
197} 205}
198EXPORT_SYMBOL_GPL(rcu_note_context_switch); 206EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -1311,95 +1319,133 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1311#ifdef CONFIG_HOTPLUG_CPU 1319#ifdef CONFIG_HOTPLUG_CPU
1312 1320
1313/* 1321/*
1314 * Move a dying CPU's RCU callbacks to online CPU's callback list. 1322 * Send the specified CPU's RCU callbacks to the orphanage. The
1315 * Also record a quiescent state for this CPU for the current grace period. 1323 * specified CPU must be offline, and the caller must hold the
1316 * Synchronization and interrupt disabling are not required because 1324 * ->onofflock.
1317 * this function executes in stop_machine() context. Therefore, cleanup
1318 * operations that might block must be done later from the CPU_DEAD
1319 * notifier.
1320 *
1321 * Note that the outgoing CPU's bit has already been cleared in the
1322 * cpu_online_mask. This allows us to randomly pick a callback
1323 * destination from the bits set in that mask.
1324 */ 1325 */
1325static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 1326static void
1327rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1328 struct rcu_node *rnp, struct rcu_data *rdp)
1326{ 1329{
1327 int i; 1330 int i;
1328 unsigned long mask;
1329 int receive_cpu = cpumask_any(cpu_online_mask);
1330 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1331 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
1332 RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */
1333 1331
1334 /* First, adjust the counts. */ 1332 /*
1333 * Orphan the callbacks. First adjust the counts. This is safe
1334 * because ->onofflock excludes _rcu_barrier()'s adoption of
1335 * the callbacks, thus no memory barrier is required.
1336 */
1335 if (rdp->nxtlist != NULL) { 1337 if (rdp->nxtlist != NULL) {
1336 receive_rdp->qlen_lazy += rdp->qlen_lazy; 1338 rsp->qlen_lazy += rdp->qlen_lazy;
1337 receive_rdp->qlen += rdp->qlen; 1339 rsp->qlen += rdp->qlen;
1340 rdp->n_cbs_orphaned += rdp->qlen;
1338 rdp->qlen_lazy = 0; 1341 rdp->qlen_lazy = 0;
1339 rdp->qlen = 0; 1342 rdp->qlen = 0;
1340 } 1343 }
1341 1344
1342 /* 1345 /*
1343 * Next, move ready-to-invoke callbacks to be invoked on some 1346 * Next, move those callbacks still needing a grace period to
1344 * other CPU. These will not be required to pass through another 1347 * the orphanage, where some other CPU will pick them up.
1345 * grace period: They are done, regardless of CPU. 1348 * Some of the callbacks might have gone partway through a grace
1349 * period, but that is too bad. They get to start over because we
1350 * cannot assume that grace periods are synchronized across CPUs.
1351 * We don't bother updating the ->nxttail[] array yet, instead
1352 * we just reset the whole thing later on.
1346 */ 1353 */
1347 if (rdp->nxtlist != NULL && 1354 if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) {
1348 rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) { 1355 *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL];
1349 struct rcu_head *oldhead; 1356 rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL];
1350 struct rcu_head **oldtail; 1357 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
1351 struct rcu_head **newtail;
1352
1353 oldhead = rdp->nxtlist;
1354 oldtail = receive_rdp->nxttail[RCU_DONE_TAIL];
1355 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1356 *rdp->nxttail[RCU_DONE_TAIL] = *oldtail;
1357 *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead;
1358 newtail = rdp->nxttail[RCU_DONE_TAIL];
1359 for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) {
1360 if (receive_rdp->nxttail[i] == oldtail)
1361 receive_rdp->nxttail[i] = newtail;
1362 if (rdp->nxttail[i] == newtail)
1363 rdp->nxttail[i] = &rdp->nxtlist;
1364 }
1365 } 1358 }
1366 1359
1367 /* 1360 /*
1368 * Finally, put the rest of the callbacks at the end of the list. 1361 * Then move the ready-to-invoke callbacks to the orphanage,
1369 * The ones that made it partway through get to start over: We 1362 * where some other CPU will pick them up. These will not be
1370 * cannot assume that grace periods are synchronized across CPUs. 1363 * required to pass though another grace period: They are done.
1371 * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but
1372 * this does not seem compelling. Not yet, anyway.)
1373 */ 1364 */
1374 if (rdp->nxtlist != NULL) { 1365 if (rdp->nxtlist != NULL) {
1375 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; 1366 *rsp->orphan_donetail = rdp->nxtlist;
1376 receive_rdp->nxttail[RCU_NEXT_TAIL] = 1367 rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
1377 rdp->nxttail[RCU_NEXT_TAIL];
1378 receive_rdp->n_cbs_adopted += rdp->qlen;
1379 rdp->n_cbs_orphaned += rdp->qlen;
1380
1381 rdp->nxtlist = NULL;
1382 for (i = 0; i < RCU_NEXT_SIZE; i++)
1383 rdp->nxttail[i] = &rdp->nxtlist;
1384 } 1368 }
1385 1369
1370 /* Finally, initialize the rcu_data structure's list to empty. */
1371 rdp->nxtlist = NULL;
1372 for (i = 0; i < RCU_NEXT_SIZE; i++)
1373 rdp->nxttail[i] = &rdp->nxtlist;
1374}
1375
1376/*
1377 * Adopt the RCU callbacks from the specified rcu_state structure's
1378 * orphanage. The caller must hold the ->onofflock.
1379 */
1380static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1381{
1382 int i;
1383 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
1384
1386 /* 1385 /*
1387 * Record a quiescent state for the dying CPU. This is safe 1386 * If there is an rcu_barrier() operation in progress, then
1388 * only because we have already cleared out the callbacks. 1387 * only the task doing that operation is permitted to adopt
1389 * (Otherwise, the RCU core might try to schedule the invocation 1388 * callbacks. To do otherwise breaks rcu_barrier() and friends
1390 * of callbacks on this now-offline CPU, which would be bad.) 1389 * by causing them to fail to wait for the callbacks in the
1390 * orphanage.
1391 */ 1391 */
1392 mask = rdp->grpmask; /* rnp->grplo is constant. */ 1392 if (rsp->rcu_barrier_in_progress &&
1393 rsp->rcu_barrier_in_progress != current)
1394 return;
1395
1396 /* Do the accounting first. */
1397 rdp->qlen_lazy += rsp->qlen_lazy;
1398 rdp->qlen += rsp->qlen;
1399 rdp->n_cbs_adopted += rsp->qlen;
1400 rsp->qlen_lazy = 0;
1401 rsp->qlen = 0;
1402
1403 /*
1404 * We do not need a memory barrier here because the only way we
1405 * can get here if there is an rcu_barrier() in flight is if
1406 * we are the task doing the rcu_barrier().
1407 */
1408
1409 /* First adopt the ready-to-invoke callbacks. */
1410 if (rsp->orphan_donelist != NULL) {
1411 *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL];
1412 *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist;
1413 for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--)
1414 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
1415 rdp->nxttail[i] = rsp->orphan_donetail;
1416 rsp->orphan_donelist = NULL;
1417 rsp->orphan_donetail = &rsp->orphan_donelist;
1418 }
1419
1420 /* And then adopt the callbacks that still need a grace period. */
1421 if (rsp->orphan_nxtlist != NULL) {
1422 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist;
1423 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail;
1424 rsp->orphan_nxtlist = NULL;
1425 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
1426 }
1427}
1428
1429/*
1430 * Trace the fact that this CPU is going offline.
1431 */
1432static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1433{
1434 RCU_TRACE(unsigned long mask);
1435 RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
1436 RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
1437
1438 RCU_TRACE(mask = rdp->grpmask);
1393 trace_rcu_grace_period(rsp->name, 1439 trace_rcu_grace_period(rsp->name,
1394 rnp->gpnum + 1 - !!(rnp->qsmask & mask), 1440 rnp->gpnum + 1 - !!(rnp->qsmask & mask),
1395 "cpuofl"); 1441 "cpuofl");
1396 rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum);
1397 /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */
1398} 1442}
1399 1443
1400/* 1444/*
1401 * The CPU has been completely removed, and some other CPU is reporting 1445 * The CPU has been completely removed, and some other CPU is reporting
1402 * this fact from process context. Do the remainder of the cleanup. 1446 * this fact from process context. Do the remainder of the cleanup,
1447 * including orphaning the outgoing CPU's RCU callbacks, and also
1448 * adopting them, if there is no _rcu_barrier() instance running.
1403 * There can only be one CPU hotplug operation at a time, so no other 1449 * There can only be one CPU hotplug operation at a time, so no other
1404 * CPU can be attempting to update rcu_cpu_kthread_task. 1450 * CPU can be attempting to update rcu_cpu_kthread_task.
1405 */ 1451 */
@@ -1409,17 +1455,21 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1409 unsigned long mask; 1455 unsigned long mask;
1410 int need_report = 0; 1456 int need_report = 0;
1411 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1457 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1412 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */ 1458 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
1413 1459
1414 /* Adjust any no-longer-needed kthreads. */ 1460 /* Adjust any no-longer-needed kthreads. */
1415 rcu_stop_cpu_kthread(cpu); 1461 rcu_stop_cpu_kthread(cpu);
1416 rcu_node_kthread_setaffinity(rnp, -1); 1462 rcu_node_kthread_setaffinity(rnp, -1);
1417 1463
1418 /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */ 1464 /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
1419 1465
1420 /* Exclude any attempts to start a new grace period. */ 1466 /* Exclude any attempts to start a new grace period. */
1421 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1467 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1422 1468
1469 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
1470 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
1471 rcu_adopt_orphan_cbs(rsp);
1472
1423 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 1473 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
1424 mask = rdp->grpmask; /* rnp->grplo is constant. */ 1474 mask = rdp->grpmask; /* rnp->grplo is constant. */
1425 do { 1475 do {
@@ -1456,6 +1506,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1456 1506
1457#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1507#else /* #ifdef CONFIG_HOTPLUG_CPU */
1458 1508
1509static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1510{
1511}
1512
1459static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 1513static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1460{ 1514{
1461} 1515}
@@ -1524,9 +1578,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1524 rcu_is_callbacks_kthread()); 1578 rcu_is_callbacks_kthread());
1525 1579
1526 /* Update count, and requeue any remaining callbacks. */ 1580 /* Update count, and requeue any remaining callbacks. */
1527 rdp->qlen_lazy -= count_lazy;
1528 rdp->qlen -= count;
1529 rdp->n_cbs_invoked += count;
1530 if (list != NULL) { 1581 if (list != NULL) {
1531 *tail = rdp->nxtlist; 1582 *tail = rdp->nxtlist;
1532 rdp->nxtlist = list; 1583 rdp->nxtlist = list;
@@ -1536,6 +1587,10 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1536 else 1587 else
1537 break; 1588 break;
1538 } 1589 }
1590 smp_mb(); /* List handling before counting for rcu_barrier(). */
1591 rdp->qlen_lazy -= count_lazy;
1592 rdp->qlen -= count;
1593 rdp->n_cbs_invoked += count;
1539 1594
1540 /* Reinstate batch limit if we have worked down the excess. */ 1595 /* Reinstate batch limit if we have worked down the excess. */
1541 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) 1596 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
@@ -1823,11 +1878,14 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1823 rdp = this_cpu_ptr(rsp->rda); 1878 rdp = this_cpu_ptr(rsp->rda);
1824 1879
1825 /* Add the callback to our list. */ 1880 /* Add the callback to our list. */
1826 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1827 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1828 rdp->qlen++; 1881 rdp->qlen++;
1829 if (lazy) 1882 if (lazy)
1830 rdp->qlen_lazy++; 1883 rdp->qlen_lazy++;
1884 else
1885 rcu_idle_count_callbacks_posted();
1886 smp_mb(); /* Count before adding callback for rcu_barrier(). */
1887 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1888 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1831 1889
1832 if (__is_kfree_rcu_offset((unsigned long)func)) 1890 if (__is_kfree_rcu_offset((unsigned long)func))
1833 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, 1891 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
@@ -1893,6 +1951,38 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1893} 1951}
1894EXPORT_SYMBOL_GPL(call_rcu_bh); 1952EXPORT_SYMBOL_GPL(call_rcu_bh);
1895 1953
1954/*
1955 * Because a context switch is a grace period for RCU-sched and RCU-bh,
1956 * any blocking grace-period wait automatically implies a grace period
1957 * if there is only one CPU online at any point time during execution
1958 * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to
1959 * occasionally incorrectly indicate that there are multiple CPUs online
1960 * when there was in fact only one the whole time, as this just adds
1961 * some overhead: RCU still operates correctly.
1962 *
1963 * Of course, sampling num_online_cpus() with preemption enabled can
1964 * give erroneous results if there are concurrent CPU-hotplug operations.
1965 * For example, given a demonic sequence of preemptions in num_online_cpus()
1966 * and CPU-hotplug operations, there could be two or more CPUs online at
1967 * all times, but num_online_cpus() might well return one (or even zero).
1968 *
1969 * However, all such demonic sequences require at least one CPU-offline
1970 * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer
1971 * is only a problem if there is an RCU read-side critical section executing
1972 * throughout. But RCU-sched and RCU-bh read-side critical sections
1973 * disable either preemption or bh, which prevents a CPU from going offline.
1974 * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return
1975 * that there is only one CPU when in fact there was more than one throughout
1976 * is when there were no RCU readers in the system. If there are no
1977 * RCU readers, the grace period by definition can be of zero length,
1978 * regardless of the number of online CPUs.
1979 */
1980static inline int rcu_blocking_is_gp(void)
1981{
1982 might_sleep(); /* Check for RCU read-side critical section. */
1983 return num_online_cpus() <= 1;
1984}
1985
1896/** 1986/**
1897 * synchronize_sched - wait until an rcu-sched grace period has elapsed. 1987 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
1898 * 1988 *
@@ -2166,11 +2256,10 @@ static int rcu_cpu_has_callbacks(int cpu)
2166 rcu_preempt_cpu_has_callbacks(cpu); 2256 rcu_preempt_cpu_has_callbacks(cpu);
2167} 2257}
2168 2258
2169static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 2259/*
2170static atomic_t rcu_barrier_cpu_count; 2260 * RCU callback function for _rcu_barrier(). If we are last, wake
2171static DEFINE_MUTEX(rcu_barrier_mutex); 2261 * up the task executing _rcu_barrier().
2172static struct completion rcu_barrier_completion; 2262 */
2173
2174static void rcu_barrier_callback(struct rcu_head *notused) 2263static void rcu_barrier_callback(struct rcu_head *notused)
2175{ 2264{
2176 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 2265 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -2200,27 +2289,94 @@ static void _rcu_barrier(struct rcu_state *rsp,
2200 void (*call_rcu_func)(struct rcu_head *head, 2289 void (*call_rcu_func)(struct rcu_head *head,
2201 void (*func)(struct rcu_head *head))) 2290 void (*func)(struct rcu_head *head)))
2202{ 2291{
2203 BUG_ON(in_interrupt()); 2292 int cpu;
2293 unsigned long flags;
2294 struct rcu_data *rdp;
2295 struct rcu_head rh;
2296
2297 init_rcu_head_on_stack(&rh);
2298
2204 /* Take mutex to serialize concurrent rcu_barrier() requests. */ 2299 /* Take mutex to serialize concurrent rcu_barrier() requests. */
2205 mutex_lock(&rcu_barrier_mutex); 2300 mutex_lock(&rcu_barrier_mutex);
2206 init_completion(&rcu_barrier_completion); 2301
2302 smp_mb(); /* Prevent any prior operations from leaking in. */
2303
2207 /* 2304 /*
2208 * Initialize rcu_barrier_cpu_count to 1, then invoke 2305 * Initialize the count to one rather than to zero in order to
2209 * rcu_barrier_func() on each CPU, so that each CPU also has 2306 * avoid a too-soon return to zero in case of a short grace period
2210 * incremented rcu_barrier_cpu_count. Only then is it safe to 2307 * (or preemption of this task). Also flag this task as doing
2211 * decrement rcu_barrier_cpu_count -- otherwise the first CPU 2308 * an rcu_barrier(). This will prevent anyone else from adopting
2212 * might complete its grace period before all of the other CPUs 2309 * orphaned callbacks, which could cause otherwise failure if a
2213 * did their increment, causing this function to return too 2310 * CPU went offline and quickly came back online. To see this,
2214 * early. Note that on_each_cpu() disables irqs, which prevents 2311 * consider the following sequence of events:
2215 * any CPUs from coming online or going offline until each online 2312 *
2216 * CPU has queued its RCU-barrier callback. 2313 * 1. We cause CPU 0 to post an rcu_barrier_callback() callback.
2314 * 2. CPU 1 goes offline, orphaning its callbacks.
2315 * 3. CPU 0 adopts CPU 1's orphaned callbacks.
2316 * 4. CPU 1 comes back online.
2317 * 5. We cause CPU 1 to post an rcu_barrier_callback() callback.
2318 * 6. Both rcu_barrier_callback() callbacks are invoked, awakening
2319 * us -- but before CPU 1's orphaned callbacks are invoked!!!
2217 */ 2320 */
2321 init_completion(&rcu_barrier_completion);
2218 atomic_set(&rcu_barrier_cpu_count, 1); 2322 atomic_set(&rcu_barrier_cpu_count, 1);
2219 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); 2323 raw_spin_lock_irqsave(&rsp->onofflock, flags);
2324 rsp->rcu_barrier_in_progress = current;
2325 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2326
2327 /*
2328 * Force every CPU with callbacks to register a new callback
2329 * that will tell us when all the preceding callbacks have
2330 * been invoked. If an offline CPU has callbacks, wait for
2331 * it to either come back online or to finish orphaning those
2332 * callbacks.
2333 */
2334 for_each_possible_cpu(cpu) {
2335 preempt_disable();
2336 rdp = per_cpu_ptr(rsp->rda, cpu);
2337 if (cpu_is_offline(cpu)) {
2338 preempt_enable();
2339 while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen))
2340 schedule_timeout_interruptible(1);
2341 } else if (ACCESS_ONCE(rdp->qlen)) {
2342 smp_call_function_single(cpu, rcu_barrier_func,
2343 (void *)call_rcu_func, 1);
2344 preempt_enable();
2345 } else {
2346 preempt_enable();
2347 }
2348 }
2349
2350 /*
2351 * Now that all online CPUs have rcu_barrier_callback() callbacks
2352 * posted, we can adopt all of the orphaned callbacks and place
2353 * an rcu_barrier_callback() callback after them. When that is done,
2354 * we are guaranteed to have an rcu_barrier_callback() callback
2355 * following every callback that could possibly have been
2356 * registered before _rcu_barrier() was called.
2357 */
2358 raw_spin_lock_irqsave(&rsp->onofflock, flags);
2359 rcu_adopt_orphan_cbs(rsp);
2360 rsp->rcu_barrier_in_progress = NULL;
2361 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2362 atomic_inc(&rcu_barrier_cpu_count);
2363 smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */
2364 call_rcu_func(&rh, rcu_barrier_callback);
2365
2366 /*
2367 * Now that we have an rcu_barrier_callback() callback on each
2368 * CPU, and thus each counted, remove the initial count.
2369 */
2220 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 2370 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
2221 complete(&rcu_barrier_completion); 2371 complete(&rcu_barrier_completion);
2372
2373 /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
2222 wait_for_completion(&rcu_barrier_completion); 2374 wait_for_completion(&rcu_barrier_completion);
2375
2376 /* Other rcu_barrier() invocations can now safely proceed. */
2223 mutex_unlock(&rcu_barrier_mutex); 2377 mutex_unlock(&rcu_barrier_mutex);
2378
2379 destroy_rcu_head_on_stack(&rh);
2224} 2380}
2225 2381
2226/** 2382/**
@@ -2417,7 +2573,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
2417 2573
2418 for (i = NUM_RCU_LVLS - 1; i > 0; i--) 2574 for (i = NUM_RCU_LVLS - 1; i > 0; i--)
2419 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 2575 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
2420 rsp->levelspread[0] = RCU_FANOUT_LEAF; 2576 rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF;
2421} 2577}
2422#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 2578#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
2423static void __init rcu_init_levelspread(struct rcu_state *rsp) 2579static void __init rcu_init_levelspread(struct rcu_state *rsp)
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index cdd1be0a4072..7f5d138dedf5 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -29,18 +29,14 @@
29#include <linux/seqlock.h> 29#include <linux/seqlock.h>
30 30
31/* 31/*
32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. 32 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
33 * CONFIG_RCU_FANOUT_LEAF.
33 * In theory, it should be possible to add more levels straightforwardly. 34 * In theory, it should be possible to add more levels straightforwardly.
34 * In practice, this did work well going from three levels to four. 35 * In practice, this did work well going from three levels to four.
35 * Of course, your mileage may vary. 36 * Of course, your mileage may vary.
36 */ 37 */
37#define MAX_RCU_LVLS 4 38#define MAX_RCU_LVLS 4
38#if CONFIG_RCU_FANOUT > 16 39#define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF)
39#define RCU_FANOUT_LEAF 16
40#else /* #if CONFIG_RCU_FANOUT > 16 */
41#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
42#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
43#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
44#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) 40#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
45#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) 41#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
46#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) 42#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
@@ -371,6 +367,17 @@ struct rcu_state {
371 367
372 raw_spinlock_t onofflock; /* exclude on/offline and */ 368 raw_spinlock_t onofflock; /* exclude on/offline and */
373 /* starting new GP. */ 369 /* starting new GP. */
370 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */
371 /* need a grace period. */
372 struct rcu_head **orphan_nxttail; /* Tail of above. */
373 struct rcu_head *orphan_donelist; /* Orphaned callbacks that */
374 /* are ready to invoke. */
375 struct rcu_head **orphan_donetail; /* Tail of above. */
376 long qlen_lazy; /* Number of lazy callbacks. */
377 long qlen; /* Total number of callbacks. */
378 struct task_struct *rcu_barrier_in_progress;
379 /* Task doing rcu_barrier(), */
380 /* or NULL if no barrier. */
374 raw_spinlock_t fqslock; /* Only one task forcing */ 381 raw_spinlock_t fqslock; /* Only one task forcing */
375 /* quiescent states. */ 382 /* quiescent states. */
376 unsigned long jiffies_force_qs; /* Time at which to invoke */ 383 unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -423,7 +430,6 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
423/* Forward declarations for rcutree_plugin.h */ 430/* Forward declarations for rcutree_plugin.h */
424static void rcu_bootup_announce(void); 431static void rcu_bootup_announce(void);
425long rcu_batches_completed(void); 432long rcu_batches_completed(void);
426static void rcu_preempt_note_context_switch(int cpu);
427static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); 433static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
428#ifdef CONFIG_HOTPLUG_CPU 434#ifdef CONFIG_HOTPLUG_CPU
429static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 435static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
@@ -471,6 +477,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu);
471static void rcu_prepare_for_idle_init(int cpu); 477static void rcu_prepare_for_idle_init(int cpu);
472static void rcu_cleanup_after_idle(int cpu); 478static void rcu_cleanup_after_idle(int cpu);
473static void rcu_prepare_for_idle(int cpu); 479static void rcu_prepare_for_idle(int cpu);
480static void rcu_idle_count_callbacks_posted(void);
474static void print_cpu_stall_info_begin(void); 481static void print_cpu_stall_info_begin(void);
475static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); 482static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
476static void print_cpu_stall_info_end(void); 483static void print_cpu_stall_info_end(void);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c023464816be..2411000d9869 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu)
153 * 153 *
154 * Caller must disable preemption. 154 * Caller must disable preemption.
155 */ 155 */
156static void rcu_preempt_note_context_switch(int cpu) 156void rcu_preempt_note_context_switch(void)
157{ 157{
158 struct task_struct *t = current; 158 struct task_struct *t = current;
159 unsigned long flags; 159 unsigned long flags;
@@ -164,7 +164,7 @@ static void rcu_preempt_note_context_switch(int cpu)
164 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 164 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
165 165
166 /* Possibly blocking in an RCU read-side critical section. */ 166 /* Possibly blocking in an RCU read-side critical section. */
167 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); 167 rdp = __this_cpu_ptr(rcu_preempt_state.rda);
168 rnp = rdp->mynode; 168 rnp = rdp->mynode;
169 raw_spin_lock_irqsave(&rnp->lock, flags); 169 raw_spin_lock_irqsave(&rnp->lock, flags);
170 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 170 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -228,7 +228,7 @@ static void rcu_preempt_note_context_switch(int cpu)
228 * means that we continue to block the current grace period. 228 * means that we continue to block the current grace period.
229 */ 229 */
230 local_irq_save(flags); 230 local_irq_save(flags);
231 rcu_preempt_qs(cpu); 231 rcu_preempt_qs(smp_processor_id());
232 local_irq_restore(flags); 232 local_irq_restore(flags);
233} 233}
234 234
@@ -969,22 +969,6 @@ static void __init __rcu_init_preempt(void)
969 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); 969 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
970} 970}
971 971
972/*
973 * Check for a task exiting while in a preemptible-RCU read-side
974 * critical section, clean up if so. No need to issue warnings,
975 * as debug_check_no_locks_held() already does this if lockdep
976 * is enabled.
977 */
978void exit_rcu(void)
979{
980 struct task_struct *t = current;
981
982 if (t->rcu_read_lock_nesting == 0)
983 return;
984 t->rcu_read_lock_nesting = 1;
985 __rcu_read_unlock();
986}
987
988#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 972#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
989 973
990static struct rcu_state *rcu_state = &rcu_sched_state; 974static struct rcu_state *rcu_state = &rcu_sched_state;
@@ -1018,14 +1002,6 @@ void rcu_force_quiescent_state(void)
1018EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 1002EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
1019 1003
1020/* 1004/*
1021 * Because preemptible RCU does not exist, we never have to check for
1022 * CPUs being in quiescent states.
1023 */
1024static void rcu_preempt_note_context_switch(int cpu)
1025{
1026}
1027
1028/*
1029 * Because preemptible RCU does not exist, there are never any preempted 1005 * Because preemptible RCU does not exist, there are never any preempted
1030 * RCU readers. 1006 * RCU readers.
1031 */ 1007 */
@@ -1938,6 +1914,14 @@ static void rcu_prepare_for_idle(int cpu)
1938{ 1914{
1939} 1915}
1940 1916
1917/*
1918 * Don't bother keeping a running count of the number of RCU callbacks
1919 * posted because CONFIG_RCU_FAST_NO_HZ=n.
1920 */
1921static void rcu_idle_count_callbacks_posted(void)
1922{
1923}
1924
1941#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1925#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1942 1926
1943/* 1927/*
@@ -1978,11 +1962,20 @@ static void rcu_prepare_for_idle(int cpu)
1978#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ 1962#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
1979#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ 1963#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
1980 1964
1965/* Loop counter for rcu_prepare_for_idle(). */
1981static DEFINE_PER_CPU(int, rcu_dyntick_drain); 1966static DEFINE_PER_CPU(int, rcu_dyntick_drain);
1967/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */
1982static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); 1968static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1983static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); 1969/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */
1984static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */ 1970static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer);
1985static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */ 1971/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */
1972static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires);
1973/* Enable special processing on first attempt to enter dyntick-idle mode. */
1974static DEFINE_PER_CPU(bool, rcu_idle_first_pass);
1975/* Running count of non-lazy callbacks posted, never decremented. */
1976static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted);
1977/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */
1978static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap);
1986 1979
1987/* 1980/*
1988 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no 1981 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
@@ -1995,6 +1988,8 @@ static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */
1995 */ 1988 */
1996int rcu_needs_cpu(int cpu) 1989int rcu_needs_cpu(int cpu)
1997{ 1990{
1991 /* Flag a new idle sojourn to the idle-entry state machine. */
1992 per_cpu(rcu_idle_first_pass, cpu) = 1;
1998 /* If no callbacks, RCU doesn't need the CPU. */ 1993 /* If no callbacks, RCU doesn't need the CPU. */
1999 if (!rcu_cpu_has_callbacks(cpu)) 1994 if (!rcu_cpu_has_callbacks(cpu))
2000 return 0; 1995 return 0;
@@ -2045,16 +2040,34 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
2045} 2040}
2046 2041
2047/* 2042/*
2043 * Handler for smp_call_function_single(). The only point of this
2044 * handler is to wake the CPU up, so the handler does only tracing.
2045 */
2046void rcu_idle_demigrate(void *unused)
2047{
2048 trace_rcu_prep_idle("Demigrate");
2049}
2050
2051/*
2048 * Timer handler used to force CPU to start pushing its remaining RCU 2052 * Timer handler used to force CPU to start pushing its remaining RCU
2049 * callbacks in the case where it entered dyntick-idle mode with callbacks 2053 * callbacks in the case where it entered dyntick-idle mode with callbacks
 2050 * pending. The handler doesn't really need to do anything because the 2054 * pending. The handler doesn't really need to do anything because the
2051 * real work is done upon re-entry to idle, or by the next scheduling-clock 2055 * real work is done upon re-entry to idle, or by the next scheduling-clock
2052 * interrupt should idle not be re-entered. 2056 * interrupt should idle not be re-entered.
2057 *
2058 * One special case: the timer gets migrated without awakening the CPU
 2059 * on which the timer was scheduled. In this case, we must wake up
2060 * that CPU. We do so with smp_call_function_single().
2053 */ 2061 */
2054static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) 2062static void rcu_idle_gp_timer_func(unsigned long cpu_in)
2055{ 2063{
2064 int cpu = (int)cpu_in;
2065
2056 trace_rcu_prep_idle("Timer"); 2066 trace_rcu_prep_idle("Timer");
2057 return HRTIMER_NORESTART; 2067 if (cpu != smp_processor_id())
2068 smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0);
2069 else
2070 WARN_ON_ONCE(1); /* Getting here can hang the system... */
2058} 2071}
2059 2072
2060/* 2073/*
@@ -2062,19 +2075,11 @@ static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp)
2062 */ 2075 */
2063static void rcu_prepare_for_idle_init(int cpu) 2076static void rcu_prepare_for_idle_init(int cpu)
2064{ 2077{
2065 static int firsttime = 1; 2078 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
2066 struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); 2079 setup_timer(&per_cpu(rcu_idle_gp_timer, cpu),
2067 2080 rcu_idle_gp_timer_func, cpu);
2068 hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 2081 per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1;
2069 hrtp->function = rcu_idle_gp_timer_func; 2082 per_cpu(rcu_idle_first_pass, cpu) = 1;
2070 if (firsttime) {
2071 unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
2072
2073 rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000);
2074 upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY);
2075 rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000);
2076 firsttime = 0;
2077 }
2078} 2083}
2079 2084
2080/* 2085/*
@@ -2084,7 +2089,8 @@ static void rcu_prepare_for_idle_init(int cpu)
2084 */ 2089 */
2085static void rcu_cleanup_after_idle(int cpu) 2090static void rcu_cleanup_after_idle(int cpu)
2086{ 2091{
2087 hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); 2092 del_timer(&per_cpu(rcu_idle_gp_timer, cpu));
2093 trace_rcu_prep_idle("Cleanup after idle");
2088} 2094}
2089 2095
2090/* 2096/*
@@ -2108,6 +2114,29 @@ static void rcu_cleanup_after_idle(int cpu)
2108 */ 2114 */
2109static void rcu_prepare_for_idle(int cpu) 2115static void rcu_prepare_for_idle(int cpu)
2110{ 2116{
2117 struct timer_list *tp;
2118
2119 /*
2120 * If this is an idle re-entry, for example, due to use of
2121 * RCU_NONIDLE() or the new idle-loop tracing API within the idle
2122 * loop, then don't take any state-machine actions, unless the
2123 * momentary exit from idle queued additional non-lazy callbacks.
2124 * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks
2125 * pending.
2126 */
2127 if (!per_cpu(rcu_idle_first_pass, cpu) &&
2128 (per_cpu(rcu_nonlazy_posted, cpu) ==
2129 per_cpu(rcu_nonlazy_posted_snap, cpu))) {
2130 if (rcu_cpu_has_callbacks(cpu)) {
2131 tp = &per_cpu(rcu_idle_gp_timer, cpu);
2132 mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu));
2133 }
2134 return;
2135 }
2136 per_cpu(rcu_idle_first_pass, cpu) = 0;
2137 per_cpu(rcu_nonlazy_posted_snap, cpu) =
2138 per_cpu(rcu_nonlazy_posted, cpu) - 1;
2139
2111 /* 2140 /*
2112 * If there are no callbacks on this CPU, enter dyntick-idle mode. 2141 * If there are no callbacks on this CPU, enter dyntick-idle mode.
2113 * Also reset state to avoid prejudicing later attempts. 2142 * Also reset state to avoid prejudicing later attempts.
@@ -2140,11 +2169,15 @@ static void rcu_prepare_for_idle(int cpu)
2140 per_cpu(rcu_dyntick_drain, cpu) = 0; 2169 per_cpu(rcu_dyntick_drain, cpu) = 0;
2141 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2170 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
2142 if (rcu_cpu_has_nonlazy_callbacks(cpu)) 2171 if (rcu_cpu_has_nonlazy_callbacks(cpu))
2143 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), 2172 per_cpu(rcu_idle_gp_timer_expires, cpu) =
2144 rcu_idle_gp_wait, HRTIMER_MODE_REL); 2173 jiffies + RCU_IDLE_GP_DELAY;
2145 else 2174 else
2146 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), 2175 per_cpu(rcu_idle_gp_timer_expires, cpu) =
2147 rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL); 2176 jiffies + RCU_IDLE_LAZY_GP_DELAY;
2177 tp = &per_cpu(rcu_idle_gp_timer, cpu);
2178 mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu));
2179 per_cpu(rcu_nonlazy_posted_snap, cpu) =
2180 per_cpu(rcu_nonlazy_posted, cpu);
2148 return; /* Nothing more to do immediately. */ 2181 return; /* Nothing more to do immediately. */
2149 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2182 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
2150 /* We have hit the limit, so time to give up. */ 2183 /* We have hit the limit, so time to give up. */
@@ -2184,6 +2217,19 @@ static void rcu_prepare_for_idle(int cpu)
2184 trace_rcu_prep_idle("Callbacks drained"); 2217 trace_rcu_prep_idle("Callbacks drained");
2185} 2218}
2186 2219
2220/*
2221 * Keep a running count of the number of non-lazy callbacks posted
2222 * on this CPU. This running counter (which is never decremented) allows
2223 * rcu_prepare_for_idle() to detect when something out of the idle loop
2224 * posts a callback, even if an equal number of callbacks are invoked.
2225 * Of course, callbacks should only be posted from within a trace event
2226 * designed to be called from idle or from within RCU_NONIDLE().
2227 */
2228static void rcu_idle_count_callbacks_posted(void)
2229{
2230 __this_cpu_add(rcu_nonlazy_posted, 1);
2231}
2232
2187#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2233#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
2188 2234
2189#ifdef CONFIG_RCU_CPU_STALL_INFO 2235#ifdef CONFIG_RCU_CPU_STALL_INFO
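The counter above is deliberately never decremented; rcu_prepare_for_idle() compares it against the rcu_nonlazy_posted_snap snapshot to notice that something was posted during a momentary exit from idle, even when an equal number of callbacks was invoked in the meantime. A minimal userspace sketch of that idea (the names are illustrative, not the kernel's per-CPU variables):

#include <stdio.h>

static unsigned long posted;		/* only ever incremented */
static unsigned long posted_snap;	/* snapshot taken at idle entry */

static void post_callback(void)     { posted++; }
static void take_snapshot(void)     { posted_snap = posted; }
static int  posted_since_snap(void) { return posted != posted_snap; }

int main(void)
{
	take_snapshot();		/* entering idle */
	post_callback();		/* momentary exit from idle posts one */
	/* Even if that callback were also invoked here, "posted" stays ahead. */
	printf("callbacks posted since idle entry: %s\n",
	       posted_since_snap() ? "yes" : "no");
	return 0;
}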
@@ -2192,14 +2238,12 @@ static void rcu_prepare_for_idle(int cpu)
2192 2238
2193static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 2239static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2194{ 2240{
2195 struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); 2241 struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu);
2196 2242
2197 sprintf(cp, "drain=%d %c timer=%lld", 2243 sprintf(cp, "drain=%d %c timer=%lu",
2198 per_cpu(rcu_dyntick_drain, cpu), 2244 per_cpu(rcu_dyntick_drain, cpu),
2199 per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', 2245 per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.',
2200 hrtimer_active(hrtp) 2246 timer_pending(tltp) ? tltp->expires - jiffies : -1);
2201 ? ktime_to_us(hrtimer_get_remaining(hrtp))
2202 : -1);
2203} 2247}
2204 2248
2205#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 2249#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
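The RCU_FAST_NO_HZ changes above replace the per-CPU hrtimer with an ordinary timer_list driven by setup_timer(), mod_timer_pinned(), and del_timer(), plus an smp_call_function_single() wakeup for the case where the pinned timer was migrated anyway. As a rough sketch of that timer pattern only (a hypothetical module, not the RCU code; CPU-hotplug handling is ignored):

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/jiffies.h>
#include <linux/smp.h>

static struct timer_list demo_timer;

/* Runs in softirq context, normally on the CPU the timer was pinned to. */
static void demo_timer_func(unsigned long unused)
{
	pr_info("demo timer fired on CPU %d\n", smp_processor_id());
}

static int __init demo_init(void)
{
	setup_timer(&demo_timer, demo_timer_func, 0);
	mod_timer_pinned(&demo_timer, jiffies + 2 * HZ);	/* pin to this CPU */
	return 0;
}

static void __exit demo_exit(void)
{
	del_timer_sync(&demo_timer);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");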
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index ed459edeff43..d4bc16ddd1d4 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -271,13 +271,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
271 271
272 gpnum = rsp->gpnum; 272 gpnum = rsp->gpnum;
273 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 273 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
274 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", 274 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
275 rsp->completed, gpnum, rsp->fqs_state, 275 rsp->completed, gpnum, rsp->fqs_state,
276 (long)(rsp->jiffies_force_qs - jiffies), 276 (long)(rsp->jiffies_force_qs - jiffies),
277 (int)(jiffies & 0xffff), 277 (int)(jiffies & 0xffff),
278 rsp->n_force_qs, rsp->n_force_qs_ngp, 278 rsp->n_force_qs, rsp->n_force_qs_ngp,
279 rsp->n_force_qs - rsp->n_force_qs_ngp, 279 rsp->n_force_qs - rsp->n_force_qs_ngp,
280 rsp->n_force_qs_lh); 280 rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen);
281 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { 281 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
282 if (rnp->level != level) { 282 if (rnp->level != level) {
283 seq_puts(m, "\n"); 283 seq_puts(m, "\n");
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e5212ae294f6..eb4131b8ad60 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2083,6 +2083,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2083#endif 2083#endif
2084 2084
2085 /* Here we just switch the register state and the stack. */ 2085 /* Here we just switch the register state and the stack. */
2086 rcu_switch_from(prev);
2086 switch_to(prev, next, prev); 2087 switch_to(prev, next, prev);
2087 2088
2088 barrier(); 2089 barrier();
diff --git a/kernel/srcu.c b/kernel/srcu.c
index ba35f3a4a1f4..2095be3318d5 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -34,10 +34,77 @@
34#include <linux/delay.h> 34#include <linux/delay.h>
35#include <linux/srcu.h> 35#include <linux/srcu.h>
36 36
37/*
38 * Initialize an rcu_batch structure to empty.
39 */
40static inline void rcu_batch_init(struct rcu_batch *b)
41{
42 b->head = NULL;
43 b->tail = &b->head;
44}
45
46/*
47 * Enqueue a callback onto the tail of the specified rcu_batch structure.
48 */
49static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head)
50{
51 *b->tail = head;
52 b->tail = &head->next;
53}
54
55/*
56 * Is the specified rcu_batch structure empty?
57 */
58static inline bool rcu_batch_empty(struct rcu_batch *b)
59{
60 return b->tail == &b->head;
61}
62
63/*
64 * Remove the callback at the head of the specified rcu_batch structure
65 * and return a pointer to it, or return NULL if the structure is empty.
66 */
67static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b)
68{
69 struct rcu_head *head;
70
71 if (rcu_batch_empty(b))
72 return NULL;
73
74 head = b->head;
75 b->head = head->next;
76 if (b->tail == &head->next)
77 rcu_batch_init(b);
78
79 return head;
80}
81
82/*
83 * Move all callbacks from the rcu_batch structure specified by "from" to
84 * the structure specified by "to".
85 */
86static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
87{
88 if (!rcu_batch_empty(from)) {
89 *to->tail = from->head;
90 to->tail = from->tail;
91 rcu_batch_init(from);
92 }
93}
94
 95/* Single-threaded state machine for SRCU grace periods. */
96static void process_srcu(struct work_struct *work);
97
37static int init_srcu_struct_fields(struct srcu_struct *sp) 98static int init_srcu_struct_fields(struct srcu_struct *sp)
38{ 99{
39 sp->completed = 0; 100 sp->completed = 0;
40 mutex_init(&sp->mutex); 101 spin_lock_init(&sp->queue_lock);
102 sp->running = false;
103 rcu_batch_init(&sp->batch_queue);
104 rcu_batch_init(&sp->batch_check0);
105 rcu_batch_init(&sp->batch_check1);
106 rcu_batch_init(&sp->batch_done);
107 INIT_DELAYED_WORK(&sp->work, process_srcu);
41 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); 108 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
42 return sp->per_cpu_ref ? 0 : -ENOMEM; 109 return sp->per_cpu_ref ? 0 : -ENOMEM;
43} 110}
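The rcu_batch helpers above are small enough to exercise outside the kernel. A hedged userspace restatement (illustration only; the real struct rcu_head and the locking are of course different) showing the O(1) tail append and the whole-batch splice that the grace-period pipeline below relies on:

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

struct cb { struct cb *next; int id; };
struct batch { struct cb *head, **tail; };

static void batch_init(struct batch *b)  { b->head = NULL; b->tail = &b->head; }
static bool batch_empty(struct batch *b) { return b->tail == &b->head; }

static void batch_queue(struct batch *b, struct cb *h)
{
	h->next = NULL;
	*b->tail = h;			/* O(1) append: no list walk */
	b->tail = &h->next;
}

static void batch_move(struct batch *to, struct batch *from)
{
	if (!batch_empty(from)) {
		*to->tail = from->head;	/* splice the whole batch */
		to->tail = from->tail;
		batch_init(from);
	}
}

int main(void)
{
	struct batch queue, done;
	struct cb a = { .id = 1 }, b = { .id = 2 };
	struct cb *p;

	batch_init(&queue);
	batch_init(&done);
	batch_queue(&queue, &a);
	batch_queue(&queue, &b);
	batch_move(&done, &queue);	/* pretend a grace period elapsed */
	for (p = done.head; p; p = p->next)
		printf("invoke callback %d\n", p->id);
	printf("queue now empty: %d\n", batch_empty(&queue));
	return 0;
}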
@@ -73,21 +140,116 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
73#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 140#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
74 141
75/* 142/*
76 * srcu_readers_active_idx -- returns approximate number of readers 143 * Returns approximate total of the readers' ->seq[] values for the
77 * active on the specified rank of per-CPU counters. 144 * rank of per-CPU counters specified by idx.
78 */ 145 */
146static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
147{
148 int cpu;
149 unsigned long sum = 0;
150 unsigned long t;
79 151
80static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) 152 for_each_possible_cpu(cpu) {
153 t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
154 sum += t;
155 }
156 return sum;
157}
158
159/*
160 * Returns approximate number of readers active on the specified rank
161 * of the per-CPU ->c[] counters.
162 */
163static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
81{ 164{
82 int cpu; 165 int cpu;
83 int sum; 166 unsigned long sum = 0;
167 unsigned long t;
84 168
85 sum = 0; 169 for_each_possible_cpu(cpu) {
86 for_each_possible_cpu(cpu) 170 t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
87 sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]; 171 sum += t;
172 }
88 return sum; 173 return sum;
89} 174}
90 175
176/*
177 * Return true if the number of pre-existing readers is determined to
178 * be stably zero. An example unstable zero can occur if the call
179 * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
180 * but due to task migration, sees the corresponding __srcu_read_unlock()
181 * decrement. This can happen because srcu_readers_active_idx() takes
182 * time to sum the array, and might in fact be interrupted or preempted
183 * partway through the summation.
184 */
185static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
186{
187 unsigned long seq;
188
189 seq = srcu_readers_seq_idx(sp, idx);
190
191 /*
192 * The following smp_mb() A pairs with the smp_mb() B located in
193 * __srcu_read_lock(). This pairing ensures that if an
194 * __srcu_read_lock() increments its counter after the summation
195 * in srcu_readers_active_idx(), then the corresponding SRCU read-side
196 * critical section will see any changes made prior to the start
197 * of the current SRCU grace period.
198 *
199 * Also, if the above call to srcu_readers_seq_idx() saw the
200 * increment of ->seq[], then the call to srcu_readers_active_idx()
201 * must see the increment of ->c[].
202 */
203 smp_mb(); /* A */
204
205 /*
206 * Note that srcu_readers_active_idx() can incorrectly return
207 * zero even though there is a pre-existing reader throughout.
208 * To see this, suppose that task A is in a very long SRCU
209 * read-side critical section that started on CPU 0, and that
210 * no other reader exists, so that the sum of the counters
211 * is equal to one. Then suppose that task B starts executing
212 * srcu_readers_active_idx(), summing up to CPU 1, and then that
213 * task C starts reading on CPU 0, so that its increment is not
214 * summed, but finishes reading on CPU 2, so that its decrement
215 * -is- summed. Then when task B completes its sum, it will
216 * incorrectly get zero, despite the fact that task A has been
217 * in its SRCU read-side critical section the whole time.
218 *
219 * We therefore do a validation step should srcu_readers_active_idx()
220 * return zero.
221 */
222 if (srcu_readers_active_idx(sp, idx) != 0)
223 return false;
224
225 /*
226 * The remainder of this function is the validation step.
227 * The following smp_mb() D pairs with the smp_mb() C in
228 * __srcu_read_unlock(). If the __srcu_read_unlock() was seen
229 * by srcu_readers_active_idx() above, then any destructive
230 * operation performed after the grace period will happen after
231 * the corresponding SRCU read-side critical section.
232 *
233 * Note that there can be at most NR_CPUS worth of readers using
234 * the old index, which is not enough to overflow even a 32-bit
235 * integer. (Yes, this does mean that systems having more than
236 * a billion or so CPUs need to be 64-bit systems.) Therefore,
237 * the sum of the ->seq[] counters cannot possibly overflow.
238 * Therefore, the only way that the return values of the two
239 * calls to srcu_readers_seq_idx() can be equal is if there were
240 * no increments of the corresponding rank of ->seq[] counts
241 * in the interim. But the missed-increment scenario laid out
242 * above includes an increment of the ->seq[] counter by
243 * the corresponding __srcu_read_lock(). Therefore, if this
244 * scenario occurs, the return values from the two calls to
245 * srcu_readers_seq_idx() will differ, and thus the validation
246 * step below suffices.
247 */
248 smp_mb(); /* D */
249
250 return srcu_readers_seq_idx(sp, idx) == seq;
251}
252
91/** 253/**
92 * srcu_readers_active - returns approximate number of readers. 254 * srcu_readers_active - returns approximate number of readers.
93 * @sp: which srcu_struct to count active readers (holding srcu_read_lock). 255 * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
@@ -98,7 +260,14 @@ static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
98 */ 260 */
99static int srcu_readers_active(struct srcu_struct *sp) 261static int srcu_readers_active(struct srcu_struct *sp)
100{ 262{
101 return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1); 263 int cpu;
264 unsigned long sum = 0;
265
266 for_each_possible_cpu(cpu) {
267 sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
268 sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
269 }
270 return sum;
102} 271}
103 272
104/** 273/**
@@ -131,10 +300,11 @@ int __srcu_read_lock(struct srcu_struct *sp)
131 int idx; 300 int idx;
132 301
133 preempt_disable(); 302 preempt_disable();
134 idx = sp->completed & 0x1; 303 idx = rcu_dereference_index_check(sp->completed,
135 barrier(); /* ensure compiler looks -once- at sp->completed. */ 304 rcu_read_lock_sched_held()) & 0x1;
136 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++; 305 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
137 srcu_barrier(); /* ensure compiler won't misorder critical section. */ 306 smp_mb(); /* B */ /* Avoid leaking the critical section. */
307 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
138 preempt_enable(); 308 preempt_enable();
139 return idx; 309 return idx;
140} 310}
@@ -149,8 +319,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
149void __srcu_read_unlock(struct srcu_struct *sp, int idx) 319void __srcu_read_unlock(struct srcu_struct *sp, int idx)
150{ 320{
151 preempt_disable(); 321 preempt_disable();
152 srcu_barrier(); /* ensure compiler won't misorder critical section. */ 322 smp_mb(); /* C */ /* Avoid leaking the critical section. */
153 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; 323 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1;
154 preempt_enable(); 324 preempt_enable();
155} 325}
156EXPORT_SYMBOL_GPL(__srcu_read_unlock); 326EXPORT_SYMBOL_GPL(__srcu_read_unlock);
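Stripped of the memory barriers and the per-CPU distribution, the read-side bookkeeping above reduces to two ranks of counters selected by the low bit of ->completed, plus a ->seq[] count that only ever grows. A single-threaded userspace model (illustration only; with no concurrency there is nothing for the smp_mb() calls to order, so they are omitted):

#include <stdio.h>

struct srcu_model {
	unsigned long completed;
	unsigned long c[2];	/* readers currently active per rank */
	unsigned long seq[2];	/* lock attempts per rank, never decremented */
};

static int model_read_lock(struct srcu_model *sp)
{
	int idx = sp->completed & 0x1;

	sp->c[idx]++;
	sp->seq[idx]++;
	return idx;
}

static void model_read_unlock(struct srcu_model *sp, int idx)
{
	sp->c[idx]--;
}

int main(void)
{
	struct srcu_model sp = { 0 };
	int idx = model_read_lock(&sp);

	sp.completed++;		/* srcu_flip(): new readers use the other rank */
	printf("old-rank readers still active: %lu\n", sp.c[idx]);
	model_read_unlock(&sp, idx);
	printf("after unlock: c[%d]=%lu, seq[%d]=%lu\n",
	       idx, sp.c[idx], idx, sp.seq[idx]);
	return 0;
}

The update side waits for the old rank's ->c[] sum to reach a stable zero, using the ->seq[] totals to reject sums that raced with a reader migrating between CPUs, as the srcu_readers_active_idx_check() comments above explain.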
@@ -163,106 +333,119 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
163 * we repeatedly block for 1-millisecond time periods. This approach 333 * we repeatedly block for 1-millisecond time periods. This approach
164 * has done well in testing, so there is no need for a config parameter. 334 * has done well in testing, so there is no need for a config parameter.
165 */ 335 */
166#define SYNCHRONIZE_SRCU_READER_DELAY 10 336#define SRCU_RETRY_CHECK_DELAY 5
337#define SYNCHRONIZE_SRCU_TRYCOUNT 2
338#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12
167 339
168/* 340/*
169 * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 341 * @@@ Wait until all pre-existing readers complete. Such readers
342 * will have used the index specified by "idx".
 343 * The caller should ensure that ->completed is not changed while checking,
 344 * and that idx = (->completed & 1) ^ 1.
170 */ 345 */
171static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) 346static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
172{ 347{
173 int idx; 348 for (;;) {
174 349 if (srcu_readers_active_idx_check(sp, idx))
175 rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && 350 return true;
176 !lock_is_held(&rcu_bh_lock_map) && 351 if (--trycount <= 0)
177 !lock_is_held(&rcu_lock_map) && 352 return false;
178 !lock_is_held(&rcu_sched_lock_map), 353 udelay(SRCU_RETRY_CHECK_DELAY);
179 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); 354 }
180 355}
181 idx = sp->completed;
182 mutex_lock(&sp->mutex);
183 356
184 /* 357/*
185 * Check to see if someone else did the work for us while we were 358 * Increment the ->completed counter so that future SRCU readers will
186 * waiting to acquire the lock. We need -two- advances of 359 * use the other rank of the ->c[] and ->seq[] arrays. This allows
187 * the counter, not just one. If there was but one, we might have 360 * us to wait for pre-existing readers in a starvation-free manner.
188 * shown up -after- our helper's first synchronize_sched(), thus 361 */
189 * having failed to prevent CPU-reordering races with concurrent 362static void srcu_flip(struct srcu_struct *sp)
190 * srcu_read_unlock()s on other CPUs (see comment below). So we 363{
191 * either (1) wait for two or (2) supply the second ourselves. 364 sp->completed++;
192 */ 365}
193 366
194 if ((sp->completed - idx) >= 2) { 367/*
195 mutex_unlock(&sp->mutex); 368 * Enqueue an SRCU callback on the specified srcu_struct structure,
196 return; 369 * initiating grace-period processing if it is not already running.
370 */
371void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
372 void (*func)(struct rcu_head *head))
373{
374 unsigned long flags;
375
376 head->next = NULL;
377 head->func = func;
378 spin_lock_irqsave(&sp->queue_lock, flags);
379 rcu_batch_queue(&sp->batch_queue, head);
380 if (!sp->running) {
381 sp->running = true;
382 queue_delayed_work(system_nrt_wq, &sp->work, 0);
197 } 383 }
384 spin_unlock_irqrestore(&sp->queue_lock, flags);
385}
386EXPORT_SYMBOL_GPL(call_srcu);
198 387
199 sync_func(); /* Force memory barrier on all CPUs. */ 388struct rcu_synchronize {
389 struct rcu_head head;
390 struct completion completion;
391};
200 392
201 /* 393/*
202 * The preceding synchronize_sched() ensures that any CPU that 394 * Awaken the corresponding synchronize_srcu() instance now that a
203 * sees the new value of sp->completed will also see any preceding 395 * grace period has elapsed.
204 * changes to data structures made by this CPU. This prevents 396 */
205 * some other CPU from reordering the accesses in its SRCU 397static void wakeme_after_rcu(struct rcu_head *head)
206 * read-side critical section to precede the corresponding 398{
207 * srcu_read_lock() -- ensuring that such references will in 399 struct rcu_synchronize *rcu;
208 * fact be protected.
209 *
210 * So it is now safe to do the flip.
211 */
212 400
213 idx = sp->completed & 0x1; 401 rcu = container_of(head, struct rcu_synchronize, head);
214 sp->completed++; 402 complete(&rcu->completion);
403}
215 404
216 sync_func(); /* Force memory barrier on all CPUs. */ 405static void srcu_advance_batches(struct srcu_struct *sp, int trycount);
406static void srcu_reschedule(struct srcu_struct *sp);
217 407
218 /* 408/*
219 * At this point, because of the preceding synchronize_sched(), 409 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
220 * all srcu_read_lock() calls using the old counters have completed. 410 */
221 * Their corresponding critical sections might well be still 411static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
222 * executing, but the srcu_read_lock() primitives themselves 412{
223 * will have finished executing. We initially give readers 413 struct rcu_synchronize rcu;
224 * an arbitrarily chosen 10 microseconds to get out of their 414 struct rcu_head *head = &rcu.head;
225 * SRCU read-side critical sections, then loop waiting 1/HZ 415 bool done = false;
226 * seconds per iteration. The 10-microsecond value has done
227 * very well in testing.
228 */
229
230 if (srcu_readers_active_idx(sp, idx))
231 udelay(SYNCHRONIZE_SRCU_READER_DELAY);
232 while (srcu_readers_active_idx(sp, idx))
233 schedule_timeout_interruptible(1);
234 416
235 sync_func(); /* Force memory barrier on all CPUs. */ 417 rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
418 !lock_is_held(&rcu_bh_lock_map) &&
419 !lock_is_held(&rcu_lock_map) &&
420 !lock_is_held(&rcu_sched_lock_map),
421 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
236 422
237 /* 423 init_completion(&rcu.completion);
238 * The preceding synchronize_sched() forces all srcu_read_unlock() 424
239 * primitives that were executing concurrently with the preceding 425 head->next = NULL;
240 * for_each_possible_cpu() loop to have completed by this point. 426 head->func = wakeme_after_rcu;
241 * More importantly, it also forces the corresponding SRCU read-side 427 spin_lock_irq(&sp->queue_lock);
242 * critical sections to have also completed, and the corresponding 428 if (!sp->running) {
243 * references to SRCU-protected data items to be dropped. 429 /* steal the processing owner */
244 * 430 sp->running = true;
245 * Note: 431 rcu_batch_queue(&sp->batch_check0, head);
246 * 432 spin_unlock_irq(&sp->queue_lock);
247 * Despite what you might think at first glance, the 433
248 * preceding synchronize_sched() -must- be within the 434 srcu_advance_batches(sp, trycount);
249 * critical section ended by the following mutex_unlock(). 435 if (!rcu_batch_empty(&sp->batch_done)) {
250 * Otherwise, a task taking the early exit can race 436 BUG_ON(sp->batch_done.head != head);
251 * with a srcu_read_unlock(), which might have executed 437 rcu_batch_dequeue(&sp->batch_done);
252 * just before the preceding srcu_readers_active() check, 438 done = true;
253 * and whose CPU might have reordered the srcu_read_unlock() 439 }
254 * with the preceding critical section. In this case, there 440 /* give the processing owner to work_struct */
255 * is nothing preventing the synchronize_sched() task that is 441 srcu_reschedule(sp);
256 * taking the early exit from freeing a data structure that 442 } else {
257 * is still being referenced (out of order) by the task 443 rcu_batch_queue(&sp->batch_queue, head);
258 * doing the srcu_read_unlock(). 444 spin_unlock_irq(&sp->queue_lock);
259 * 445 }
260 * Alternatively, the comparison with "2" on the early exit
261 * could be changed to "3", but this increases synchronize_srcu()
262 * latency for bulk loads. So the current code is preferred.
263 */
264 446
265 mutex_unlock(&sp->mutex); 447 if (!done)
448 wait_for_completion(&rcu.completion);
266} 449}
267 450
268/** 451/**
@@ -281,7 +464,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
281 */ 464 */
282void synchronize_srcu(struct srcu_struct *sp) 465void synchronize_srcu(struct srcu_struct *sp)
283{ 466{
284 __synchronize_srcu(sp, synchronize_sched); 467 __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT);
285} 468}
286EXPORT_SYMBOL_GPL(synchronize_srcu); 469EXPORT_SYMBOL_GPL(synchronize_srcu);
287 470
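For reference, a hedged sketch of how a client might use the call_srcu() interface added above (struct foo, foo_srcu, and the helper names are hypothetical, and init_srcu_struct(&foo_srcu) is assumed to have been called during setup):

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/srcu.h>

struct foo {
	struct rcu_head rcu;
	int data;
};

static struct srcu_struct foo_srcu;	/* initialized elsewhere */

static void foo_reclaim(struct rcu_head *head)
{
	struct foo *fp = container_of(head, struct foo, rcu);

	kfree(fp);	/* safe: all pre-existing SRCU readers have finished */
}

static void foo_delete(struct foo *fp)
{
	/* Unpublish fp first, then hand it to SRCU for deferred freeing. */
	call_srcu(&foo_srcu, &fp->rcu, foo_reclaim);
}

Readers would bracket their accesses with srcu_read_lock(&foo_srcu) and srcu_read_unlock(&foo_srcu, idx) as before; call_srcu() simply removes the need for the updater to block in synchronize_srcu().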
@@ -289,18 +472,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
289 * synchronize_srcu_expedited - Brute-force SRCU grace period 472 * synchronize_srcu_expedited - Brute-force SRCU grace period
290 * @sp: srcu_struct with which to synchronize. 473 * @sp: srcu_struct with which to synchronize.
291 * 474 *
292 * Wait for an SRCU grace period to elapse, but use a "big hammer" 475 * Wait for an SRCU grace period to elapse, but be more aggressive about
293 * approach to force the grace period to end quickly. This consumes 476 * spinning rather than blocking when waiting.
294 * significant time on all CPUs and is unfriendly to real-time workloads,
295 * so is thus not recommended for any sort of common-case code. In fact,
296 * if you are using synchronize_srcu_expedited() in a loop, please
297 * restructure your code to batch your updates, and then use a single
298 * synchronize_srcu() instead.
299 * 477 *
300 * Note that it is illegal to call this function while holding any lock 478 * Note that it is illegal to call this function while holding any lock
301 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal 479 * that is acquired by a CPU-hotplug notifier. It is also illegal to call
302 * to call this function from a CPU-hotplug notifier. Failing to observe
303 * these restriction will result in deadlock. It is also illegal to call
304 * synchronize_srcu_expedited() from the corresponding SRCU read-side 480 * synchronize_srcu_expedited() from the corresponding SRCU read-side
305 * critical section; doing so will result in deadlock. However, it is 481 * critical section; doing so will result in deadlock. However, it is
306 * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct 482 * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct
@@ -309,20 +485,166 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
309 */ 485 */
310void synchronize_srcu_expedited(struct srcu_struct *sp) 486void synchronize_srcu_expedited(struct srcu_struct *sp)
311{ 487{
312 __synchronize_srcu(sp, synchronize_sched_expedited); 488 __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT);
313} 489}
314EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); 490EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
315 491
316/** 492/**
493 * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
494 */
495void srcu_barrier(struct srcu_struct *sp)
496{
497 synchronize_srcu(sp);
498}
499EXPORT_SYMBOL_GPL(srcu_barrier);
500
501/**
317 * srcu_batches_completed - return batches completed. 502 * srcu_batches_completed - return batches completed.
318 * @sp: srcu_struct on which to report batch completion. 503 * @sp: srcu_struct on which to report batch completion.
319 * 504 *
320 * Report the number of batches, correlated with, but not necessarily 505 * Report the number of batches, correlated with, but not necessarily
321 * precisely the same as, the number of grace periods that have elapsed. 506 * precisely the same as, the number of grace periods that have elapsed.
322 */ 507 */
323
324long srcu_batches_completed(struct srcu_struct *sp) 508long srcu_batches_completed(struct srcu_struct *sp)
325{ 509{
326 return sp->completed; 510 return sp->completed;
327} 511}
328EXPORT_SYMBOL_GPL(srcu_batches_completed); 512EXPORT_SYMBOL_GPL(srcu_batches_completed);
513
514#define SRCU_CALLBACK_BATCH 10
515#define SRCU_INTERVAL 1
516
517/*
518 * Move any new SRCU callbacks to the first stage of the SRCU grace
519 * period pipeline.
520 */
521static void srcu_collect_new(struct srcu_struct *sp)
522{
523 if (!rcu_batch_empty(&sp->batch_queue)) {
524 spin_lock_irq(&sp->queue_lock);
525 rcu_batch_move(&sp->batch_check0, &sp->batch_queue);
526 spin_unlock_irq(&sp->queue_lock);
527 }
528}
529
530/*
531 * Core SRCU state machine. Advance callbacks from ->batch_check0 to
532 * ->batch_check1 and then to ->batch_done as readers drain.
533 */
534static void srcu_advance_batches(struct srcu_struct *sp, int trycount)
535{
536 int idx = 1 ^ (sp->completed & 1);
537
538 /*
539 * Because readers might be delayed for an extended period after
540 * fetching ->completed for their index, at any point in time there
541 * might well be readers using both idx=0 and idx=1. We therefore
542 * need to wait for readers to clear from both index values before
543 * invoking a callback.
544 */
545
546 if (rcu_batch_empty(&sp->batch_check0) &&
547 rcu_batch_empty(&sp->batch_check1))
548 return; /* no callbacks need to be advanced */
549
550 if (!try_check_zero(sp, idx, trycount))
551 return; /* failed to advance, will try after SRCU_INTERVAL */
552
553 /*
 554 * The callbacks in ->batch_check1 have already been through their
 555 * first zero check and counter flip, back when they were enqueued on
556 * ->batch_check0 in a previous invocation of srcu_advance_batches().
557 * (Presumably try_check_zero() returned false during that
558 * invocation, leaving the callbacks stranded on ->batch_check1.)
559 * They are therefore ready to invoke, so move them to ->batch_done.
560 */
561 rcu_batch_move(&sp->batch_done, &sp->batch_check1);
562
563 if (rcu_batch_empty(&sp->batch_check0))
564 return; /* no callbacks need to be advanced */
565 srcu_flip(sp);
566
567 /*
568 * The callbacks in ->batch_check0 just finished their
 569 * first zero check and flip, so move them to ->batch_check1
570 * for future checking on the other idx.
571 */
572 rcu_batch_move(&sp->batch_check1, &sp->batch_check0);
573
574 /*
575 * SRCU read-side critical sections are normally short, so check
576 * at least twice in quick succession after a flip.
577 */
578 trycount = trycount < 2 ? 2 : trycount;
579 if (!try_check_zero(sp, idx^1, trycount))
580 return; /* failed to advance, will try after SRCU_INTERVAL */
581
582 /*
583 * The callbacks in ->batch_check1 have now waited for all
584 * pre-existing readers using both idx values. They are therefore
585 * ready to invoke, so move them to ->batch_done.
586 */
587 rcu_batch_move(&sp->batch_done, &sp->batch_check1);
588}
589
590/*
591 * Invoke a limited number of SRCU callbacks that have passed through
592 * their grace period. If there are more to do, SRCU will reschedule
593 * the workqueue.
594 */
595static void srcu_invoke_callbacks(struct srcu_struct *sp)
596{
597 int i;
598 struct rcu_head *head;
599
600 for (i = 0; i < SRCU_CALLBACK_BATCH; i++) {
601 head = rcu_batch_dequeue(&sp->batch_done);
602 if (!head)
603 break;
604 local_bh_disable();
605 head->func(head);
606 local_bh_enable();
607 }
608}
609
610/*
611 * Finished one round of SRCU grace period. Start another if there are
612 * more SRCU callbacks queued, otherwise put SRCU into not-running state.
613 */
614static void srcu_reschedule(struct srcu_struct *sp)
615{
616 bool pending = true;
617
618 if (rcu_batch_empty(&sp->batch_done) &&
619 rcu_batch_empty(&sp->batch_check1) &&
620 rcu_batch_empty(&sp->batch_check0) &&
621 rcu_batch_empty(&sp->batch_queue)) {
622 spin_lock_irq(&sp->queue_lock);
623 if (rcu_batch_empty(&sp->batch_done) &&
624 rcu_batch_empty(&sp->batch_check1) &&
625 rcu_batch_empty(&sp->batch_check0) &&
626 rcu_batch_empty(&sp->batch_queue)) {
627 sp->running = false;
628 pending = false;
629 }
630 spin_unlock_irq(&sp->queue_lock);
631 }
632
633 if (pending)
634 queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL);
635}
636
637/*
638 * This is the work-queue function that handles SRCU grace periods.
639 */
640static void process_srcu(struct work_struct *work)
641{
642 struct srcu_struct *sp;
643
644 sp = container_of(work, struct srcu_struct, work.work);
645
646 srcu_collect_new(sp);
647 srcu_advance_batches(sp, 1);
648 srcu_invoke_callbacks(sp);
649 srcu_reschedule(sp);
650}
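process_srcu() and srcu_reschedule() together form a self-rearming delayed-work loop: each pass collects new callbacks, tries to advance the batches, invokes anything that has graduated to ->batch_done, and requeues itself after SRCU_INTERVAL if work remains. A hedged sketch of just that requeue pattern (a hypothetical module using the ordinary system workqueue rather than system_nrt_wq):

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

static void demo_work_func(struct work_struct *work);
static DECLARE_DELAYED_WORK(demo_work, demo_work_func);
static bool demo_stop;

static void demo_work_func(struct work_struct *work)
{
	pr_info("demo work pass\n");
	if (!demo_stop)					/* more "callbacks" pending? */
		schedule_delayed_work(&demo_work, HZ);	/* rearm ourselves */
}

static int __init demo_init(void)
{
	schedule_delayed_work(&demo_work, 0);	/* kick off the state machine */
	return 0;
}

static void __exit demo_exit(void)
{
	demo_stop = true;
	cancel_delayed_work_sync(&demo_work);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");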
diff --git a/kernel/timer.c b/kernel/timer.c
index a297ffcf888e..837c552fe838 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -861,7 +861,13 @@ EXPORT_SYMBOL(mod_timer);
861 * 861 *
862 * mod_timer_pinned() is a way to update the expire field of an 862 * mod_timer_pinned() is a way to update the expire field of an
863 * active timer (if the timer is inactive it will be activated) 863 * active timer (if the timer is inactive it will be activated)
864 * and not allow the timer to be migrated to a different CPU. 864 * and to ensure that the timer is scheduled on the current CPU.
865 *
866 * Note that this does not prevent the timer from being migrated
867 * when the current CPU goes offline. If this is a problem for
868 * you, use CPU-hotplug notifiers to handle it correctly, for
869 * example, cancelling the timer when the corresponding CPU goes
870 * offline.
865 * 871 *
866 * mod_timer_pinned(timer, expires) is equivalent to: 872 * mod_timer_pinned(timer, expires) is equivalent to:
867 * 873 *