aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- include/linux/rcuclassic.h 3
-rw-r--r-- include/linux/rcupdate.h 22
-rw-r--r-- include/linux/rcupreempt.h 42
-rw-r--r-- init/main.c 1
-rw-r--r-- kernel/rcupdate.c 20
-rw-r--r-- kernel/rcupreempt.c 414
6 files changed, 434 insertions, 68 deletions
diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index b3aa05baab8a..8c774905dcfe 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -151,7 +151,10 @@ extern struct lockdep_map rcu_lock_map;
151 151
152#define __synchronize_sched() synchronize_rcu() 152#define __synchronize_sched() synchronize_rcu()
153 153
154#define call_rcu_sched(head, func) call_rcu(head, func)
155
154extern void __rcu_init(void); 156extern void __rcu_init(void);
157#define rcu_init_sched() do { } while (0)
155extern void rcu_check_callbacks(int cpu, int user); 158extern void rcu_check_callbacks(int cpu, int user);
156extern void rcu_restart_cpu(int cpu); 159extern void rcu_restart_cpu(int cpu);
157 160
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index ec2fc5b32646..411969cb5243 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -40,6 +40,7 @@
40#include <linux/cpumask.h> 40#include <linux/cpumask.h>
41#include <linux/seqlock.h> 41#include <linux/seqlock.h>
42#include <linux/lockdep.h> 42#include <linux/lockdep.h>
43#include <linux/completion.h>
43 44
44/** 45/**
45 * struct rcu_head - callback structure for use with RCU 46 * struct rcu_head - callback structure for use with RCU
@@ -168,6 +169,27 @@ struct rcu_head {
168 (p) = (v); \ 169 (p) = (v); \
169 }) 170 })
170 171
172/* Infrastructure to implement the synchronize_() primitives. */
173
174struct rcu_synchronize {
175 struct rcu_head head;
176 struct completion completion;
177};
178
179extern void wakeme_after_rcu(struct rcu_head *head);
180
181#define synchronize_rcu_xxx(name, func) \
182void name(void) \
183{ \
184 struct rcu_synchronize rcu; \
185 \
186 init_completion(&rcu.completion); \
187 /* Will wake me after RCU finished. */ \
188 func(&rcu.head, wakeme_after_rcu); \
189 /* Wait for it. */ \
190 wait_for_completion(&rcu.completion); \
191}
192
171/** 193/**
172 * synchronize_sched - block until all CPUs have exited any non-preemptive 194 * synchronize_sched - block until all CPUs have exited any non-preemptive
173 * kernel code sequences. 195 * kernel code sequences.
diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h
index 8a05c7e20bc4..f04b64eca636 100644
--- a/include/linux/rcupreempt.h
+++ b/include/linux/rcupreempt.h
@@ -40,10 +40,39 @@
40#include <linux/cpumask.h> 40#include <linux/cpumask.h>
41#include <linux/seqlock.h> 41#include <linux/seqlock.h>
42 42
43#define rcu_qsctr_inc(cpu) 43struct rcu_dyntick_sched {
44 int dynticks;
45 int dynticks_snap;
46 int sched_qs;
47 int sched_qs_snap;
48 int sched_dynticks_snap;
49};
50
51DECLARE_PER_CPU(struct rcu_dyntick_sched, rcu_dyntick_sched);
52
53static inline void rcu_qsctr_inc(int cpu)
54{
55 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
56
57 rdssp->sched_qs++;
58}
44#define rcu_bh_qsctr_inc(cpu) 59#define rcu_bh_qsctr_inc(cpu)
45#define call_rcu_bh(head, rcu) call_rcu(head, rcu) 60#define call_rcu_bh(head, rcu) call_rcu(head, rcu)
46 61
62/**
63 * call_rcu_sched - Queue RCU callback for invocation after sched grace period.
64 * @head: structure to be used for queueing the RCU updates.
65 * @func: actual update function to be invoked after the grace period
66 *
67 * The update function will be invoked some time after a full
68 * synchronize_sched()-style grace period elapses, in other words after
69 * all currently executing preempt-disabled sections of code (including
70 * hardirq handlers, NMI handlers, and local_irq_save() blocks) have
71 * completed.
72 */
73extern void call_rcu_sched(struct rcu_head *head,
74 void (*func)(struct rcu_head *head));
75
47extern void __rcu_read_lock(void) __acquires(RCU); 76extern void __rcu_read_lock(void) __acquires(RCU);
48extern void __rcu_read_unlock(void) __releases(RCU); 77extern void __rcu_read_unlock(void) __releases(RCU);
49extern int rcu_pending(int cpu); 78extern int rcu_pending(int cpu);
@@ -55,6 +84,7 @@ extern int rcu_needs_cpu(int cpu);
55extern void __synchronize_sched(void); 84extern void __synchronize_sched(void);
56 85
57extern void __rcu_init(void); 86extern void __rcu_init(void);
87extern void rcu_init_sched(void);
58extern void rcu_check_callbacks(int cpu, int user); 88extern void rcu_check_callbacks(int cpu, int user);
59extern void rcu_restart_cpu(int cpu); 89extern void rcu_restart_cpu(int cpu);
60extern long rcu_batches_completed(void); 90extern long rcu_batches_completed(void);
@@ -81,20 +111,20 @@ extern struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu);
81struct softirq_action; 111struct softirq_action;
82 112
83#ifdef CONFIG_NO_HZ 113#ifdef CONFIG_NO_HZ
84DECLARE_PER_CPU(long, dynticks_progress_counter); 114DECLARE_PER_CPU(struct rcu_dyntick_sched, rcu_dyntick_sched);
85 115
86static inline void rcu_enter_nohz(void) 116static inline void rcu_enter_nohz(void)
87{ 117{
88 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ 118 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
89 __get_cpu_var(dynticks_progress_counter)++; 119 __get_cpu_var(rcu_dyntick_sched).dynticks++;
90 WARN_ON(__get_cpu_var(dynticks_progress_counter) & 0x1); 120 WARN_ON(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1);
91} 121}
92 122
93static inline void rcu_exit_nohz(void) 123static inline void rcu_exit_nohz(void)
94{ 124{
95 __get_cpu_var(dynticks_progress_counter)++;
96 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 125 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
97 WARN_ON(!(__get_cpu_var(dynticks_progress_counter) & 0x1)); 126 __get_cpu_var(rcu_dyntick_sched).dynticks++;
127 WARN_ON(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1));
98} 128}
99 129
100#else /* CONFIG_NO_HZ */ 130#else /* CONFIG_NO_HZ */
diff --git a/init/main.c b/init/main.c
index f7fb20021d48..a9cc3e0803de 100644
--- a/init/main.c
+++ b/init/main.c
@@ -758,6 +758,7 @@ static void __init do_initcalls(void)
758 */ 758 */
759static void __init do_basic_setup(void) 759static void __init do_basic_setup(void)
760{ 760{
761 rcu_init_sched(); /* needed by module_init stage. */
761 /* drivers will send hotplug events */ 762 /* drivers will send hotplug events */
762 init_workqueues(); 763 init_workqueues();
763 usermodehelper_init(); 764 usermodehelper_init();
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index c09605f8d16c..a4e329d92883 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -39,18 +39,12 @@
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <asm/atomic.h> 40#include <asm/atomic.h>
41#include <linux/bitops.h> 41#include <linux/bitops.h>
42#include <linux/completion.h>
43#include <linux/percpu.h> 42#include <linux/percpu.h>
44#include <linux/notifier.h> 43#include <linux/notifier.h>
45#include <linux/cpu.h> 44#include <linux/cpu.h>
46#include <linux/mutex.h> 45#include <linux/mutex.h>
47#include <linux/module.h> 46#include <linux/module.h>
48 47
49struct rcu_synchronize {
50 struct rcu_head head;
51 struct completion completion;
52};
53
54static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 48static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
55static atomic_t rcu_barrier_cpu_count; 49static atomic_t rcu_barrier_cpu_count;
56static DEFINE_MUTEX(rcu_barrier_mutex); 50static DEFINE_MUTEX(rcu_barrier_mutex);
@@ -60,7 +54,7 @@ static struct completion rcu_barrier_completion;
60 * Awaken the corresponding synchronize_rcu() instance now that a 54 * Awaken the corresponding synchronize_rcu() instance now that a
61 * grace period has elapsed. 55 * grace period has elapsed.
62 */ 56 */
63static void wakeme_after_rcu(struct rcu_head *head) 57void wakeme_after_rcu(struct rcu_head *head)
64{ 58{
65 struct rcu_synchronize *rcu; 59 struct rcu_synchronize *rcu;
66 60
@@ -77,17 +71,7 @@ static void wakeme_after_rcu(struct rcu_head *head)
77 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 71 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
78 * and may be nested. 72 * and may be nested.
79 */ 73 */
80void synchronize_rcu(void) 74synchronize_rcu_xxx(synchronize_rcu, call_rcu)
81{
82 struct rcu_synchronize rcu;
83
84 init_completion(&rcu.completion);
85 /* Will wake me after RCU finished */
86 call_rcu(&rcu.head, wakeme_after_rcu);
87
88 /* Wait for it */
89 wait_for_completion(&rcu.completion);
90}
91EXPORT_SYMBOL_GPL(synchronize_rcu); 75EXPORT_SYMBOL_GPL(synchronize_rcu);
92 76
93static void rcu_barrier_callback(struct rcu_head *notused) 77static void rcu_barrier_callback(struct rcu_head *notused)
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 5e02b7740702..aaa7976bd85f 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -46,6 +46,7 @@
46#include <asm/atomic.h> 46#include <asm/atomic.h>
47#include <linux/bitops.h> 47#include <linux/bitops.h>
48#include <linux/module.h> 48#include <linux/module.h>
49#include <linux/kthread.h>
49#include <linux/completion.h> 50#include <linux/completion.h>
50#include <linux/moduleparam.h> 51#include <linux/moduleparam.h>
51#include <linux/percpu.h> 52#include <linux/percpu.h>
@@ -87,9 +88,14 @@ struct rcu_data {
87 struct rcu_head **nexttail; 88 struct rcu_head **nexttail;
88 struct rcu_head *waitlist[GP_STAGES]; 89 struct rcu_head *waitlist[GP_STAGES];
89 struct rcu_head **waittail[GP_STAGES]; 90 struct rcu_head **waittail[GP_STAGES];
90 struct rcu_head *donelist; 91 struct rcu_head *donelist; /* from waitlist & waitschedlist */
91 struct rcu_head **donetail; 92 struct rcu_head **donetail;
92 long rcu_flipctr[2]; 93 long rcu_flipctr[2];
94 struct rcu_head *nextschedlist;
95 struct rcu_head **nextschedtail;
96 struct rcu_head *waitschedlist;
97 struct rcu_head **waitschedtail;
98 int rcu_sched_sleeping;
93#ifdef CONFIG_RCU_TRACE 99#ifdef CONFIG_RCU_TRACE
94 struct rcupreempt_trace trace; 100 struct rcupreempt_trace trace;
95#endif /* #ifdef CONFIG_RCU_TRACE */ 101#endif /* #ifdef CONFIG_RCU_TRACE */
@@ -131,11 +137,24 @@ enum rcu_try_flip_states {
131 rcu_try_flip_waitmb_state, 137 rcu_try_flip_waitmb_state,
132}; 138};
133 139
140/*
141 * States for rcu_ctrlblk.sched_sleep.
142 */
143
144enum rcu_sched_sleep_states {
145 rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP. */
146 rcu_sched_sleep_prep, /* Thinking of sleeping, rechecking. */
147 rcu_sched_sleeping, /* Sleeping, awaken if GP needed. */
148};
149
134struct rcu_ctrlblk { 150struct rcu_ctrlblk {
135 spinlock_t fliplock; /* Protect state-machine transitions. */ 151 spinlock_t fliplock; /* Protect state-machine transitions. */
136 long completed; /* Number of last completed batch. */ 152 long completed; /* Number of last completed batch. */
137 enum rcu_try_flip_states rcu_try_flip_state; /* The current state of 153 enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
138 the rcu state machine */ 154 the rcu state machine */
155 spinlock_t schedlock; /* Protect rcu_sched sleep state. */
156 enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */
157 wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */
139}; 158};
140 159
141static DEFINE_PER_CPU(struct rcu_data, rcu_data); 160static DEFINE_PER_CPU(struct rcu_data, rcu_data);
@@ -143,8 +162,12 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
143 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), 162 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
144 .completed = 0, 163 .completed = 0,
145 .rcu_try_flip_state = rcu_try_flip_idle_state, 164 .rcu_try_flip_state = rcu_try_flip_idle_state,
165 .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock),
166 .sched_sleep = rcu_sched_not_sleeping,
167 .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq),
146}; 168};
147 169
170static struct task_struct *rcu_sched_grace_period_task;
148 171
149#ifdef CONFIG_RCU_TRACE 172#ifdef CONFIG_RCU_TRACE
150static char *rcu_try_flip_state_names[] = 173static char *rcu_try_flip_state_names[] =
@@ -207,6 +230,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
207 */ 230 */
208#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace)); 231#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
209 232
233#define RCU_SCHED_BATCH_TIME (HZ / 50)
234
210/* 235/*
211 * Return the number of RCU batches processed thus far. Useful 236 * Return the number of RCU batches processed thus far. Useful
212 * for debug and statistics. 237 * for debug and statistics.
@@ -411,32 +436,34 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp)
411 } 436 }
412} 437}
413 438
414#ifdef CONFIG_NO_HZ 439DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
440 .dynticks = 1,
441};
415 442
416DEFINE_PER_CPU(long, dynticks_progress_counter) = 1; 443#ifdef CONFIG_NO_HZ
417static DEFINE_PER_CPU(long, rcu_dyntick_snapshot);
418static DEFINE_PER_CPU(int, rcu_update_flag); 444static DEFINE_PER_CPU(int, rcu_update_flag);
419 445
420/** 446/**
421 * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI. 447 * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
422 * 448 *
423 * If the CPU was idle with dynamic ticks active, this updates the 449 * If the CPU was idle with dynamic ticks active, this updates the
424 * dynticks_progress_counter to let the RCU handling know that the 450 * rcu_dyntick_sched.dynticks to let the RCU handling know that the
425 * CPU is active. 451 * CPU is active.
426 */ 452 */
427void rcu_irq_enter(void) 453void rcu_irq_enter(void)
428{ 454{
429 int cpu = smp_processor_id(); 455 int cpu = smp_processor_id();
456 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
430 457
431 if (per_cpu(rcu_update_flag, cpu)) 458 if (per_cpu(rcu_update_flag, cpu))
432 per_cpu(rcu_update_flag, cpu)++; 459 per_cpu(rcu_update_flag, cpu)++;
433 460
434 /* 461 /*
435 * Only update if we are coming from a stopped ticks mode 462 * Only update if we are coming from a stopped ticks mode
436 * (dynticks_progress_counter is even). 463 * (rcu_dyntick_sched.dynticks is even).
437 */ 464 */
438 if (!in_interrupt() && 465 if (!in_interrupt() &&
439 (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) { 466 (rdssp->dynticks & 0x1) == 0) {
440 /* 467 /*
441 * The following might seem like we could have a race 468 * The following might seem like we could have a race
442 * with NMI/SMIs. But this really isn't a problem. 469 * with NMI/SMIs. But this really isn't a problem.
@@ -459,12 +486,12 @@ void rcu_irq_enter(void)
459 * RCU read-side critical sections on this CPU would 486 * RCU read-side critical sections on this CPU would
460 * have already completed. 487 * have already completed.
461 */ 488 */
462 per_cpu(dynticks_progress_counter, cpu)++; 489 rdssp->dynticks++;
463 /* 490 /*
464 * The following memory barrier ensures that any 491 * The following memory barrier ensures that any
465 * rcu_read_lock() primitives in the irq handler 492 * rcu_read_lock() primitives in the irq handler
466 * are seen by other CPUs to follow the above 493 * are seen by other CPUs to follow the above
467 * increment to dynticks_progress_counter. This is 494 * increment to rcu_dyntick_sched.dynticks. This is
468 * required in order for other CPUs to correctly 495 * required in order for other CPUs to correctly
469 * determine when it is safe to advance the RCU 496 * determine when it is safe to advance the RCU
470 * grace-period state machine. 497 * grace-period state machine.
@@ -472,7 +499,7 @@ void rcu_irq_enter(void)
472 smp_mb(); /* see above block comment. */ 499 smp_mb(); /* see above block comment. */
473 /* 500 /*
474 * Since we can't determine the dynamic tick mode from 501 * Since we can't determine the dynamic tick mode from
475 * the dynticks_progress_counter after this routine, 502 * the rcu_dyntick_sched.dynticks after this routine,
476 * we use a second flag to acknowledge that we came 503 * we use a second flag to acknowledge that we came
477 * from an idle state with ticks stopped. 504 * from an idle state with ticks stopped.
478 */ 505 */
@@ -480,7 +507,7 @@ void rcu_irq_enter(void)
480 /* 507 /*
481 * If we take an NMI/SMI now, they will also increment 508 * If we take an NMI/SMI now, they will also increment
482 * the rcu_update_flag, and will not update the 509 * the rcu_update_flag, and will not update the
483 * dynticks_progress_counter on exit. That is for 510 * rcu_dyntick_sched.dynticks on exit. That is for
484 * this IRQ to do. 511 * this IRQ to do.
485 */ 512 */
486 } 513 }
@@ -490,12 +517,13 @@ void rcu_irq_enter(void)
490 * rcu_irq_exit - Called from exiting Hard irq context. 517 * rcu_irq_exit - Called from exiting Hard irq context.
491 * 518 *
492 * If the CPU was idle with dynamic ticks active, update the 519 * If the CPU was idle with dynamic ticks active, update the
493 * dynticks_progress_counter to put let the RCU handling be 520 * rcu_dyntick_sched.dynticks to let the RCU handling be
494 * aware that the CPU is going back to idle with no ticks. 521 * aware that the CPU is going back to idle with no ticks.
495 */ 522 */
496void rcu_irq_exit(void) 523void rcu_irq_exit(void)
497{ 524{
498 int cpu = smp_processor_id(); 525 int cpu = smp_processor_id();
526 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
499 527
500 /* 528 /*
501 * rcu_update_flag is set if we interrupted the CPU 529 * rcu_update_flag is set if we interrupted the CPU
@@ -503,7 +531,7 @@ void rcu_irq_exit(void)
503 * Once this occurs, we keep track of interrupt nesting 531 * Once this occurs, we keep track of interrupt nesting
504 * because a NMI/SMI could also come in, and we still 532 * because a NMI/SMI could also come in, and we still
505 * only want the IRQ that started the increment of the 533 * only want the IRQ that started the increment of the
506 * dynticks_progress_counter to be the one that modifies 534 * rcu_dyntick_sched.dynticks to be the one that modifies
507 * it on exit. 535 * it on exit.
508 */ 536 */
509 if (per_cpu(rcu_update_flag, cpu)) { 537 if (per_cpu(rcu_update_flag, cpu)) {
@@ -515,28 +543,29 @@ void rcu_irq_exit(void)
515 543
516 /* 544 /*
517 * If an NMI/SMI happens now we are still 545 * If an NMI/SMI happens now we are still
518 * protected by the dynticks_progress_counter being odd. 546 * protected by the rcu_dyntick_sched.dynticks being odd.
519 */ 547 */
520 548
521 /* 549 /*
522 * The following memory barrier ensures that any 550 * The following memory barrier ensures that any
523 * rcu_read_unlock() primitives in the irq handler 551 * rcu_read_unlock() primitives in the irq handler
524 * are seen by other CPUs to precede the following 552 * are seen by other CPUs to precede the following
525 * increment to dynticks_progress_counter. This 553 * increment to rcu_dyntick_sched.dynticks. This
526 * is required in order for other CPUs to determine 554 * is required in order for other CPUs to determine
527 * when it is safe to advance the RCU grace-period 555 * when it is safe to advance the RCU grace-period
528 * state machine. 556 * state machine.
529 */ 557 */
530 smp_mb(); /* see above block comment. */ 558 smp_mb(); /* see above block comment. */
531 per_cpu(dynticks_progress_counter, cpu)++; 559 rdssp->dynticks++;
532 WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1); 560 WARN_ON(rdssp->dynticks & 0x1);
533 } 561 }
534} 562}
535 563
536static void dyntick_save_progress_counter(int cpu) 564static void dyntick_save_progress_counter(int cpu)
537{ 565{
538 per_cpu(rcu_dyntick_snapshot, cpu) = 566 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
539 per_cpu(dynticks_progress_counter, cpu); 567
568 rdssp->dynticks_snap = rdssp->dynticks;
540} 569}
541 570
542static inline int 571static inline int
@@ -544,9 +573,10 @@ rcu_try_flip_waitack_needed(int cpu)
544{ 573{
545 long curr; 574 long curr;
546 long snap; 575 long snap;
576 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
547 577
548 curr = per_cpu(dynticks_progress_counter, cpu); 578 curr = rdssp->dynticks;
549 snap = per_cpu(rcu_dyntick_snapshot, cpu); 579 snap = rdssp->dynticks_snap;
550 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ 580 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
551 581
552 /* 582 /*
@@ -580,9 +610,10 @@ rcu_try_flip_waitmb_needed(int cpu)
580{ 610{
581 long curr; 611 long curr;
582 long snap; 612 long snap;
613 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
583 614
584 curr = per_cpu(dynticks_progress_counter, cpu); 615 curr = rdssp->dynticks;
585 snap = per_cpu(rcu_dyntick_snapshot, cpu); 616 snap = rdssp->dynticks_snap;
586 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ 617 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
587 618
588 /* 619 /*
@@ -609,14 +640,86 @@ rcu_try_flip_waitmb_needed(int cpu)
609 return 1; 640 return 1;
610} 641}
611 642
643static void dyntick_save_progress_counter_sched(int cpu)
644{
645 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
646
647 rdssp->sched_dynticks_snap = rdssp->dynticks;
648}
649
650static int rcu_qsctr_inc_needed_dyntick(int cpu)
651{
652 long curr;
653 long snap;
654 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
655
656 curr = rdssp->dynticks;
657 snap = rdssp->sched_dynticks_snap;
658 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
659
660 /*
661 * If the CPU remained in dynticks mode for the entire time
662 * and didn't take any interrupts, NMIs, SMIs, or whatever,
663 * then it cannot be in the middle of an rcu_read_lock(), so
664 * the next rcu_read_lock() it executes must use the new value
665 * of the counter. Therefore, this CPU has been in a quiescent
666 * state the entire time, and we don't need to wait for it.
667 */
668
669 if ((curr == snap) && ((curr & 0x1) == 0))
670 return 0;
671
672 /*
673 * If the CPU passed through or entered a dynticks idle phase with
674 * no active irq handlers, then, as above, this CPU has already
675 * passed through a quiescent state.
676 */
677
678 if ((curr - snap) > 2 || (snap & 0x1) == 0)
679 return 0;
680
681 /* We need this CPU to go through a quiescent state. */
682
683 return 1;
684}
685
612#else /* !CONFIG_NO_HZ */ 686#else /* !CONFIG_NO_HZ */
613 687
614# define dyntick_save_progress_counter(cpu) do { } while (0) 688# define dyntick_save_progress_counter(cpu) do { } while (0)
615# define rcu_try_flip_waitack_needed(cpu) (1) 689# define rcu_try_flip_waitack_needed(cpu) (1)
616# define rcu_try_flip_waitmb_needed(cpu) (1) 690# define rcu_try_flip_waitmb_needed(cpu) (1)
691
692# define dyntick_save_progress_counter_sched(cpu) do { } while (0)
693# define rcu_qsctr_inc_needed_dyntick(cpu) (1)
617 694
618#endif /* CONFIG_NO_HZ */ 695#endif /* CONFIG_NO_HZ */
619 696
697static void save_qsctr_sched(int cpu)
698{
699 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
700
701 rdssp->sched_qs_snap = rdssp->sched_qs;
702}
703
704static inline int rcu_qsctr_inc_needed(int cpu)
705{
706 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
707
708 /*
709 * If there has been a quiescent state, no more need to wait
710 * on this CPU.
711 */
712
713 if (rdssp->sched_qs != rdssp->sched_qs_snap) {
714 smp_mb(); /* force ordering with cpu entering schedule(). */
715 return 0;
716 }
717
718 /* We need this CPU to go through a quiescent state. */
719
720 return 1;
721}
722
620/* 723/*
621 * Get here when RCU is idle. Decide whether we need to 724 * Get here when RCU is idle. Decide whether we need to
622 * move out of idle state, and return non-zero if so. 725 * move out of idle state, and return non-zero if so.
@@ -819,6 +922,26 @@ void rcu_check_callbacks(int cpu, int user)
819 unsigned long flags; 922 unsigned long flags;
820 struct rcu_data *rdp = RCU_DATA_CPU(cpu); 923 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
821 924
925 /*
926 * If this CPU took its interrupt from user mode or from the
927 * idle loop, and this is not a nested interrupt, then
928 * this CPU has to have exited all prior preempt-disable
929 * sections of code. So increment the counter to note this.
930 *
931 * The memory barrier is needed to handle the case where
932 * writes from a preempt-disable section of code get reordered
933 * into schedule() by this CPU's write buffer. So the memory
934 * barrier makes sure that the rcu_qsctr_inc() is seen by other
935 * CPUs to happen after any such write.
936 */
937
938 if (user ||
939 (idle_cpu(cpu) && !in_softirq() &&
940 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
941 smp_mb(); /* Guard against aggressive schedule(). */
942 rcu_qsctr_inc(cpu);
943 }
944
822 rcu_check_mb(cpu); 945 rcu_check_mb(cpu);
823 if (rcu_ctrlblk.completed == rdp->completed) 946 if (rcu_ctrlblk.completed == rdp->completed)
824 rcu_try_flip(); 947 rcu_try_flip();
@@ -869,6 +992,8 @@ void rcu_offline_cpu(int cpu)
869 struct rcu_head *list = NULL; 992 struct rcu_head *list = NULL;
870 unsigned long flags; 993 unsigned long flags;
871 struct rcu_data *rdp = RCU_DATA_CPU(cpu); 994 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
995 struct rcu_head *schedlist = NULL;
996 struct rcu_head **schedtail = &schedlist;
872 struct rcu_head **tail = &list; 997 struct rcu_head **tail = &list;
873 998
874 /* 999 /*
@@ -882,6 +1007,11 @@ void rcu_offline_cpu(int cpu)
882 rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i], 1007 rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
883 list, tail); 1008 list, tail);
884 rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail); 1009 rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
1010 rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail,
1011 schedlist, schedtail);
1012 rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail,
1013 schedlist, schedtail);
1014 rdp->rcu_sched_sleeping = 0;
885 spin_unlock_irqrestore(&rdp->lock, flags); 1015 spin_unlock_irqrestore(&rdp->lock, flags);
886 rdp->waitlistcount = 0; 1016 rdp->waitlistcount = 0;
887 1017
@@ -916,22 +1046,40 @@ void rcu_offline_cpu(int cpu)
916 * fix. 1046 * fix.
917 */ 1047 */
918 1048
919 local_irq_save(flags); 1049 local_irq_save(flags); /* disable preempt till we know what lock. */
920 rdp = RCU_DATA_ME(); 1050 rdp = RCU_DATA_ME();
921 spin_lock(&rdp->lock); 1051 spin_lock(&rdp->lock);
922 *rdp->nexttail = list; 1052 *rdp->nexttail = list;
923 if (list) 1053 if (list)
924 rdp->nexttail = tail; 1054 rdp->nexttail = tail;
1055 *rdp->nextschedtail = schedlist;
1056 if (schedlist)
1057 rdp->nextschedtail = schedtail;
925 spin_unlock_irqrestore(&rdp->lock, flags); 1058 spin_unlock_irqrestore(&rdp->lock, flags);
926} 1059}
927 1060
928void __devinit rcu_online_cpu(int cpu) 1061void __devinit rcu_online_cpu(int cpu)
929{ 1062{
930 unsigned long flags; 1063 unsigned long flags;
1064 struct rcu_data *rdp;
931 1065
932 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); 1066 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
933 cpu_set(cpu, rcu_cpu_online_map); 1067 cpu_set(cpu, rcu_cpu_online_map);
934 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); 1068 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1069
1070 /*
1071 * The rcu_sched grace-period processing might have bypassed
1072 * this CPU, given that it was not in the rcu_cpu_online_map
1073 * when the grace-period scan started. This means that the
1074 * grace-period task might sleep. So make sure that if this
1075 * should happen, the first callback posted to this CPU will
1076 * wake up the grace-period task if need be.
1077 */
1078
1079 rdp = RCU_DATA_CPU(cpu);
1080 spin_lock_irqsave(&rdp->lock, flags);
1081 rdp->rcu_sched_sleeping = 1;
1082 spin_unlock_irqrestore(&rdp->lock, flags);
935} 1083}
936 1084
937#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1085#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -986,31 +1134,196 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
986 *rdp->nexttail = head; 1134 *rdp->nexttail = head;
987 rdp->nexttail = &head->next; 1135 rdp->nexttail = &head->next;
988 RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp); 1136 RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
989 spin_unlock(&rdp->lock); 1137 spin_unlock_irqrestore(&rdp->lock, flags);
990 local_irq_restore(flags);
991} 1138}
992EXPORT_SYMBOL_GPL(call_rcu); 1139EXPORT_SYMBOL_GPL(call_rcu);
993 1140
1141void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1142{
1143 unsigned long flags;
1144 struct rcu_data *rdp;
1145 int wake_gp = 0;
1146
1147 head->func = func;
1148 head->next = NULL;
1149 local_irq_save(flags);
1150 rdp = RCU_DATA_ME();
1151 spin_lock(&rdp->lock);
1152 *rdp->nextschedtail = head;
1153 rdp->nextschedtail = &head->next;
1154 if (rdp->rcu_sched_sleeping) {
1155
1156 /* Grace-period processing might be sleeping... */
1157
1158 rdp->rcu_sched_sleeping = 0;
1159 wake_gp = 1;
1160 }
1161 spin_unlock_irqrestore(&rdp->lock, flags);
1162 if (wake_gp) {
1163
1164 /* Wake up grace-period processing, unless someone beat us. */
1165
1166 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1167 if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping)
1168 wake_gp = 0;
1169 rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping;
1170 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1171 if (wake_gp)
1172 wake_up_interruptible(&rcu_ctrlblk.sched_wq);
1173 }
1174}
1175EXPORT_SYMBOL_GPL(call_rcu_sched);
1176
994/* 1177/*
995 * Wait until all currently running preempt_disable() code segments 1178 * Wait until all currently running preempt_disable() code segments
996 * (including hardware-irq-disable segments) complete. Note that 1179 * (including hardware-irq-disable segments) complete. Note that
997 * in -rt this does -not- necessarily result in all currently executing 1180 * in -rt this does -not- necessarily result in all currently executing
998 * interrupt -handlers- having completed. 1181 * interrupt -handlers- having completed.
999 */ 1182 */
1000void __synchronize_sched(void) 1183synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched)
1184EXPORT_SYMBOL_GPL(__synchronize_sched);
1185
1186/*
1187 * kthread function that manages call_rcu_sched grace periods.
1188 */
1189static int rcu_sched_grace_period(void *arg)
1001{ 1190{
1002 cpumask_t oldmask; 1191 int couldsleep; /* might sleep after current pass. */
1192 int couldsleepnext = 0; /* might sleep after next pass. */
1003 int cpu; 1193 int cpu;
1194 unsigned long flags;
1195 struct rcu_data *rdp;
1196 int ret;
1004 1197
1005 if (sched_getaffinity(0, &oldmask) < 0) 1198 /*
1006 oldmask = cpu_possible_map; 1199 * Each pass through the following loop handles one
1007 for_each_online_cpu(cpu) { 1200 * rcu_sched grace period cycle.
1008 sched_setaffinity(0, &cpumask_of_cpu(cpu)); 1201 */
1009 schedule(); 1202 do {
1010 } 1203 /* Save each CPU's current state. */
1011 sched_setaffinity(0, &oldmask); 1204
1205 for_each_online_cpu(cpu) {
1206 dyntick_save_progress_counter_sched(cpu);
1207 save_qsctr_sched(cpu);
1208 }
1209
1210 /*
1211 * Sleep for about an RCU grace-period's worth to
1212 * allow better batching and to consume less CPU.
1213 */
1214 schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME);
1215
1216 /*
1217 * If there was nothing to do last time, prepare to
1218 * sleep at the end of the current grace period cycle.
1219 */
1220 couldsleep = couldsleepnext;
1221 couldsleepnext = 1;
1222 if (couldsleep) {
1223 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1224 rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep;
1225 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1226 }
1227
1228 /*
1229 * Wait on each CPU in turn to have either visited
1230 * a quiescent state or been in dynticks-idle mode.
1231 */
1232 for_each_online_cpu(cpu) {
1233 while (rcu_qsctr_inc_needed(cpu) &&
1234 rcu_qsctr_inc_needed_dyntick(cpu)) {
1235 /* resched_cpu(cpu); @@@ */
1236 schedule_timeout_interruptible(1);
1237 }
1238 }
1239
1240 /* Advance callbacks for each CPU. */
1241
1242 for_each_online_cpu(cpu) {
1243
1244 rdp = RCU_DATA_CPU(cpu);
1245 spin_lock_irqsave(&rdp->lock, flags);
1246
1247 /*
1248 * We are running on this CPU irq-disabled, so no
1249 * CPU can go offline until we re-enable irqs.
1250 * The current CPU might have already gone
1251 * offline (between the for_each_online_cpu and
1252 * the spin_lock_irqsave), but in that case all its
1253 * callback lists will be empty, so no harm done.
1254 *
1255 * Advance the callbacks! We share normal RCU's
1256 * donelist, since callbacks are invoked the
1257 * same way in either case.
1258 */
1259 if (rdp->waitschedlist != NULL) {
1260 *rdp->donetail = rdp->waitschedlist;
1261 rdp->donetail = rdp->waitschedtail;
1262
1263 /*
1264 * Next rcu_check_callbacks() will
1265 * do the required raise_softirq().
1266 */
1267 }
1268 if (rdp->nextschedlist != NULL) {
1269 rdp->waitschedlist = rdp->nextschedlist;
1270 rdp->waitschedtail = rdp->nextschedtail;
1271 couldsleep = 0;
1272 couldsleepnext = 0;
1273 } else {
1274 rdp->waitschedlist = NULL;
1275 rdp->waitschedtail = &rdp->waitschedlist;
1276 }
1277 rdp->nextschedlist = NULL;
1278 rdp->nextschedtail = &rdp->nextschedlist;
1279
1280 /* Mark sleep intention. */
1281
1282 rdp->rcu_sched_sleeping = couldsleep;
1283
1284 spin_unlock_irqrestore(&rdp->lock, flags);
1285 }
1286
1287 /* If we saw callbacks on the last scan, go deal with them. */
1288
1289 if (!couldsleep)
1290 continue;
1291
1292 /* Attempt to block... */
1293
1294 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1295 if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) {
1296
1297 /*
1298 * Someone posted a callback after we scanned.
1299 * Go take care of it.
1300 */
1301 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1302 couldsleepnext = 0;
1303 continue;
1304 }
1305
1306 /* Block until the next person posts a callback. */
1307
1308 rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
1309 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1310 ret = 0;
1311 __wait_event_interruptible(rcu_ctrlblk.sched_wq,
1312 rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
1313 ret);
1314
1315 /*
1316 * Signals would prevent us from sleeping, and we cannot
1317 * do much with them in any case. So flush them.
1318 */
1319 if (ret)
1320 flush_signals(current);
1321 couldsleepnext = 0;
1322
1323 } while (!kthread_should_stop());
1324
1325 return (0);
1012} 1326}
1013EXPORT_SYMBOL_GPL(__synchronize_sched);
1014 1327
1015/* 1328/*
1016 * Check to see if any future RCU-related work will need to be done 1329 * Check to see if any future RCU-related work will need to be done
@@ -1027,7 +1340,9 @@ int rcu_needs_cpu(int cpu)
1027 1340
1028 return (rdp->donelist != NULL || 1341 return (rdp->donelist != NULL ||
1029 !!rdp->waitlistcount || 1342 !!rdp->waitlistcount ||
1030 rdp->nextlist != NULL); 1343 rdp->nextlist != NULL ||
1344 rdp->nextschedlist != NULL ||
1345 rdp->waitschedlist != NULL);
1031} 1346}
1032 1347
1033int rcu_pending(int cpu) 1348int rcu_pending(int cpu)
@@ -1038,7 +1353,9 @@ int rcu_pending(int cpu)
1038 1353
1039 if (rdp->donelist != NULL || 1354 if (rdp->donelist != NULL ||
1040 !!rdp->waitlistcount || 1355 !!rdp->waitlistcount ||
1041 rdp->nextlist != NULL) 1356 rdp->nextlist != NULL ||
1357 rdp->nextschedlist != NULL ||
1358 rdp->waitschedlist != NULL)
1042 return 1; 1359 return 1;
1043 1360
1044 /* The RCU core needs an acknowledgement from this CPU. */ 1361 /* The RCU core needs an acknowledgement from this CPU. */
@@ -1105,6 +1422,11 @@ void __init __rcu_init(void)
1105 rdp->donetail = &rdp->donelist; 1422 rdp->donetail = &rdp->donelist;
1106 rdp->rcu_flipctr[0] = 0; 1423 rdp->rcu_flipctr[0] = 0;
1107 rdp->rcu_flipctr[1] = 0; 1424 rdp->rcu_flipctr[1] = 0;
1425 rdp->nextschedlist = NULL;
1426 rdp->nextschedtail = &rdp->nextschedlist;
1427 rdp->waitschedlist = NULL;
1428 rdp->waitschedtail = &rdp->waitschedlist;
1429 rdp->rcu_sched_sleeping = 0;
1108 } 1430 }
1109 register_cpu_notifier(&rcu_nb); 1431 register_cpu_notifier(&rcu_nb);
1110 1432
@@ -1127,11 +1449,15 @@ void __init __rcu_init(void)
1127} 1449}
1128 1450
1129/* 1451/*
1130 * Deprecated, use synchronize_rcu() or synchronize_sched() instead. 1452 * Late-boot-time RCU initialization that must wait until after scheduler
1453 * has been initialized.
1131 */ 1454 */
1132void synchronize_kernel(void) 1455void __init rcu_init_sched(void)
1133{ 1456{
1134 synchronize_rcu(); 1457 rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period,
1458 NULL,
1459 "rcu_sched_grace_period");
1460 WARN_ON(IS_ERR(rcu_sched_grace_period_task));
1135} 1461}
1136 1462
1137#ifdef CONFIG_RCU_TRACE 1463#ifdef CONFIG_RCU_TRACE