From 9b2e4f1880b789be1f24f9684f7a54b90310b5c0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 30 Sep 2011 12:10:22 -0700 Subject: rcu: Track idleness independent of idle tasks Earlier versions of RCU used the scheduling-clock tick to detect idleness by checking for the idle task, but handled idleness differently for CONFIG_NO_HZ=y. But there are now a number of uses of RCU read-side critical sections in the idle task, for example, for tracing. A more fine-grained detection of idleness is therefore required. This commit presses the old dyntick-idle code into full-time service, so that rcu_idle_enter(), previously known as rcu_enter_nohz(), is always invoked at the beginning of an idle loop iteration. Similarly, rcu_idle_exit(), previously known as rcu_exit_nohz(), is always invoked at the end of an idle-loop iteration. This allows the idle task to use RCU everywhere except between consecutive rcu_idle_enter() and rcu_idle_exit() calls, in turn allowing architecture maintainers to specify exactly where in the idle loop RCU may be used. Because some of the userspace upcall uses can result in what looks to RCU like half of an interrupt, it is not possible to expect that the irq_enter() and irq_exit() hooks will give exact counts. This patch therefore expands the ->dynticks_nesting counter to 64 bits and uses two separate bitfields to count process/idle transitions and interrupt entry/exit transitions. It is presumed that userspace upcalls do not happen in the idle loop or from usermode execution (though usermode might do a system call that results in an upcall). The counter is hard-reset on each process/idle transition, which keeps the interrupt entry/exit error from accumulating. Overflow is avoided by the 64-bitness of the ->dynticks_nesting counter. This commit also adds warnings if a non-idle task asks RCU to enter idle state (and these checks will need some adjustment before applying Frederic's OS-jitter patches (http://lkml.org/lkml/2011/10/7/246)). In addition, validation of ->dynticks and ->dynticks_nesting is added. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E.
McKenney Reviewed-by: Josh Triplett --- include/linux/hardirq.h | 21 --------------------- include/linux/rcupdate.h | 21 ++++----------------- include/linux/tick.h | 11 +++++++++-- include/trace/events/rcu.h | 10 ++++++---- 4 files changed, 19 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index f743883f769..bb7f3097185 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -139,20 +139,7 @@ static inline void account_system_vtime(struct task_struct *tsk) extern void account_system_vtime(struct task_struct *tsk); #endif -#if defined(CONFIG_NO_HZ) #if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU) -extern void rcu_enter_nohz(void); -extern void rcu_exit_nohz(void); - -static inline void rcu_irq_enter(void) -{ - rcu_exit_nohz(); -} - -static inline void rcu_irq_exit(void) -{ - rcu_enter_nohz(); -} static inline void rcu_nmi_enter(void) { @@ -163,17 +150,9 @@ static inline void rcu_nmi_exit(void) } #else -extern void rcu_irq_enter(void); -extern void rcu_irq_exit(void); extern void rcu_nmi_enter(void); extern void rcu_nmi_exit(void); #endif -#else -# define rcu_irq_enter() do { } while (0) -# define rcu_irq_exit() do { } while (0) -# define rcu_nmi_enter() do { } while (0) -# define rcu_nmi_exit() do { } while (0) -#endif /* #if defined(CONFIG_NO_HZ) */ /* * It is safe to do non-atomic ops on ->hardirq_context, diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 2cf4226ade7..cd1ad4b04c6 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -177,23 +177,10 @@ extern void rcu_sched_qs(int cpu); extern void rcu_bh_qs(int cpu); extern void rcu_check_callbacks(int cpu, int user); struct notifier_block; - -#ifdef CONFIG_NO_HZ - -extern void rcu_enter_nohz(void); -extern void rcu_exit_nohz(void); - -#else /* #ifdef CONFIG_NO_HZ */ - -static inline void rcu_enter_nohz(void) -{ -} - -static inline void rcu_exit_nohz(void) -{ -} - -#endif /* #else #ifdef CONFIG_NO_HZ */ +extern void rcu_idle_enter(void); +extern void rcu_idle_exit(void); +extern void rcu_irq_enter(void); +extern void rcu_irq_exit(void); /* * Infrastructure to implement the synchronize_() primitives in diff --git a/include/linux/tick.h b/include/linux/tick.h index b232ccc0ee2..ca40838fdfb 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -127,8 +127,15 @@ extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); # else -static inline void tick_nohz_stop_sched_tick(int inidle) { } -static inline void tick_nohz_restart_sched_tick(void) { } +static inline void tick_nohz_stop_sched_tick(int inidle) +{ + if (inidle) + rcu_idle_enter(); +} +static inline void tick_nohz_restart_sched_tick(void) +{ + rcu_idle_exit(); +} static inline ktime_t tick_nohz_get_sleep_length(void) { ktime_t len = { .tv64 = NSEC_PER_SEC/HZ }; diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 669fbd62ec2..e5771804c50 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -246,19 +246,21 @@ TRACE_EVENT(rcu_fqs, */ TRACE_EVENT(rcu_dyntick, - TP_PROTO(char *polarity), + TP_PROTO(char *polarity, int nesting), - TP_ARGS(polarity), + TP_ARGS(polarity, nesting), TP_STRUCT__entry( __field(char *, polarity) + __field(int, nesting) ), TP_fast_assign( __entry->polarity = polarity; + __entry->nesting = nesting; ), - TP_printk("%s", __entry->polarity) + TP_printk("%s 
%d", __entry->polarity, __entry->nesting) ); /* @@ -443,7 +445,7 @@ TRACE_EVENT(rcu_batch_end, #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks) do { } while (0) #define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0) -#define trace_rcu_dyntick(polarity) do { } while (0) +#define trace_rcu_dyntick(polarity, nesting) do { } while (0) #define trace_rcu_callback(rcuname, rhp, qlen) do { } while (0) #define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen) do { } while (0) #define trace_rcu_batch_start(rcuname, qlen, blimit) do { } while (0) -- cgit v1.2.2 From 91afaf300269aa99a4d646969b3258b74294ac4d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 2 Oct 2011 07:44:32 -0700 Subject: rcu: Add failure tracing to rcutorture Trace the rcutorture RCU accesses and dump the trace buffer when the first failure is detected. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 8 ++++++++ include/trace/events/rcu.h | 26 ++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index cd1ad4b04c6..8d315b013e3 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -51,6 +51,8 @@ extern int rcutorture_runnable; /* for sysctl */ #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) extern void rcutorture_record_test_transition(void); extern void rcutorture_record_progress(unsigned long vernum); +extern void do_trace_rcu_torture_read(char *rcutorturename, + struct rcu_head *rhp); #else static inline void rcutorture_record_test_transition(void) { @@ -58,6 +60,12 @@ static inline void rcutorture_record_test_transition(void) static inline void rcutorture_record_progress(unsigned long vernum) { } +#ifdef CONFIG_RCU_TRACE +extern void do_trace_rcu_torture_read(char *rcutorturename, + struct rcu_head *rhp); +#else +#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) +#endif #endif #define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b)) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index e5771804c50..172620a92b1 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -437,6 +437,31 @@ TRACE_EVENT(rcu_batch_end, __entry->rcuname, __entry->callbacks_invoked) ); +/* + * Tracepoint for rcutorture readers. The first argument is the name + * of the RCU flavor from rcutorture's viewpoint and the second argument + * is the callback address. 
+ */ +TRACE_EVENT(rcu_torture_read, + + TP_PROTO(char *rcutorturename, struct rcu_head *rhp), + + TP_ARGS(rcutorturename, rhp), + + TP_STRUCT__entry( + __field(char *, rcutorturename) + __field(struct rcu_head *, rhp) + ), + + TP_fast_assign( + __entry->rcutorturename = rcutorturename; + __entry->rhp = rhp; + ), + + TP_printk("%s torture read %p", + __entry->rcutorturename, __entry->rhp) +); + #else /* #ifdef CONFIG_RCU_TRACE */ #define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0) @@ -452,6 +477,7 @@ TRACE_EVENT(rcu_batch_end, #define trace_rcu_invoke_callback(rcuname, rhp) do { } while (0) #define trace_rcu_invoke_kfree_callback(rcuname, rhp, offset) do { } while (0) #define trace_rcu_batch_end(rcuname, callbacks_invoked) do { } while (0) +#define trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) #endif /* #else #ifdef CONFIG_RCU_TRACE */ -- cgit v1.2.2 From e6b80a3b0994ea6c3d876d72464f2debbfcfeb05 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Oct 2011 16:25:18 -0700 Subject: rcu: Detect illegal rcu dereference in extended quiescent state Report that none of the rcu read lock maps are held while in an RCU extended quiescent state (the section between rcu_idle_enter() and rcu_idle_exit()). This helps detect any use of rcu_dereference() and friends from within the section in idle where RCU is not allowed. This way we can guarantee an extended quiescent window where the CPU can be put in dyntick idle mode or can simply avoid being part of any global grace period completion while in the idle loop. Uses of RCU from such a mode are totally ignored by RCU, hence the importance of these checks. Signed-off-by: Frederic Weisbecker Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Lai Jiangshan Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 8d315b013e3..bf91fcfe181 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -228,6 +228,15 @@ static inline void destroy_rcu_head_on_stack(struct rcu_head *head) #ifdef CONFIG_DEBUG_LOCK_ALLOC +#ifdef CONFIG_PROVE_RCU +extern int rcu_is_cpu_idle(void); +#else /* !CONFIG_PROVE_RCU */ +static inline int rcu_is_cpu_idle(void) +{ + return 0; +} +#endif /* else !CONFIG_PROVE_RCU */ + extern struct lockdep_map rcu_lock_map; # define rcu_read_acquire() \ lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_) @@ -262,6 +271,8 @@ static inline int rcu_read_lock_held(void) { if (!debug_lockdep_rcu_enabled()) return 1; + if (rcu_is_cpu_idle()) + return 0; return lock_is_held(&rcu_lock_map); } @@ -285,6 +296,19 @@ extern int rcu_read_lock_bh_held(void); * * Check debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. + * + * Note that if the CPU is in the idle loop from an RCU point of + * view (ie: that we are in the section between rcu_idle_enter() and + * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU + * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs + * that are in such a section, considering these as in extended quiescent + * state, so such a CPU is effectively never in an RCU read-side critical + * section regardless of what RCU primitives it invokes. 
This state of + affairs is required --- we need to keep an RCU-free window in idle + where the CPU may possibly enter into low power mode. This way we can + notice an extended quiescent state to other CPUs that started a grace + period. Otherwise we would delay any grace period as long as we run in + the idle task. */ #ifdef CONFIG_PREEMPT_COUNT static inline int rcu_read_lock_sched_held(void) @@ -293,6 +317,8 @@ static inline int rcu_read_lock_sched_held(void) if (!debug_lockdep_rcu_enabled()) return 1; + if (rcu_is_cpu_idle()) + return 0; if (debug_locks) lockdep_opinion = lock_is_held(&rcu_sched_lock_map); return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); -- cgit v1.2.2 From 00f49e5729af602deb559b0cf293a00b625e8636 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Oct 2011 18:22:02 +0200 Subject: rcu: Warn when rcu_read_lock() is used in extended quiescent state We are currently able to detect uses of rcu_dereference_check() inside extended quiescent states (such as the RCU-free window in idle). But rcu_read_lock() and friends can be used without rcu_dereference(), so that the earlier commit checking for use of rcu_dereference() and friends while in RCU idle mode misses some error conditions. This commit therefore adds extended quiescent state checking to rcu_read_lock() and friends. Uses of RCU from within RCU-idle mode are totally ignored by RCU, hence the importance of these checks. Signed-off-by: Frederic Weisbecker Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Lai Jiangshan Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 52 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index bf91fcfe181..d201c155f70 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -237,21 +237,53 @@ static inline int rcu_is_cpu_idle(void) } #endif /* else !CONFIG_PROVE_RCU */ +static inline void rcu_lock_acquire(struct lockdep_map *map) +{ + WARN_ON_ONCE(rcu_is_cpu_idle()); + lock_acquire(map, 0, 0, 2, 1, NULL, _THIS_IP_); +} + +static inline void rcu_lock_release(struct lockdep_map *map) +{ + WARN_ON_ONCE(rcu_is_cpu_idle()); + lock_release(map, 1, _THIS_IP_); +} + extern struct lockdep_map rcu_lock_map; -# define rcu_read_acquire() \ - lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_) -# define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_) + +static inline void rcu_read_acquire(void) +{ + rcu_lock_acquire(&rcu_lock_map); +} + +static inline void rcu_read_release(void) +{ + rcu_lock_release(&rcu_lock_map); +} extern struct lockdep_map rcu_bh_lock_map; -# define rcu_read_acquire_bh() \ - lock_acquire(&rcu_bh_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_) -# define rcu_read_release_bh() lock_release(&rcu_bh_lock_map, 1, _THIS_IP_) + +static inline void rcu_read_acquire_bh(void) +{ + rcu_lock_acquire(&rcu_bh_lock_map); +} + +static inline void rcu_read_release_bh(void) +{ + rcu_lock_release(&rcu_bh_lock_map); +} extern struct lockdep_map rcu_sched_lock_map; + +static inline void rcu_read_acquire_sched(void) +{ + rcu_lock_acquire(&rcu_sched_lock_map); +} + +static inline void rcu_read_release_sched(void) +{ + 
rcu_lock_release(&rcu_sched_lock_map); +} extern int debug_lockdep_rcu_enabled(void); -- cgit v1.2.2 From d8ab29f8be918b34a1ccd174569a53f0eb04b0a5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 7 Oct 2011 18:22:03 +0200 Subject: rcu: Remove one layer of abstraction from PROVE_RCU checking Simplify things a bit by substituting the definitions of the single-line rcu_read_acquire(), rcu_read_release(), rcu_read_acquire_bh(), rcu_read_release_bh(), rcu_read_acquire_sched(), and rcu_read_release_sched() functions at their call points. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 53 ++++++++---------------------------------------- 1 file changed, 8 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index d201c155f70..5dd6fd8b320 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -250,41 +250,8 @@ static inline void rcu_lock_release(struct lockdep_map *map) } extern struct lockdep_map rcu_lock_map; - -static inline void rcu_read_acquire(void) -{ - rcu_lock_acquire(&rcu_lock_map); -} - -static inline void rcu_read_release(void) -{ - rcu_lock_release(&rcu_lock_map); -} - extern struct lockdep_map rcu_bh_lock_map; - -static inline void rcu_read_acquire_bh(void) -{ - rcu_lock_acquire(&rcu_bh_lock_map); -} - -static inline void rcu_read_release_bh(void) -{ - rcu_lock_release(&rcu_bh_lock_map); -} - extern struct lockdep_map rcu_sched_lock_map; - -static inline void rcu_read_acquire_sched(void) -{ - rcu_lock_acquire(&rcu_sched_lock_map); -} - -static inline void rcu_read_release_sched(void) -{ - rcu_lock_release(&rcu_sched_lock_map); -} - extern int debug_lockdep_rcu_enabled(void); /** @@ -364,12 +331,8 @@ static inline int rcu_read_lock_sched_held(void) #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -# define rcu_read_acquire() do { } while (0) -# define rcu_read_release() do { } while (0) -# define rcu_read_acquire_bh() do { } while (0) -# define rcu_read_release_bh() do { } while (0) -# define rcu_read_acquire_sched() do { } while (0) -# define rcu_read_release_sched() do { } while (0) +# define rcu_lock_acquire(a) do { } while (0) +# define rcu_lock_release(a) do { } while (0) static inline int rcu_read_lock_held(void) { @@ -690,7 +653,7 @@ static inline void rcu_read_lock(void) { __rcu_read_lock(); __acquire(RCU); - rcu_read_acquire(); + rcu_lock_acquire(&rcu_lock_map); } /* @@ -710,7 +673,7 @@ static inline void rcu_read_lock(void) */ static inline void rcu_read_unlock(void) { - rcu_read_release(); + rcu_lock_release(&rcu_lock_map); __release(RCU); __rcu_read_unlock(); } @@ -731,7 +694,7 @@ static inline void rcu_read_lock_bh(void) { local_bh_disable(); __acquire(RCU_BH); - rcu_read_acquire_bh(); + rcu_lock_acquire(&rcu_bh_lock_map); } /* @@ -741,7 +704,7 @@ static inline void rcu_read_lock_bh(void) */ static inline void rcu_read_unlock_bh(void) { - rcu_read_release_bh(); + rcu_lock_release(&rcu_bh_lock_map); __release(RCU_BH); local_bh_enable(); } @@ -758,7 +721,7 @@ static inline void rcu_read_lock_sched(void) { preempt_disable(); __acquire(RCU_SCHED); - rcu_read_acquire_sched(); + rcu_lock_acquire(&rcu_sched_lock_map); } /* Used by lockdep and tracing: cannot be traced, cannot call lockdep. 
*/ @@ -775,7 +738,7 @@ static inline notrace void rcu_read_lock_sched_notrace(void) */ static inline void rcu_read_unlock_sched(void) { - rcu_read_release_sched(); + rcu_lock_release(&rcu_sched_lock_map); __release(RCU_SCHED); preempt_enable(); } -- cgit v1.2.2 From ff195cb69ba8d2af9b891be3a26db95fe1999d43 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 7 Oct 2011 18:22:04 +0200 Subject: rcu: Warn when srcu_read_lock() is used in an extended quiescent state Catch SRCU up to the other variants of RCU by making PROVE_RCU complain if either srcu_read_lock() or srcu_read_lock_held() is used from within RCU-idle mode. Frederic reworked this to allow for the new versions of his patches that check for extended quiescent states. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/srcu.h | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 58971e891f4..4e0a3d41dae 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -28,6 +28,7 @@ #define _LINUX_SRCU_H #include +#include struct srcu_struct_array { int c[2]; }; @@ -60,18 +61,10 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name, __init_srcu_struct((sp), #sp, &__srcu_key); \ }) -# define srcu_read_acquire(sp) \ - lock_acquire(&(sp)->dep_map, 0, 0, 2, 1, NULL, _THIS_IP_) -# define srcu_read_release(sp) \ - lock_release(&(sp)->dep_map, 1, _THIS_IP_) - #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ int init_srcu_struct(struct srcu_struct *sp); -# define srcu_read_acquire(sp) do { } while (0) -# define srcu_read_release(sp) do { } while (0) - #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ void cleanup_srcu_struct(struct srcu_struct *sp); @@ -90,12 +83,29 @@ long srcu_batches_completed(struct srcu_struct *sp); * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, * this assumes we are in an SRCU read-side critical section unless it can * prove otherwise. + * + * Note that if the CPU is in the idle loop from an RCU point of view + * (ie: that we are in the section between rcu_idle_enter() and + * rcu_idle_exit()) then srcu_read_lock_held() returns false even if + * the CPU did an srcu_read_lock(). The reason for this is that RCU + * ignores CPUs that are in such a section, considering these as in + * extended quiescent state, so such a CPU is effectively never in an + * RCU read-side critical section regardless of what RCU primitives it + * invokes. This state of affairs is required --- we need to keep an + * RCU-free window in idle where the CPU may possibly enter into low + * power mode. This way we can notice an extended quiescent state to + * other CPUs that started a grace period. Otherwise we would delay any + * grace period as long as we run in the idle task. 
*/ static inline int srcu_read_lock_held(struct srcu_struct *sp) { - if (debug_locks) - return lock_is_held(&sp->dep_map); - return 1; + if (rcu_is_cpu_idle()) + return 0; + + if (!debug_locks) + return 1; + + return lock_is_held(&sp->dep_map); } #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ @@ -150,7 +160,7 @@ static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp) { int retval = __srcu_read_lock(sp); - srcu_read_acquire(sp); + rcu_lock_acquire(&(sp)->dep_map); return retval; } @@ -164,7 +174,7 @@ static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp) static inline void srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp) { - srcu_read_release(sp); + rcu_lock_release(&(sp)->dep_map); __srcu_read_unlock(sp, idx); } -- cgit v1.2.2 From 867f236bd12f5091df6dc7cc75f94d7fd982d78a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 7 Oct 2011 18:22:05 +0200 Subject: rcu: Make srcu_read_lock_held() call common lockdep-enabled function A common debug_lockdep_rcu_enabled() function is used to check whether RCU lockdep splats should be reported, but srcu_read_lock_held() does not use it. This commit therefore brings srcu_read_lock_held() up to date. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/srcu.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 4e0a3d41dae..d4b12443b2e 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -84,6 +84,9 @@ long srcu_batches_completed(struct srcu_struct *sp); * this assumes we are in an SRCU read-side critical section unless it can * prove otherwise. * + * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot + * and while lockdep is disabled. + * * Note that if the CPU is in the idle loop from an RCU point of view * (ie: that we are in the section between rcu_idle_enter() and * rcu_idle_exit()) then srcu_read_lock_held() returns false even if @@ -102,7 +105,7 @@ static inline int srcu_read_lock_held(struct srcu_struct *sp) if (rcu_is_cpu_idle()) return 0; - if (!debug_locks) + if (!debug_lockdep_rcu_enabled()) return 1; return lock_is_held(&sp->dep_map); -- cgit v1.2.2 From 280f06774afedf849f0b34248ed6aff57d0f6908 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Oct 2011 18:22:06 +0200 Subject: nohz: Separate out irq exit and idle loop dyntick logic The tick_nohz_stop_sched_tick() function, which tries to delay the next timer tick as long as possible, can be called from two places: - From the idle loop to start the dyntick idle mode - From interrupt exit if we have interrupted the dyntick idle mode, so that we reprogram the next tick event in case the irq changed some internal state that requires this action. There are only a few minor differences between the two, handled by that function and driven by the per-cpu ts->inidle variable and the inidle parameter. Together these guarantee that we only update the dyntick mode on irq exit if we actually interrupted the dyntick idle mode, and that we enter in RCU extended quiescent state from idle loop entry only. Split this function into: - tick_nohz_idle_enter(), which sets ts->inidle to 1, enters dynticks idle mode unconditionally if it can, and enters into RCU extended quiescent state. - tick_nohz_irq_exit() which only updates the dynticks idle mode when ts->inidle is set (ie: if tick_nohz_idle_enter() has been called). A sketch of the resulting idle-loop usage follows. 
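As a rough sketch (not part of the patch itself), an architecture's idle loop would then follow the pattern below; arch_cpu_sleep() is a hypothetical stand-in for the architecture's low-power wait instruction, and the interrupt-exit path is assumed to call tick_nohz_irq_exit() on its own:

static void cpu_idle_loop(void)
{
	while (1) {
		tick_nohz_idle_enter();		/* stop the tick, enter RCU extended QS */
		while (!need_resched())
			arch_cpu_sleep();	/* irqs wake us; irq exit reprograms the tick */
		tick_nohz_idle_exit();		/* restart the tick, exit RCU extended QS */
		schedule();
	}
}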
To maintain symmetry, tick_nohz_restart_sched_tick() has been renamed into tick_nohz_idle_exit(). This simplifies the code and micro-optimizes the irq exit path (no need for local_irq_save there). This also prepares for the split between dynticks and rcu extended quiescent state logic. We'll need this split to further fix illegal uses of RCU in extended quiescent states in the idle loop. Signed-off-by: Frederic Weisbecker Cc: Mike Frysinger Cc: Guan Xuetao Cc: David Miller Cc: Chris Metcalf Cc: Hans-Christian Egtvedt Cc: Ralf Baechle Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Russell King Cc: Paul Mackerras Cc: Heiko Carstens Cc: Paul Mundt Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/tick.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/tick.h b/include/linux/tick.h index ca40838fdfb..0df1d50a408 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -121,21 +121,22 @@ static inline int tick_oneshot_mode_active(void) { return 0; } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ # ifdef CONFIG_NO_HZ -extern void tick_nohz_stop_sched_tick(int inidle); -extern void tick_nohz_restart_sched_tick(void); +extern void tick_nohz_idle_enter(void); +extern void tick_nohz_idle_exit(void); +extern void tick_nohz_irq_exit(void); extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); # else -static inline void tick_nohz_stop_sched_tick(int inidle) +static inline void tick_nohz_idle_enter(void) { - if (inidle) - rcu_idle_enter(); + rcu_idle_enter(); } -static inline void tick_nohz_restart_sched_tick(void) +static inline void tick_nohz_idle_exit(void) { rcu_idle_exit(); } + static inline ktime_t tick_nohz_get_sleep_length(void) { ktime_t len = { .tv64 = NSEC_PER_SEC/HZ }; -- cgit v1.2.2 From 2bbb6817c0ac1b5f2a68d720f364f98eeb1ac4fd Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Oct 2011 16:01:00 +0200 Subject: nohz: Allow rcu extended quiescent state handling separately from tick stop It is assumed that rcu won't be used once we switch to tickless mode and until we restart the tick. However this is not always true, as in x86-64 where we dereference the idle notifiers after the tick is stopped. To prepare for fixing this, add two new APIs: tick_nohz_idle_enter_norcu() and tick_nohz_idle_exit_norcu(). If no use of RCU is made in the idle loop between tick_nohz_idle_enter() and tick_nohz_idle_exit() calls, the arch must instead call the new *_norcu() version such that the arch doesn't need to call rcu_idle_enter() and rcu_idle_exit(). Otherwise the arch must call tick_nohz_idle_enter() and tick_nohz_idle_exit() and also call explicitly: - rcu_idle_enter() after its last use of RCU before the CPU is put to sleep. - rcu_idle_exit() before the first use of RCU after the CPU is woken up. Signed-off-by: Frederic Weisbecker Cc: Mike Frysinger Cc: Guan Xuetao Cc: David Miller Cc: Chris Metcalf Cc: Hans-Christian Egtvedt Cc: Ralf Baechle Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Russell King Cc: Paul Mackerras Cc: Heiko Carstens Cc: Paul Mundt Signed-off-by: Paul E. 
McKenney --- include/linux/tick.h | 46 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/tick.h b/include/linux/tick.h index 0df1d50a408..327434a0575 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -7,6 +7,7 @@ #define _LINUX_TICK_H #include +#include #ifdef CONFIG_GENERIC_CLOCKEVENTS @@ -121,18 +122,57 @@ static inline int tick_oneshot_mode_active(void) { return 0; } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ # ifdef CONFIG_NO_HZ -extern void tick_nohz_idle_enter(void); +extern void __tick_nohz_idle_enter(void); +static inline void tick_nohz_idle_enter(void) +{ + local_irq_disable(); + __tick_nohz_idle_enter(); + local_irq_enable(); +} extern void tick_nohz_idle_exit(void); + +/* + * Call this pair of function if the arch doesn't make any use + * of RCU in-between. You won't need to call rcu_idle_enter() and + * rcu_idle_exit(). + * Otherwise you need to call tick_nohz_idle_enter() and tick_nohz_idle_exit() + * and explicitly tell RCU about the window around the place the CPU enters low + * power mode where no RCU use is made. This is done by calling rcu_idle_enter() + * after the last use of RCU before the CPU is put to sleep and by calling + * rcu_idle_exit() before the first use of RCU after the CPU woke up. + */ +static inline void tick_nohz_idle_enter_norcu(void) +{ + /* + * Also call rcu_idle_enter() in the irq disabled section even + * if it disables irq itself. + * Just an optimization that prevents from an interrupt happening + * between it and __tick_nohz_idle_enter() to lose time to help + * completing a grace period while we could be in extended grace + * period already. + */ + local_irq_disable(); + __tick_nohz_idle_enter(); + rcu_idle_enter(); + local_irq_enable(); +} +static inline void tick_nohz_idle_exit_norcu(void) +{ + rcu_idle_exit(); + tick_nohz_idle_exit(); +} extern void tick_nohz_irq_exit(void); extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); # else -static inline void tick_nohz_idle_enter(void) +static inline void tick_nohz_idle_enter(void) { } +static inline void tick_nohz_idle_exit(void) { } +static inline void tick_nohz_idle_enter_norcu(void) { rcu_idle_enter(); } -static inline void tick_nohz_idle_exit(void) +static inline void tick_nohz_idle_exit_norcu(void) { rcu_idle_exit(); } -- cgit v1.2.2 From 0c53dd8b31404c1d7fd15be8f065ebaec615a562 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 9 Oct 2011 15:13:11 -0700 Subject: rcu: Introduce raw SRCU read-side primitives The RCU implementations, including SRCU, are designed to be used in a lock-like fashion, so that the read-side lock and unlock primitives must execute in the same context for any given read-side critical section. This constraint is enforced by lockdep-RCU. However, there is a need to enter an SRCU read-side critical section within the context of an exception and then exit in the context of the task that encountered the exception. The cost of this capability is that the read-side operations incur the overhead of disabling interrupts. Note that although the current implementation allows a given read-side critical section to be entered by one task and then exited by another, all known possible implementations that allow this have scalability problems. 
Therefore, a given read-side critical section must be exited by the same task that entered it, though perhaps from an interrupt or exception handler running within that task's context. But if you are thinking in terms of interrupt handlers, make sure that you have considered the possibility of threaded interrupt handlers. Credit goes to Peter Zijlstra for suggesting use of the existing _raw suffix to indicate disabling lockdep over the earlier "bulkref" names. Requested-by: Srikar Dronamraju Signed-off-by: Paul E. McKenney Tested-by: Srikar Dronamraju --- include/linux/srcu.h | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'include') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index d4b12443b2e..1eb520cd168 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -181,4 +181,47 @@ static inline void srcu_read_unlock(struct srcu_struct *sp, int idx) __srcu_read_unlock(sp, idx); } +/** + * srcu_read_lock_raw - register a new reader for an SRCU-protected structure. + * @sp: srcu_struct in which to register the new reader. + * + * Enter an SRCU read-side critical section. Similar to srcu_read_lock(), + * but avoids the RCU-lockdep checking. This means that it is legal to + * use srcu_read_lock_raw() in one context, for example, in an exception + * handler, and then have the matching srcu_read_unlock_raw() in another + * context, for example in the task that took the exception. + * + * However, the entire SRCU read-side critical section must reside within a + * single task. For example, beware of using srcu_read_lock_raw() in + * a device interrupt handler and srcu_read_unlock() in the interrupted + * task: This will not work if interrupts are threaded. + */ +static inline int srcu_read_lock_raw(struct srcu_struct *sp) +{ + unsigned long flags; + int ret; + + local_irq_save(flags); + ret = __srcu_read_lock(sp); + local_irq_restore(flags); + return ret; +} + +/** + * srcu_read_unlock_raw - unregister reader from an SRCU-protected structure. + * @sp: srcu_struct in which to unregister the old reader. + * @idx: return value from corresponding srcu_read_lock_raw(). + * + * Exit an SRCU read-side critical section without lockdep-RCU checking. + * See srcu_read_lock_raw() for more details. + */ +static inline void srcu_read_unlock_raw(struct srcu_struct *sp, int idx) +{ + unsigned long flags; + + local_irq_save(flags); + __srcu_read_unlock(sp, idx); + local_irq_restore(flags); +} + #endif -- cgit v1.2.2 From 4145fa7fbee3ec1e61c52825b146192885d9759f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 31 Oct 2011 15:01:54 -0700 Subject: rcu: Deconfuse dynticks entry-exit tracing The trace_rcu_dyntick() trace event did not print both the old and the new value of the nesting level, and furthermore printed only the low-order 32 bits of it. This could result in some confusion when interpreting trace-event dumps, so this commit prints both the old and the new value, prints the full 64 bits, and also selects the process-entry/exit increment to print nicely in hexadecimal. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- include/trace/events/rcu.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 172620a92b1..c29fb2f5590 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -246,21 +246,24 @@ TRACE_EVENT(rcu_fqs, */ TRACE_EVENT(rcu_dyntick, - TP_PROTO(char *polarity, int nesting), + TP_PROTO(char *polarity, long long oldnesting, long long newnesting), - TP_ARGS(polarity, nesting), + TP_ARGS(polarity, oldnesting, newnesting), TP_STRUCT__entry( __field(char *, polarity) - __field(int, nesting) + __field(long long, oldnesting) + __field(long long, newnesting) ), TP_fast_assign( __entry->polarity = polarity; - __entry->nesting = nesting; + __entry->oldnesting = oldnesting; + __entry->newnesting = newnesting; ), - TP_printk("%s %d", __entry->polarity, __entry->nesting) + TP_printk("%s %llx %llx", __entry->polarity, + __entry->oldnesting, __entry->newnesting) ); /* @@ -470,7 +473,7 @@ TRACE_EVENT(rcu_batch_end, #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks) do { } while (0) #define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0) -#define trace_rcu_dyntick(polarity, nesting) do { } while (0) +#define trace_rcu_dyntick(polarity, oldnesting, newnesting) do { } while (0) #define trace_rcu_callback(rcuname, rhp, qlen) do { } while (0) #define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen) do { } while (0) #define trace_rcu_batch_start(rcuname, qlen, blimit) do { } while (0) -- cgit v1.2.2 From c4f3060843506ba6d473ab9a0afe5bd5dc93a00d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Nov 2011 12:41:56 -0800 Subject: sched: Add is_idle_task() to handle invalidated uses of idle_cpu() Commit 908a3283 (Fix idle_cpu()) invalidated some uses of idle_cpu(), which used to say whether or not the CPU was running the idle task, but now instead says whether or not the CPU is running the idle task in the absence of pending wakeups. Although this new implementation gives a better answer to the question "is this CPU idle?", it also invalidates other uses that were made of idle_cpu(). This commit therefore introduces a new is_idle_task() API member that determines whether or not the specified task is one of the idle tasks, allowing open-coded "->pid == 0" sequences to be replaced by something more meaningful. Suggested-by: Josh Triplett Suggested-by: Peter Zijlstra Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/linux/sched.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1c4f3e9b9bc..4a7e4d333a2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2070,6 +2070,14 @@ extern int sched_setscheduler(struct task_struct *, int, extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); extern struct task_struct *idle_task(int cpu); +/** + * is_idle_task - is the specified task an idle task? + * @p: the task in question. 
+ */ +static inline bool is_idle_task(struct task_struct *p) +{ + return p->pid == 0; +} extern struct task_struct *curr_task(int cpu); extern void set_curr_task(int cpu, struct task_struct *p); -- cgit v1.2.2 From 1268fbc746ea1cd279886a740dcbad4ba5232225 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 17 Nov 2011 18:48:14 +0100 Subject: nohz: Remove tick_nohz_idle_enter_norcu() / tick_nohz_idle_exit_norcu() Those two APIs were provided to optimize the calls of tick_nohz_idle_enter() and rcu_idle_enter() into a single irq disabled section. This way no interrupt happening in-between would needlessly process any RCU job. Now we are talking about an optimization whose benefits have yet to be measured. Let's start simple and completely decouple the idle RCU and dyntick-idle logic to keep things simple. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Reviewed-by: Josh Triplett Signed-off-by: Paul E. McKenney --- include/linux/tick.h | 47 +----------------------------------------------- 1 file changed, 1 insertion(+), 46 deletions(-) (limited to 'include') diff --git a/include/linux/tick.h b/include/linux/tick.h index 327434a0575..ab8be90b5cc 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -122,45 +122,8 @@ static inline int tick_oneshot_mode_active(void) { return 0; } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ # ifdef CONFIG_NO_HZ -extern void __tick_nohz_idle_enter(void); -static inline void tick_nohz_idle_enter(void) -{ - local_irq_disable(); - __tick_nohz_idle_enter(); - local_irq_enable(); -} +extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); - -/* - * Call this pair of function if the arch doesn't make any use - * of RCU in-between. You won't need to call rcu_idle_enter() and - * rcu_idle_exit(). - * Otherwise you need to call tick_nohz_idle_enter() and tick_nohz_idle_exit() - * and explicitly tell RCU about the window around the place the CPU enters low - * power mode where no RCU use is made. This is done by calling rcu_idle_enter() - * after the last use of RCU before the CPU is put to sleep and by calling - * rcu_idle_exit() before the first use of RCU after the CPU woke up. - */ -static inline void tick_nohz_idle_enter_norcu(void) -{ - /* - * Also call rcu_idle_enter() in the irq disabled section even - * if it disables irq itself. - * Just an optimization that prevents from an interrupt happening - * between it and __tick_nohz_idle_enter() to lose time to help - * completing a grace period while we could be in extended grace - * period already. - */ - local_irq_disable(); - __tick_nohz_idle_enter(); - rcu_idle_enter(); - local_irq_enable(); -} -static inline void tick_nohz_idle_exit_norcu(void) -{ - rcu_idle_exit(); - tick_nohz_idle_exit(); -} extern void tick_nohz_irq_exit(void); extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); @@ -168,14 +131,6 @@ extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); # else static inline void tick_nohz_idle_enter(void) { } static inline void tick_nohz_idle_exit(void) { } -static inline void tick_nohz_idle_enter_norcu(void) -{ - rcu_idle_enter(); -} -static inline void tick_nohz_idle_exit_norcu(void) -{ - rcu_idle_exit(); -} static inline ktime_t tick_nohz_get_sleep_length(void) { -- cgit v1.2.2 From 045fb9315a2129023d70a0eecf0942e18fca4fcd Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 22 Nov 2011 12:13:03 -0800 Subject: rcu: Update trace_rcu_dyntick() header comment This commit updates the trace_rcu_dyntick() header comment to reflect events added by commit 4b4f421. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index c29fb2f5590..7f6877a3505 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -241,8 +241,16 @@ TRACE_EVENT(rcu_fqs, /* * Tracepoint for dyntick-idle entry/exit events. These take a string - * as argument: "Start" for entering dyntick-idle mode and "End" for - * leaving it. + * as argument: "Start" for entering dyntick-idle mode, "End" for + * leaving it, "--=" for events moving towards idle, and "++=" for events + * moving away from idle. "Error on entry: not idle task" and "Error on + * exit: not idle task" indicate that a non-idle task is erroneously + * toying with the idle loop. + * + * These events also take a pair of numbers, which indicate the nesting + * depth before and after the event of interest. Note that task-related + * events use the upper bits of each number, while interrupt-related + * events use the lower bits. */ TRACE_EVENT(rcu_dyntick, -- cgit v1.2.2 From 433cdddcd9ac5558068edd7f8d4707a70f7710f5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 Nov 2011 14:58:03 -0800 Subject: rcu: Add tracing for RCU_FAST_NO_HZ This commit adds trace_rcu_prep_idle(), which is invoked from rcu_prepare_for_idle() and rcu_wake_cpu() to trace attempts on the part of RCU to force CPUs into dyntick-idle mode. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 7f6877a3505..debe453c962 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -274,6 +274,42 @@ TRACE_EVENT(rcu_dyntick, __entry->oldnesting, __entry->newnesting) ); +/* + * Tracepoint for RCU preparation for idle, the goal being to get RCU + * processing done so that the current CPU can shut off its scheduling + * clock and enter dyntick-idle mode. One way to accomplish this is + * to drain all RCU callbacks from this CPU, and the other is to have + * done everything RCU requires for the current grace period. In this + * latter case, the CPU will be awakened at the end of the current grace + * period in order to process the remainder of its callbacks. + * + * These tracepoints take a string as argument: + * + * "No callbacks": Nothing to do, no callbacks on this CPU. + * "In holdoff": Nothing to do, holding off after unsuccessful attempt. + * "Dyntick with callbacks": Callbacks remain, but RCU doesn't need CPU. + * "Begin holdoff": Attempt failed, don't retry until next jiffy. + * "More callbacks": Still more callbacks, try again to clear them out. + * "Callbacks drained": All callbacks processed, off to dyntick idle! + * "CPU awakened at GP end": + */ +TRACE_EVENT(rcu_prep_idle, + + TP_PROTO(char *reason), + + TP_ARGS(reason), + + TP_STRUCT__entry( + __field(char *, reason) + ), + + TP_fast_assign( + __entry->reason = reason; + ), + + TP_printk("%s", __entry->reason) +); + /* * Tracepoint for the registration of a single RCU callback function. 
* The first argument is the type of RCU, the second argument is @@ -482,6 +518,7 @@ TRACE_EVENT(rcu_torture_read, #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks) do { } while (0) #define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0) #define trace_rcu_dyntick(polarity, oldnesting, newnesting) do { } while (0) +#define trace_rcu_prep_idle(reason) do { } while (0) #define trace_rcu_callback(rcuname, rhp, qlen) do { } while (0) #define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen) do { } while (0) #define trace_rcu_batch_start(rcuname, qlen, blimit) do { } while (0) -- cgit v1.2.2 From f535a607c13c7b674e0788ca5765779aa74a01c3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 Nov 2011 20:43:02 -0800 Subject: rcu: Eliminate RCU_FAST_NO_HZ grace-period hang With the new implementation of RCU_FAST_NO_HZ, it was possible to hang RCU grace periods as follows: o CPU 0 attempts to go idle, cycles several times through the rcu_prepare_for_idle() loop, then goes dyntick-idle when RCU needs nothing more from it, while still having at least one RCU callback pending. o CPU 1 goes idle with no callbacks. Both CPUs can then stay in dyntick-idle mode indefinitely, preventing the RCU grace period from ever completing, possibly hanging the system. This commit therefore prevents CPUs that have RCU callbacks from entering dyntick-idle mode. This approach also eliminates the need for the end-of-grace-period IPIs used previously. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index debe453c962..8dd6fcb9494 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -287,7 +287,6 @@ TRACE_EVENT(rcu_dyntick, * * "No callbacks": Nothing to do, no callbacks on this CPU. * "In holdoff": Nothing to do, holding off after unsuccessful attempt. - * "Dyntick with callbacks": Callbacks remain, but RCU doesn't need CPU. * "Begin holdoff": Attempt failed, don't retry until next jiffy. * "More callbacks": Still more callbacks, try again to clear them out. * "Callbacks drained": All callbacks processed, off to dyntick idle! -- cgit v1.2.2 From 3842a0832a1d6eb0b31421f8810a813135967512 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Nov 2011 10:42:42 -0800 Subject: rcu: Document same-context read-side constraints The intent is that a given RCU read-side critical section be confined to a single context. For example, it is illegal to invoke rcu_read_lock() in an exception handler and then invoke rcu_read_unlock() from the context of the task that received the exception. Suggested-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 15 +++++++++++++++ include/linux/srcu.h | 5 +++++ 2 files changed, 20 insertions(+) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 5dd6fd8b320..81c04f4348e 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -265,6 +265,11 @@ extern int debug_lockdep_rcu_enabled(void); * * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. 
+ * + * Note that rcu_read_lock() and the matching rcu_read_unlock() must + * occur in the same context, for example, it is illegal to invoke + * rcu_read_unlock() in process context if the matching rcu_read_lock() + * was invoked from within an irq handler. */ static inline int rcu_read_lock_held(void) { @@ -689,6 +694,11 @@ static inline void rcu_read_unlock(void) * critical sections in interrupt context can use just rcu_read_lock(), * though this should at least be commented to avoid confusing people * reading the code. + * + * Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh() + * must occur in the same context, for example, it is illegal to invoke + * rcu_read_unlock_bh() from one task if the matching rcu_read_lock_bh() + * was invoked from some other task. */ static inline void rcu_read_lock_bh(void) { @@ -716,6 +726,11 @@ static inline void rcu_read_unlock_bh(void) * are being done using call_rcu_sched() or synchronize_rcu_sched(). * Read-side critical sections can also be introduced by anything that * disables preemption, including local_irq_disable() and friends. + * + * Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched() + * must occur in the same context, for example, it is illegal to invoke + * rcu_read_unlock_sched() from process context if the matching + * rcu_read_lock_sched() was invoked from an NMI handler. */ static inline void rcu_read_lock_sched(void) { diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 1eb520cd168..e1b005918bb 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -158,6 +158,11 @@ static inline int srcu_read_lock_held(struct srcu_struct *sp) * one way to indirectly wait on an SRCU grace period is to acquire * a mutex that is held elsewhere while calling synchronize_srcu() or * synchronize_srcu_expedited(). + * + * Note that srcu_read_lock() and the matching srcu_read_unlock() must + * occur in the same context, for example, it is illegal to invoke + * srcu_read_unlock() in an irq handler if the matching srcu_read_lock() + * was invoked in process context. */ static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp) { -- cgit v1.2.2 From 7cb92499000e3c86dae653077b1465458a039ef6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Nov 2011 12:28:34 -0800 Subject: rcu: Permit dyntick-idle with callbacks pending The current implementation of RCU_FAST_NO_HZ prevents CPUs from entering dyntick-idle state if they have RCU callbacks pending. Unfortunately, this has the side-effect of often preventing them from entering this state, especially if at least one other CPU is not in dyntick-idle state. However, the resulting per-tick wakeup is wasteful in many cases: if the CPU has already fully responded to the current RCU grace period, there will be nothing for it to do until this grace period ends, which will frequently take several jiffies. This commit therefore permits a CPU that has done everything that the current grace period has asked of it (rcu_pending() == 0) to enter dyntick-idle mode even if it still has RCU callbacks pending. However, such a CPU posts a timer to wake it up several jiffies later (6 jiffies, based on experience with grace-period lengths). This wakeup is required to handle situations that can result in all CPUs being in dyntick-idle mode, thus failing to ever complete the current grace period. If a CPU wakes up before the timer goes off, then it cancels that timer, thus avoiding spurious wakeups. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney --- include/trace/events/rcu.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 8dd6fcb9494..c75418c3ccb 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -288,9 +288,10 @@ TRACE_EVENT(rcu_dyntick, * "No callbacks": Nothing to do, no callbacks on this CPU. * "In holdoff": Nothing to do, holding off after unsuccessful attempt. * "Begin holdoff": Attempt failed, don't retry until next jiffy. + * "Dyntick with callbacks": Entering dyntick-idle despite callbacks. * "More callbacks": Still more callbacks, try again to clear them out. * "Callbacks drained": All callbacks processed, off to dyntick idle! - * "CPU awakened at GP end": + * "Timer": Timer fired to cause CPU to continue processing callbacks. */ TRACE_EVENT(rcu_prep_idle, -- cgit v1.2.2 From 2987557f52b97f679f0c324d8f51b8d66e1f2084 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sat, 3 Dec 2011 13:06:50 -0800 Subject: driver-core/cpu: Expose hotpluggability to the rest of the kernel When architectures register CPUs, they indicate whether the CPU allows hotplugging; notably, x86 and ARM don't allow hotplugging CPU 0. Userspace can easily query the hotpluggability of a CPU via sysfs; however, the kernel has no convenient way of accessing that property in an architecture-independent way. While the kernel can simply try it and see, some code needs to distinguish between "hotplug failed" and "hotplug has no hope of working on this CPU"; for example, rcutorture's CPU hotplug tests want to avoid drowning out real hotplug failures with expected failures. Expose this property via a new cpu_is_hotpluggable function, so that the rest of the kernel can access it in an architecture-independent way. Signed-off-by: Josh Triplett Signed-off-by: Paul E. McKenney --- include/linux/cpu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 6cb60fd2ea8..305c263021e 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -27,6 +27,7 @@ struct cpu { extern int register_cpu(struct cpu *cpu, int num); extern struct sys_device *get_cpu_sysdev(unsigned cpu); +extern bool cpu_is_hotpluggable(unsigned cpu); extern int cpu_add_sysdev_attr(struct sysdev_attribute *attr); extern void cpu_remove_sysdev_attr(struct sysdev_attribute *attr); -- cgit v1.2.2 From 4968c300e1fa5389fdf1f1ebd8b8e4aec9aa4a9e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Dec 2011 16:32:40 -0800 Subject: rcu: Augment rcu_batch_end tracing for idle and callback state The current rcu_batch_end event trace records only the name of the RCU flavor and the total number of callbacks that remain queued on the current CPU. This is insufficient for testing and tuning the new dyntick-idle RCU_FAST_NO_HZ code, so this commit adds idle state along with whether or not any of the callbacks that were ready to invoke at the beginning of rcu_do_batch() are still queued. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney --- include/trace/events/rcu.h | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index c75418c3ccb..d2d88bed891 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -461,27 +461,46 @@ TRACE_EVENT(rcu_invoke_kfree_callback, /* * Tracepoint for exiting rcu_do_batch after RCU callbacks have been - * invoked. The first argument is the name of the RCU flavor and - * the second argument is number of callbacks actually invoked. + * invoked. The first argument is the name of the RCU flavor, + * the second argument is number of callbacks actually invoked, + * the third argument (cb) is whether or not any of the callbacks that + * were ready to invoke at the beginning of this batch are still + * queued, the fourth argument (nr) is the return value of need_resched(), + * the fifth argument (iit) is 1 if the current task is the idle task, + * and the sixth argument (risk) is the return value from + * rcu_is_callbacks_kthread(). */ TRACE_EVENT(rcu_batch_end, - TP_PROTO(char *rcuname, int callbacks_invoked), + TP_PROTO(char *rcuname, int callbacks_invoked, + bool cb, bool nr, bool iit, bool risk), - TP_ARGS(rcuname, callbacks_invoked), + TP_ARGS(rcuname, callbacks_invoked, cb, nr, iit, risk), TP_STRUCT__entry( __field(char *, rcuname) __field(int, callbacks_invoked) + __field(bool, cb) + __field(bool, nr) + __field(bool, iit) + __field(bool, risk) ), TP_fast_assign( __entry->rcuname = rcuname; __entry->callbacks_invoked = callbacks_invoked; - ), - - TP_printk("%s CBs-invoked=%d", - __entry->rcuname, __entry->callbacks_invoked) + __entry->cb = cb; + __entry->nr = nr; + __entry->iit = iit; + __entry->risk = risk; + ), + + TP_printk("%s CBs-invoked=%d idle=%c%c%c%c", + __entry->rcuname, __entry->callbacks_invoked, + __entry->cb ? 'C' : '.', + __entry->nr ? 'S' : '.', + __entry->iit ? 'I' : '.', + __entry->risk ? 'R' : '.') ); /* @@ -524,7 +543,8 @@ TRACE_EVENT(rcu_torture_read, #define trace_rcu_batch_start(rcuname, qlen, blimit) do { } while (0) #define trace_rcu_invoke_callback(rcuname, rhp) do { } while (0) #define trace_rcu_invoke_kfree_callback(rcuname, rhp, offset) do { } while (0) -#define trace_rcu_batch_end(rcuname, callbacks_invoked) do { } while (0) +#define trace_rcu_batch_end(rcuname, callbacks_invoked, cb, nr, iit, risk) \ + do { } while (0) #define trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) #endif /* #else #ifdef CONFIG_RCU_TRACE */ -- cgit v1.2.2
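Given the argument descriptions above, the call site at the end of rcu_do_batch() would look roughly like the sketch below; the local names rsp, count, and list are assumptions for illustration, not quoted from the patch:

	trace_rcu_batch_end(rsp->name, count,		/* flavor name, CBs invoked */
			    !!list,			/* cb: ready callbacks still queued? */
			    need_resched(),		/* nr: reschedule needed? */
			    is_idle_task(current),	/* iit: running in the idle task? */
			    rcu_is_callbacks_kthread());	/* risk: RCU-callbacks kthread? */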