From 9b2e4f1880b789be1f24f9684f7a54b90310b5c0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 30 Sep 2011 12:10:22 -0700 Subject: rcu: Track idleness independent of idle tasks Earlier versions of RCU used the scheduling-clock tick to detect idleness by checking for the idle task, but handled idleness differently for CONFIG_NO_HZ=y. But there are now a number of uses of RCU read-side critical sections in the idle task, for example, for tracing. A more fine-grained detection of idleness is therefore required. This commit presses the old dyntick-idle code into full-time service, so that rcu_idle_enter(), previously known as rcu_enter_nohz(), is always invoked at the beginning of an idle loop iteration. Similarly, rcu_idle_exit(), previously known as rcu_exit_nohz(), is always invoked at the end of an idle-loop iteration. This allows the idle task to use RCU everywhere except between consecutive rcu_idle_enter() and rcu_idle_exit() calls, in turn allowing architecture maintainers to specify exactly where in the idle loop RCU may be used. Because some of the userspace upcall uses can result in what looks to RCU like half of an interrupt, it is not possible to expect that the irq_enter() and irq_exit() hooks will give exact counts. This patch therefore expands the ->dynticks_nesting counter to 64 bits and uses two separate bitfields to count process/idle transitions and interrupt entry/exit transitions. It is presumed that userspace upcalls do not happen in the idle loop or from usermode execution (though usermode might do a system call that results in an upcall). The counter is hard-reset on each process/idle transition, which keeps the interrupt entry/exit error from accumulating. Overflow is avoided by the 64-bitness of the ->dynticks_nesting counter. This commit also adds warnings if a non-idle task asks RCU to enter idle state (and these checks will need some adjustment before applying Frederic's OS-jitter patches (http://lkml.org/lkml/2011/10/7/246)). In addition, validation of ->dynticks and ->dynticks_nesting is added. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E.
McKenney Reviewed-by: Josh Triplett --- include/linux/hardirq.h | 21 --------------------- include/linux/rcupdate.h | 21 ++++----------------- include/linux/tick.h | 11 +++++++++-- include/trace/events/rcu.h | 10 ++++++---- 4 files changed, 19 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index f743883f769..bb7f3097185 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -139,20 +139,7 @@ static inline void account_system_vtime(struct task_struct *tsk) extern void account_system_vtime(struct task_struct *tsk); #endif -#if defined(CONFIG_NO_HZ) #if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU) -extern void rcu_enter_nohz(void); -extern void rcu_exit_nohz(void); - -static inline void rcu_irq_enter(void) -{ - rcu_exit_nohz(); -} - -static inline void rcu_irq_exit(void) -{ - rcu_enter_nohz(); -} static inline void rcu_nmi_enter(void) { @@ -163,17 +150,9 @@ static inline void rcu_nmi_exit(void) } #else -extern void rcu_irq_enter(void); -extern void rcu_irq_exit(void); extern void rcu_nmi_enter(void); extern void rcu_nmi_exit(void); #endif -#else -# define rcu_irq_enter() do { } while (0) -# define rcu_irq_exit() do { } while (0) -# define rcu_nmi_enter() do { } while (0) -# define rcu_nmi_exit() do { } while (0) -#endif /* #if defined(CONFIG_NO_HZ) */ /* * It is safe to do non-atomic ops on ->hardirq_context, diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 2cf4226ade7..cd1ad4b04c6 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -177,23 +177,10 @@ extern void rcu_sched_qs(int cpu); extern void rcu_bh_qs(int cpu); extern void rcu_check_callbacks(int cpu, int user); struct notifier_block; - -#ifdef CONFIG_NO_HZ - -extern void rcu_enter_nohz(void); -extern void rcu_exit_nohz(void); - -#else /* #ifdef CONFIG_NO_HZ */ - -static inline void rcu_enter_nohz(void) -{ -} - -static inline void rcu_exit_nohz(void) -{ -} - -#endif /* #else #ifdef CONFIG_NO_HZ */ +extern void rcu_idle_enter(void); +extern void rcu_idle_exit(void); +extern void rcu_irq_enter(void); +extern void rcu_irq_exit(void); /* * Infrastructure to implement the synchronize_() primitives in diff --git a/include/linux/tick.h b/include/linux/tick.h index b232ccc0ee2..ca40838fdfb 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -127,8 +127,15 @@ extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); # else -static inline void tick_nohz_stop_sched_tick(int inidle) { } -static inline void tick_nohz_restart_sched_tick(void) { } +static inline void tick_nohz_stop_sched_tick(int inidle) +{ + if (inidle) + rcu_idle_enter(); +} +static inline void tick_nohz_restart_sched_tick(void) +{ + rcu_idle_exit(); +} static inline ktime_t tick_nohz_get_sleep_length(void) { ktime_t len = { .tv64 = NSEC_PER_SEC/HZ }; diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 669fbd62ec2..e5771804c50 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -246,19 +246,21 @@ TRACE_EVENT(rcu_fqs, */ TRACE_EVENT(rcu_dyntick, - TP_PROTO(char *polarity), + TP_PROTO(char *polarity, int nesting), - TP_ARGS(polarity), + TP_ARGS(polarity, nesting), TP_STRUCT__entry( __field(char *, polarity) + __field(int, nesting) ), TP_fast_assign( __entry->polarity = polarity; + __entry->nesting = nesting; ), - TP_printk("%s", __entry->polarity) + TP_printk("%s 
%d", __entry->polarity, __entry->nesting) ); /* @@ -443,7 +445,7 @@ TRACE_EVENT(rcu_batch_end, #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks) do { } while (0) #define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0) -#define trace_rcu_dyntick(polarity) do { } while (0) +#define trace_rcu_dyntick(polarity, nesting) do { } while (0) #define trace_rcu_callback(rcuname, rhp, qlen) do { } while (0) #define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen) do { } while (0) #define trace_rcu_batch_start(rcuname, qlen, blimit) do { } while (0) -- cgit v1.2.2 From 91afaf300269aa99a4d646969b3258b74294ac4d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 2 Oct 2011 07:44:32 -0700 Subject: rcu: Add failure tracing to rcutorture Trace the rcutorture RCU accesses and dump the trace buffer when the first failure is detected. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 8 ++++++++ include/trace/events/rcu.h | 26 ++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index cd1ad4b04c6..8d315b013e3 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -51,6 +51,8 @@ extern int rcutorture_runnable; /* for sysctl */ #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) extern void rcutorture_record_test_transition(void); extern void rcutorture_record_progress(unsigned long vernum); +extern void do_trace_rcu_torture_read(char *rcutorturename, + struct rcu_head *rhp); #else static inline void rcutorture_record_test_transition(void) { @@ -58,6 +60,12 @@ static inline void rcutorture_record_test_transition(void) static inline void rcutorture_record_progress(unsigned long vernum) { } +#ifdef CONFIG_RCU_TRACE +extern void do_trace_rcu_torture_read(char *rcutorturename, + struct rcu_head *rhp); +#else +#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) +#endif #endif #define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b)) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index e5771804c50..172620a92b1 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -437,6 +437,31 @@ TRACE_EVENT(rcu_batch_end, __entry->rcuname, __entry->callbacks_invoked) ); +/* + * Tracepoint for rcutorture readers. The first argument is the name + * of the RCU flavor from rcutorture's viewpoint and the second argument + * is the callback address. 
+ */ +TRACE_EVENT(rcu_torture_read, + + TP_PROTO(char *rcutorturename, struct rcu_head *rhp), + + TP_ARGS(rcutorturename, rhp), + + TP_STRUCT__entry( + __field(char *, rcutorturename) + __field(struct rcu_head *, rhp) + ), + + TP_fast_assign( + __entry->rcutorturename = rcutorturename; + __entry->rhp = rhp; + ), + + TP_printk("%s torture read %p", + __entry->rcutorturename, __entry->rhp) +); + #else /* #ifdef CONFIG_RCU_TRACE */ #define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0) @@ -452,6 +477,7 @@ TRACE_EVENT(rcu_batch_end, #define trace_rcu_invoke_callback(rcuname, rhp) do { } while (0) #define trace_rcu_invoke_kfree_callback(rcuname, rhp, offset) do { } while (0) #define trace_rcu_batch_end(rcuname, callbacks_invoked) do { } while (0) +#define trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) #endif /* #else #ifdef CONFIG_RCU_TRACE */ -- cgit v1.2.2 From e6b80a3b0994ea6c3d876d72464f2debbfcfeb05 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Oct 2011 16:25:18 -0700 Subject: rcu: Detect illegal rcu dereference in extended quiescent state Report that none of the rcu read lock maps are held while in an RCU extended quiescent state (the section between rcu_idle_enter() and rcu_idle_exit()). This helps detect any use of rcu_dereference() and friends from within the section in idle where RCU is not allowed. This way we can guarantee an extended quiescent window where the CPU can be put in dyntick idle mode or can simply avoid being part of any global grace period completion while in the idle loop. Uses of RCU from such a mode are totally ignored by RCU, hence the importance of these checks. Signed-off-by: Frederic Weisbecker Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Lai Jiangshan Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 8d315b013e3..bf91fcfe181 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -228,6 +228,15 @@ static inline void destroy_rcu_head_on_stack(struct rcu_head *head) #ifdef CONFIG_DEBUG_LOCK_ALLOC +#ifdef CONFIG_PROVE_RCU +extern int rcu_is_cpu_idle(void); +#else /* !CONFIG_PROVE_RCU */ +static inline int rcu_is_cpu_idle(void) +{ + return 0; +} +#endif /* else !CONFIG_PROVE_RCU */ + extern struct lockdep_map rcu_lock_map; # define rcu_read_acquire() \ lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_) @@ -262,6 +271,8 @@ static inline int rcu_read_lock_held(void) { if (!debug_lockdep_rcu_enabled()) return 1; + if (rcu_is_cpu_idle()) + return 0; return lock_is_held(&rcu_lock_map); } @@ -285,6 +296,19 @@ extern int rcu_read_lock_bh_held(void); * * Check debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. + * + * Note that if the CPU is in the idle loop from an RCU point of + * view (ie: that we are in the section between rcu_idle_enter() and + * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU + * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs + * that are in such a section, considering these as in extended quiescent + * state, so such a CPU is effectively never in an RCU read-side critical + * section regardless of what RCU primitives it invokes. 
This state of + affairs is required --- we need to keep an RCU-free window in idle + where the CPU may possibly enter into low power mode. This way we can + notice an extended quiescent state to other CPUs that started a grace + period. Otherwise we would delay any grace period as long as we run in + the idle task. */ #ifdef CONFIG_PREEMPT_COUNT static inline int rcu_read_lock_sched_held(void) @@ -293,6 +317,8 @@ static inline int rcu_read_lock_sched_held(void) if (!debug_lockdep_rcu_enabled()) return 1; + if (rcu_is_cpu_idle()) + return 0; if (debug_locks) lockdep_opinion = lock_is_held(&rcu_sched_lock_map); return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); -- cgit v1.2.2 From 00f49e5729af602deb559b0cf293a00b625e8636 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Oct 2011 18:22:02 +0200 Subject: rcu: Warn when rcu_read_lock() is used in extended quiescent state We are currently able to detect uses of rcu_dereference_check() inside extended quiescent states (such as the RCU-free window in idle). But rcu_read_lock() and friends can be used without rcu_dereference(), so that the earlier commit checking for use of rcu_dereference() and friends while in RCU idle mode misses some error conditions. This commit therefore adds extended quiescent state checking to rcu_read_lock() and friends. Uses of RCU from within RCU-idle mode are totally ignored by RCU, hence the importance of these checks. Signed-off-by: Frederic Weisbecker Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Lai Jiangshan Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 52 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index bf91fcfe181..d201c155f70 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -237,21 +237,53 @@ static inline int rcu_is_cpu_idle(void) } #endif /* else !CONFIG_PROVE_RCU */ +static inline void rcu_lock_acquire(struct lockdep_map *map) +{ + WARN_ON_ONCE(rcu_is_cpu_idle()); + lock_acquire(map, 0, 0, 2, 1, NULL, _THIS_IP_); +} + +static inline void rcu_lock_release(struct lockdep_map *map) +{ + WARN_ON_ONCE(rcu_is_cpu_idle()); + lock_release(map, 1, _THIS_IP_); +} + extern struct lockdep_map rcu_lock_map; -# define rcu_read_acquire() \ - lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_) -# define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_) + +static inline void rcu_read_acquire(void) +{ + rcu_lock_acquire(&rcu_lock_map); +} + +static inline void rcu_read_release(void) +{ + rcu_lock_release(&rcu_lock_map); +} extern struct lockdep_map rcu_bh_lock_map; -# define rcu_read_acquire_bh() \ - lock_acquire(&rcu_bh_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_) -# define rcu_read_release_bh() lock_release(&rcu_bh_lock_map, 1, _THIS_IP_) + +static inline void rcu_read_acquire_bh(void) +{ + rcu_lock_acquire(&rcu_bh_lock_map); +} + +static inline void rcu_read_release_bh(void) +{ + rcu_lock_release(&rcu_bh_lock_map); +} extern struct lockdep_map rcu_sched_lock_map; + +static inline void rcu_read_acquire_sched(void) +{ + rcu_lock_acquire(&rcu_sched_lock_map); +} + +static inline void rcu_read_release_sched(void) +{ + 
rcu_lock_release(&rcu_sched_lock_map); +} extern int debug_lockdep_rcu_enabled(void); -- cgit v1.2.2 From d8ab29f8be918b34a1ccd174569a53f0eb04b0a5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 7 Oct 2011 18:22:03 +0200 Subject: rcu: Remove one layer of abstraction from PROVE_RCU checking Simplify things a bit by substituting the definitions of the single-line rcu_read_acquire(), rcu_read_release(), rcu_read_acquire_bh(), rcu_read_release_bh(), rcu_read_acquire_sched(), and rcu_read_release_sched() functions at their call points. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 53 ++++++++---------------------------------------- 1 file changed, 8 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index d201c155f70..5dd6fd8b320 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -250,41 +250,8 @@ static inline void rcu_lock_release(struct lockdep_map *map) } extern struct lockdep_map rcu_lock_map; - -static inline void rcu_read_acquire(void) -{ - rcu_lock_acquire(&rcu_lock_map); -} - -static inline void rcu_read_release(void) -{ - rcu_lock_release(&rcu_lock_map); -} - extern struct lockdep_map rcu_bh_lock_map; - -static inline void rcu_read_acquire_bh(void) -{ - rcu_lock_acquire(&rcu_bh_lock_map); -} - -static inline void rcu_read_release_bh(void) -{ - rcu_lock_release(&rcu_bh_lock_map); -} - extern struct lockdep_map rcu_sched_lock_map; - -static inline void rcu_read_acquire_sched(void) -{ - rcu_lock_acquire(&rcu_sched_lock_map); -} - -static inline void rcu_read_release_sched(void) -{ - rcu_lock_release(&rcu_sched_lock_map); -} - extern int debug_lockdep_rcu_enabled(void); /** @@ -364,12 +331,8 @@ static inline int rcu_read_lock_sched_held(void) #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -# define rcu_read_acquire() do { } while (0) -# define rcu_read_release() do { } while (0) -# define rcu_read_acquire_bh() do { } while (0) -# define rcu_read_release_bh() do { } while (0) -# define rcu_read_acquire_sched() do { } while (0) -# define rcu_read_release_sched() do { } while (0) +# define rcu_lock_acquire(a) do { } while (0) +# define rcu_lock_release(a) do { } while (0) static inline int rcu_read_lock_held(void) { @@ -690,7 +653,7 @@ static inline void rcu_read_lock(void) { __rcu_read_lock(); __acquire(RCU); - rcu_read_acquire(); + rcu_lock_acquire(&rcu_lock_map); } /* @@ -710,7 +673,7 @@ static inline void rcu_read_lock(void) */ static inline void rcu_read_unlock(void) { - rcu_read_release(); + rcu_lock_release(&rcu_lock_map); __release(RCU); __rcu_read_unlock(); } @@ -731,7 +694,7 @@ static inline void rcu_read_lock_bh(void) { local_bh_disable(); __acquire(RCU_BH); - rcu_read_acquire_bh(); + rcu_lock_acquire(&rcu_bh_lock_map); } /* @@ -741,7 +704,7 @@ static inline void rcu_read_lock_bh(void) */ static inline void rcu_read_unlock_bh(void) { - rcu_read_release_bh(); + rcu_lock_release(&rcu_bh_lock_map); __release(RCU_BH); local_bh_enable(); } @@ -758,7 +721,7 @@ static inline void rcu_read_lock_sched(void) { preempt_disable(); __acquire(RCU_SCHED); - rcu_read_acquire_sched(); + rcu_lock_acquire(&rcu_sched_lock_map); } /* Used by lockdep and tracing: cannot be traced, cannot call lockdep. 
*/ @@ -775,7 +738,7 @@ static inline notrace void rcu_read_lock_sched_notrace(void) */ static inline void rcu_read_unlock_sched(void) { - rcu_read_release_sched(); + rcu_lock_release(&rcu_sched_lock_map); __release(RCU_SCHED); preempt_enable(); } -- cgit v1.2.2 From ff195cb69ba8d2af9b891be3a26db95fe1999d43 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 7 Oct 2011 18:22:04 +0200 Subject: rcu: Warn when srcu_read_lock() is used in an extended quiescent state Catch SRCU up to the other variants of RCU by making PROVE_RCU complain if either srcu_read_lock() or srcu_read_lock_held() is used from within RCU-idle mode. Frederic reworked this to allow for the new versions of his patches that check for extended quiescent states. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/srcu.h | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 58971e891f4..4e0a3d41dae 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -28,6 +28,7 @@ #define _LINUX_SRCU_H #include +#include struct srcu_struct_array { int c[2]; }; @@ -60,18 +61,10 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name, __init_srcu_struct((sp), #sp, &__srcu_key); \ }) -# define srcu_read_acquire(sp) \ - lock_acquire(&(sp)->dep_map, 0, 0, 2, 1, NULL, _THIS_IP_) -# define srcu_read_release(sp) \ - lock_release(&(sp)->dep_map, 1, _THIS_IP_) - #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ int init_srcu_struct(struct srcu_struct *sp); -# define srcu_read_acquire(sp) do { } while (0) -# define srcu_read_release(sp) do { } while (0) - #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ void cleanup_srcu_struct(struct srcu_struct *sp); @@ -90,12 +83,29 @@ long srcu_batches_completed(struct srcu_struct *sp); * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, * this assumes we are in an SRCU read-side critical section unless it can * prove otherwise. + * + * Note that if the CPU is in the idle loop from an RCU point of view + * (ie: that we are in the section between rcu_idle_enter() and + * rcu_idle_exit()) then srcu_read_lock_held() returns false even if + * the CPU did an srcu_read_lock(). The reason for this is that RCU + * ignores CPUs that are in such a section, considering these as in + * extended quiescent state, so such a CPU is effectively never in an + * RCU read-side critical section regardless of what RCU primitives it + * invokes. This state of affairs is required --- we need to keep an + * RCU-free window in idle where the CPU may possibly enter into low + * power mode. This way we can notice an extended quiescent state to + * other CPUs that started a grace period. Otherwise we would delay any + * grace period as long as we run in the idle task. 
*/ static inline int srcu_read_lock_held(struct srcu_struct *sp) { - if (debug_locks) - return lock_is_held(&sp->dep_map); - return 1; + if (rcu_is_cpu_idle()) + return 0; + + if (!debug_locks) + return 1; + + return lock_is_held(&sp->dep_map); } #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ @@ -150,7 +160,7 @@ static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp) { int retval = __srcu_read_lock(sp); - srcu_read_acquire(sp); + rcu_lock_acquire(&(sp)->dep_map); return retval; } @@ -164,7 +174,7 @@ static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp) static inline void srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp) { - srcu_read_release(sp); + rcu_lock_release(&(sp)->dep_map); __srcu_read_unlock(sp, idx); } -- cgit v1.2.2 From 867f236bd12f5091df6dc7cc75f94d7fd982d78a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 7 Oct 2011 18:22:05 +0200 Subject: rcu: Make srcu_read_lock_held() call common lockdep-enabled function A common debug_lockdep_rcu_enabled() function is used to check whether RCU lockdep splats should be reported, but srcu_read_lock_held() does not use it. This commit therefore brings srcu_read_lock_held() up to date. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/srcu.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 4e0a3d41dae..d4b12443b2e 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -84,6 +84,9 @@ long srcu_batches_completed(struct srcu_struct *sp); * this assumes we are in an SRCU read-side critical section unless it can * prove otherwise. * + * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot + * and while lockdep is disabled. + * * Note that if the CPU is in the idle loop from an RCU point of view * (ie: that we are in the section between rcu_idle_enter() and * rcu_idle_exit()) then srcu_read_lock_held() returns false even if @@ -102,7 +105,7 @@ static inline int srcu_read_lock_held(struct srcu_struct *sp) if (rcu_is_cpu_idle()) return 0; - if (!debug_locks) + if (!debug_lockdep_rcu_enabled()) return 1; return lock_is_held(&sp->dep_map); -- cgit v1.2.2 From 280f06774afedf849f0b34248ed6aff57d0f6908 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Oct 2011 18:22:06 +0200 Subject: nohz: Separate out irq exit and idle loop dyntick logic The tick_nohz_stop_sched_tick() function, which tries to delay the next timer tick as long as possible, can be called from two places: - From the idle loop to start the dyntick idle mode - From interrupt exit if we have interrupted the dyntick idle mode, so that we reprogram the next tick event in case the irq changed some internal state that requires this action. There are only a few minor differences between the two, handled by that function and driven by the per-cpu ts->inidle variable and the inidle parameter. Together these guarantee that we only update the dyntick mode on irq exit if we actually interrupted the dyntick idle mode, and that we enter in RCU extended quiescent state from idle loop entry only. Split this function into: - tick_nohz_idle_enter(), which sets ts->inidle to 1, enters dynticks idle mode unconditionally if it can, and enters into RCU extended quiescent state. - tick_nohz_irq_exit() which only updates the dynticks idle mode when ts->inidle is set (ie: if tick_nohz_idle_enter() has been called). A sketch of the resulting idle-loop usage follows. 
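As a rough sketch (not part of the patch itself), an architecture's idle loop would then follow the pattern below; arch_cpu_sleep() is a hypothetical stand-in for the architecture's low-power wait instruction, and the interrupt-exit path is assumed to call tick_nohz_irq_exit() on its own:

static void cpu_idle_loop(void)
{
	while (1) {
		tick_nohz_idle_enter();		/* stop the tick, enter RCU extended QS */
		while (!need_resched())
			arch_cpu_sleep();	/* irqs wake us; irq exit reprograms the tick */
		tick_nohz_idle_exit();		/* restart the tick, exit RCU extended QS */
		schedule();
	}
}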
To maintain symmetry, tick_nohz_restart_sched_tick() has been renamed into tick_nohz_idle_exit(). This simplifies the code and micro-optimizes the irq exit path (no need for local_irq_save there). This also prepares for the split between dynticks and rcu extended quiescent state logic. We'll need this split to further fix illegal uses of RCU in extended quiescent states in the idle loop. Signed-off-by: Frederic Weisbecker Cc: Mike Frysinger Cc: Guan Xuetao Cc: David Miller Cc: Chris Metcalf Cc: Hans-Christian Egtvedt Cc: Ralf Baechle Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Russell King Cc: Paul Mackerras Cc: Heiko Carstens Cc: Paul Mundt Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/tick.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/tick.h b/include/linux/tick.h index ca40838fdfb..0df1d50a408 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -121,21 +121,22 @@ static inline int tick_oneshot_mode_active(void) { return 0; } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ # ifdef CONFIG_NO_HZ -extern void tick_nohz_stop_sched_tick(int inidle); -extern void tick_nohz_restart_sched_tick(void); +extern void tick_nohz_idle_enter(void); +extern void tick_nohz_idle_exit(void); +extern void tick_nohz_irq_exit(void); extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); # else -static inline void tick_nohz_stop_sched_tick(int inidle) +static inline void tick_nohz_idle_enter(void) { - if (inidle) - rcu_idle_enter(); + rcu_idle_enter(); } -static inline void tick_nohz_restart_sched_tick(void) +static inline void tick_nohz_idle_exit(void) { rcu_idle_exit(); } + static inline ktime_t tick_nohz_get_sleep_length(void) { ktime_t len = { .tv64 = NSEC_PER_SEC/HZ }; -- cgit v1.2.2 From 2bbb6817c0ac1b5f2a68d720f364f98eeb1ac4fd Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Oct 2011 16:01:00 +0200 Subject: nohz: Allow rcu extended quiescent state handling separately from tick stop It is assumed that rcu won't be used once we switch to tickless mode and until we restart the tick. However this is not always true, as in x86-64 where we dereference the idle notifiers after the tick is stopped. To prepare for fixing this, add two new APIs: tick_nohz_idle_enter_norcu() and tick_nohz_idle_exit_norcu(). If no use of RCU is made in the idle loop between tick_nohz_idle_enter() and tick_nohz_idle_exit() calls, the arch must instead call the new *_norcu() version such that the arch doesn't need to call rcu_idle_enter() and rcu_idle_exit(). Otherwise the arch must call tick_nohz_idle_enter() and tick_nohz_idle_exit() and also call explicitly: - rcu_idle_enter() after its last use of RCU before the CPU is put to sleep. - rcu_idle_exit() before the first use of RCU after the CPU is woken up. Signed-off-by: Frederic Weisbecker Cc: Mike Frysinger Cc: Guan Xuetao Cc: David Miller Cc: Chris Metcalf Cc: Hans-Christian Egtvedt Cc: Ralf Baechle Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Russell King Cc: Paul Mackerras Cc: Heiko Carstens Cc: Paul Mundt Signed-off-by: Paul E. 
McKenney --- include/linux/tick.h | 46 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/tick.h b/include/linux/tick.h index 0df1d50a408..327434a0575 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -7,6 +7,7 @@ #define _LINUX_TICK_H #include +#include #ifdef CONFIG_GENERIC_CLOCKEVENTS @@ -121,18 +122,57 @@ static inline int tick_oneshot_mode_active(void) { return 0; } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ # ifdef CONFIG_NO_HZ -extern void tick_nohz_idle_enter(void); +extern void __tick_nohz_idle_enter(void); +static inline void tick_nohz_idle_enter(void) +{ + local_irq_disable(); + __tick_nohz_idle_enter(); + local_irq_enable(); +} extern void tick_nohz_idle_exit(void); + +/* + * Call this pair of function if the arch doesn't make any use + * of RCU in-between. You won't need to call rcu_idle_enter() and + * rcu_idle_exit(). + * Otherwise you need to call tick_nohz_idle_enter() and tick_nohz_idle_exit() + * and explicitly tell RCU about the window around the place the CPU enters low + * power mode where no RCU use is made. This is done by calling rcu_idle_enter() + * after the last use of RCU before the CPU is put to sleep and by calling + * rcu_idle_exit() before the first use of RCU after the CPU woke up. + */ +static inline void tick_nohz_idle_enter_norcu(void) +{ + /* + * Also call rcu_idle_enter() in the irq disabled section even + * if it disables irq itself. + * Just an optimization that prevents from an interrupt happening + * between it and __tick_nohz_idle_enter() to lose time to help + * completing a grace period while we could be in extended grace + * period already. + */ + local_irq_disable(); + __tick_nohz_idle_enter(); + rcu_idle_enter(); + local_irq_enable(); +} +static inline void tick_nohz_idle_exit_norcu(void) +{ + rcu_idle_exit(); + tick_nohz_idle_exit(); +} extern void tick_nohz_irq_exit(void); extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); # else -static inline void tick_nohz_idle_enter(void) +static inline void tick_nohz_idle_enter(void) { } +static inline void tick_nohz_idle_exit(void) { } +static inline void tick_nohz_idle_enter_norcu(void) { rcu_idle_enter(); } -static inline void tick_nohz_idle_exit(void) +static inline void tick_nohz_idle_exit_norcu(void) { rcu_idle_exit(); } -- cgit v1.2.2 From 0c53dd8b31404c1d7fd15be8f065ebaec615a562 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 9 Oct 2011 15:13:11 -0700 Subject: rcu: Introduce raw SRCU read-side primitives The RCU implementations, including SRCU, are designed to be used in a lock-like fashion, so that the read-side lock and unlock primitives must execute in the same context for any given read-side critical section. This constraint is enforced by lockdep-RCU. However, there is a need to enter an SRCU read-side critical section within the context of an exception and then exit in the context of the task that encountered the exception. The cost of this capability is that the read-side operations incur the overhead of disabling interrupts. Note that although the current implementation allows a given read-side critical section to be entered by one task and then exited by another, all known possible implementations that allow this have scalability problems. 
Therefore, a given read-side critical section must be exited by the same task that entered it, though perhaps from an interrupt or exception handler running within that task's context. But if you are thinking in terms of interrupt handlers, make sure that you have considered the possibility of threaded interrupt handlers. Credit goes to Peter Zijlstra for suggesting use of the existing _raw suffix to indicate disabling lockdep over the earlier "bulkref" names. Requested-by: Srikar Dronamraju Signed-off-by: Paul E. McKenney Tested-by: Srikar Dronamraju --- include/linux/srcu.h | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'include') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index d4b12443b2e..1eb520cd168 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -181,4 +181,47 @@ static inline void srcu_read_unlock(struct srcu_struct *sp, int idx) __srcu_read_unlock(sp, idx); } +/** + * srcu_read_lock_raw - register a new reader for an SRCU-protected structure. + * @sp: srcu_struct in which to register the new reader. + * + * Enter an SRCU read-side critical section. Similar to srcu_read_lock(), + * but avoids the RCU-lockdep checking. This means that it is legal to + * use srcu_read_lock_raw() in one context, for example, in an exception + * handler, and then have the matching srcu_read_unlock_raw() in another + * context, for example in the task that took the exception. + * + * However, the entire SRCU read-side critical section must reside within a + * single task. For example, beware of using srcu_read_lock_raw() in + * a device interrupt handler and srcu_read_unlock() in the interrupted + * task: This will not work if interrupts are threaded. + */ +static inline int srcu_read_lock_raw(struct srcu_struct *sp) +{ + unsigned long flags; + int ret; + + local_irq_save(flags); + ret = __srcu_read_lock(sp); + local_irq_restore(flags); + return ret; +} + +/** + * srcu_read_unlock_raw - unregister reader from an SRCU-protected structure. + * @sp: srcu_struct in which to unregister the old reader. + * @idx: return value from corresponding srcu_read_lock_raw(). + * + * Exit an SRCU read-side critical section without lockdep-RCU checking. + * See srcu_read_lock_raw() for more details. + */ +static inline void srcu_read_unlock_raw(struct srcu_struct *sp, int idx) +{ + unsigned long flags; + + local_irq_save(flags); + __srcu_read_unlock(sp, idx); + local_irq_restore(flags); +} + #endif -- cgit v1.2.2 From 4145fa7fbee3ec1e61c52825b146192885d9759f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 31 Oct 2011 15:01:54 -0700 Subject: rcu: Deconfuse dynticks entry-exit tracing The trace_rcu_dyntick() trace event did not print both the old and the new value of the nesting level, and furthermore printed only the low-order 32 bits of it. This could result in some confusion when interpreting trace-event dumps, so this commit prints both the old and the new value, prints the full 64 bits, and also selects the process-entry/exit increment to print nicely in hexadecimal. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- include/trace/events/rcu.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 172620a92b1..c29fb2f5590 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -246,21 +246,24 @@ TRACE_EVENT(rcu_fqs, */ TRACE_EVENT(rcu_dyntick, - TP_PROTO(char *polarity, int nesting), + TP_PROTO(char *polarity, long long oldnesting, long long newnesting), - TP_ARGS(polarity, nesting), + TP_ARGS(polarity, oldnesting, newnesting), TP_STRUCT__entry( __field(char *, polarity) - __field(int, nesting) + __field(long long, oldnesting) + __field(long long, newnesting) ), TP_fast_assign( __entry->polarity = polarity; - __entry->nesting = nesting; + __entry->oldnesting = oldnesting; + __entry->newnesting = newnesting; ), - TP_printk("%s %d", __entry->polarity, __entry->nesting) + TP_printk("%s %llx %llx", __entry->polarity, + __entry->oldnesting, __entry->newnesting) ); /* @@ -470,7 +473,7 @@ TRACE_EVENT(rcu_batch_end, #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks) do { } while (0) #define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0) -#define trace_rcu_dyntick(polarity, nesting) do { } while (0) +#define trace_rcu_dyntick(polarity, oldnesting, newnesting) do { } while (0) #define trace_rcu_callback(rcuname, rhp, qlen) do { } while (0) #define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen) do { } while (0) #define trace_rcu_batch_start(rcuname, qlen, blimit) do { } while (0) -- cgit v1.2.2 From c4f3060843506ba6d473ab9a0afe5bd5dc93a00d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Nov 2011 12:41:56 -0800 Subject: sched: Add is_idle_task() to handle invalidated uses of idle_cpu() Commit 908a3283 (Fix idle_cpu()) invalidated some uses of idle_cpu(), which used to say whether or not the CPU was running the idle task, but now instead says whether or not the CPU is running the idle task in the absence of pending wakeups. Although this new implementation gives a better answer to the question "is this CPU idle?", it also invalidates other uses that were made of idle_cpu(). This commit therefore introduces a new is_idle_task() API member that determines whether or not the specified task is one of the idle tasks, allowing open-coded "->pid == 0" sequences to be replaced by something more meaningful. Suggested-by: Josh Triplett Suggested-by: Peter Zijlstra Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/linux/sched.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1c4f3e9b9bc..4a7e4d333a2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2070,6 +2070,14 @@ extern int sched_setscheduler(struct task_struct *, int, extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); extern struct task_struct *idle_task(int cpu); +/** + * is_idle_task - is the specified task an idle task? + * @p: the task in question. 
+ */ +static inline bool is_idle_task(struct task_struct *p) +{ + return p->pid == 0; +} extern struct task_struct *curr_task(int cpu); extern void set_curr_task(int cpu, struct task_struct *p); -- cgit v1.2.2 From 1268fbc746ea1cd279886a740dcbad4ba5232225 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 17 Nov 2011 18:48:14 +0100 Subject: nohz: Remove tick_nohz_idle_enter_norcu() / tick_nohz_idle_exit_norcu() Those two APIs were provided to optimize the calls of tick_nohz_idle_enter() and rcu_idle_enter() into a single irq disabled section. This way no interrupt happening in-between would needlessly process any RCU job. Now we are talking about an optimization whose benefits have yet to be measured. Let's start simple and completely decouple the idle RCU and dyntick-idle logic to keep things simple. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Reviewed-by: Josh Triplett Signed-off-by: Paul E. McKenney --- include/linux/tick.h | 47 +----------------------------------------------- 1 file changed, 1 insertion(+), 46 deletions(-) (limited to 'include') diff --git a/include/linux/tick.h b/include/linux/tick.h index 327434a0575..ab8be90b5cc 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -122,45 +122,8 @@ static inline int tick_oneshot_mode_active(void) { return 0; } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ # ifdef CONFIG_NO_HZ -extern void __tick_nohz_idle_enter(void); -static inline void tick_nohz_idle_enter(void) -{ - local_irq_disable(); - __tick_nohz_idle_enter(); - local_irq_enable(); -} +extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); - -/* - * Call this pair of function if the arch doesn't make any use - * of RCU in-between. You won't need to call rcu_idle_enter() and - * rcu_idle_exit(). - * Otherwise you need to call tick_nohz_idle_enter() and tick_nohz_idle_exit() - * and explicitly tell RCU about the window around the place the CPU enters low - * power mode where no RCU use is made. This is done by calling rcu_idle_enter() - * after the last use of RCU before the CPU is put to sleep and by calling - * rcu_idle_exit() before the first use of RCU after the CPU woke up. - */ -static inline void tick_nohz_idle_enter_norcu(void) -{ - /* - * Also call rcu_idle_enter() in the irq disabled section even - * if it disables irq itself. - * Just an optimization that prevents from an interrupt happening - * between it and __tick_nohz_idle_enter() to lose time to help - * completing a grace period while we could be in extended grace - * period already. - */ - local_irq_disable(); - __tick_nohz_idle_enter(); - rcu_idle_enter(); - local_irq_enable(); -} -static inline void tick_nohz_idle_exit_norcu(void) -{ - rcu_idle_exit(); - tick_nohz_idle_exit(); -} extern void tick_nohz_irq_exit(void); extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); @@ -168,14 +131,6 @@ extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); # else static inline void tick_nohz_idle_enter(void) { } static inline void tick_nohz_idle_exit(void) { } -static inline void tick_nohz_idle_enter_norcu(void) -{ - rcu_idle_enter(); -} -static inline void tick_nohz_idle_exit_norcu(void) -{ - rcu_idle_exit(); -} static inline ktime_t tick_nohz_get_sleep_length(void) { -- cgit v1.2.2 From 045fb9315a2129023d70a0eecf0942e18fca4fcd Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 22 Nov 2011 12:13:03 -0800 Subject: rcu: Update trace_rcu_dyntick() header comment This commit updates the trace_rcu_dyntick() header comment to reflect events added by commit 4b4f421. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index c29fb2f5590..7f6877a3505 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -241,8 +241,16 @@ TRACE_EVENT(rcu_fqs, /* * Tracepoint for dyntick-idle entry/exit events. These take a string - * as argument: "Start" for entering dyntick-idle mode and "End" for - * leaving it. + * as argument: "Start" for entering dyntick-idle mode, "End" for + * leaving it, "--=" for events moving towards idle, and "++=" for events + * moving away from idle. "Error on entry: not idle task" and "Error on + * exit: not idle task" indicate that a non-idle task is erroneously + * toying with the idle loop. + * + * These events also take a pair of numbers, which indicate the nesting + * depth before and after the event of interest. Note that task-related + * events use the upper bits of each number, while interrupt-related + * events use the lower bits. */ TRACE_EVENT(rcu_dyntick, -- cgit v1.2.2 From 433cdddcd9ac5558068edd7f8d4707a70f7710f5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 Nov 2011 14:58:03 -0800 Subject: rcu: Add tracing for RCU_FAST_NO_HZ This commit adds trace_rcu_prep_idle(), which is invoked from rcu_prepare_for_idle() and rcu_wake_cpu() to trace attempts on the part of RCU to force CPUs into dyntick-idle mode. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 7f6877a3505..debe453c962 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -274,6 +274,42 @@ TRACE_EVENT(rcu_dyntick, __entry->oldnesting, __entry->newnesting) ); +/* + * Tracepoint for RCU preparation for idle, the goal being to get RCU + * processing done so that the current CPU can shut off its scheduling + * clock and enter dyntick-idle mode. One way to accomplish this is + * to drain all RCU callbacks from this CPU, and the other is to have + * done everything RCU requires for the current grace period. In this + * latter case, the CPU will be awakened at the end of the current grace + * period in order to process the remainder of its callbacks. + * + * These tracepoints take a string as argument: + * + * "No callbacks": Nothing to do, no callbacks on this CPU. + * "In holdoff": Nothing to do, holding off after unsuccessful attempt. + * "Dyntick with callbacks": Callbacks remain, but RCU doesn't need CPU. + * "Begin holdoff": Attempt failed, don't retry until next jiffy. + * "More callbacks": Still more callbacks, try again to clear them out. + * "Callbacks drained": All callbacks processed, off to dyntick idle! + * "CPU awakened at GP end": + */ +TRACE_EVENT(rcu_prep_idle, + + TP_PROTO(char *reason), + + TP_ARGS(reason), + + TP_STRUCT__entry( + __field(char *, reason) + ), + + TP_fast_assign( + __entry->reason = reason; + ), + + TP_printk("%s", __entry->reason) +); + /* * Tracepoint for the registration of a single RCU callback function. 
* The first argument is the type of RCU, the second argument is @@ -482,6 +518,7 @@ TRACE_EVENT(rcu_torture_read, #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks) do { } while (0) #define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0) #define trace_rcu_dyntick(polarity, oldnesting, newnesting) do { } while (0) +#define trace_rcu_prep_idle(reason) do { } while (0) #define trace_rcu_callback(rcuname, rhp, qlen) do { } while (0) #define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen) do { } while (0) #define trace_rcu_batch_start(rcuname, qlen, blimit) do { } while (0) -- cgit v1.2.2 From f535a607c13c7b674e0788ca5765779aa74a01c3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 Nov 2011 20:43:02 -0800 Subject: rcu: Eliminate RCU_FAST_NO_HZ grace-period hang With the new implementation of RCU_FAST_NO_HZ, it was possible to hang RCU grace periods as follows: o CPU 0 attempts to go idle, cycles several times through the rcu_prepare_for_idle() loop, then goes dyntick-idle when RCU needs nothing more from it, while still having at least one RCU callback pending. o CPU 1 goes idle with no callbacks. Both CPUs can then stay in dyntick-idle mode indefinitely, preventing the RCU grace period from ever completing, possibly hanging the system. This commit therefore prevents CPUs that have RCU callbacks from entering dyntick-idle mode. This approach also eliminates the need for the end-of-grace-period IPIs used previously. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index debe453c962..8dd6fcb9494 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -287,7 +287,6 @@ TRACE_EVENT(rcu_dyntick, * * "No callbacks": Nothing to do, no callbacks on this CPU. * "In holdoff": Nothing to do, holding off after unsuccessful attempt. - * "Dyntick with callbacks": Callbacks remain, but RCU doesn't need CPU. * "Begin holdoff": Attempt failed, don't retry until next jiffy. * "More callbacks": Still more callbacks, try again to clear them out. * "Callbacks drained": All callbacks processed, off to dyntick idle! -- cgit v1.2.2 From 3842a0832a1d6eb0b31421f8810a813135967512 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Nov 2011 10:42:42 -0800 Subject: rcu: Document same-context read-side constraints The intent is that a given RCU read-side critical section be confined to a single context. For example, it is illegal to invoke rcu_read_lock() in an exception handler and then invoke rcu_read_unlock() from the context of the task that received the exception. Suggested-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 15 +++++++++++++++ include/linux/srcu.h | 5 +++++ 2 files changed, 20 insertions(+) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 5dd6fd8b320..81c04f4348e 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -265,6 +265,11 @@ extern int debug_lockdep_rcu_enabled(void); * * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. 
+ * + * Note that rcu_read_lock() and the matching rcu_read_unlock() must + * occur in the same context, for example, it is illegal to invoke + * rcu_read_unlock() in process context if the matching rcu_read_lock() + * was invoked from within an irq handler. */ static inline int rcu_read_lock_held(void) { @@ -689,6 +694,11 @@ static inline void rcu_read_unlock(void) * critical sections in interrupt context can use just rcu_read_lock(), * though this should at least be commented to avoid confusing people * reading the code. + * + * Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh() + * must occur in the same context, for example, it is illegal to invoke + * rcu_read_unlock_bh() from one task if the matching rcu_read_lock_bh() + * was invoked from some other task. */ static inline void rcu_read_lock_bh(void) { @@ -716,6 +726,11 @@ static inline void rcu_read_unlock_bh(void) * are being done using call_rcu_sched() or synchronize_rcu_sched(). * Read-side critical sections can also be introduced by anything that * disables preemption, including local_irq_disable() and friends. + * + * Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched() + * must occur in the same context, for example, it is illegal to invoke + * rcu_read_unlock_sched() from process context if the matching + * rcu_read_lock_sched() was invoked from an NMI handler. */ static inline void rcu_read_lock_sched(void) { diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 1eb520cd168..e1b005918bb 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -158,6 +158,11 @@ static inline int srcu_read_lock_held(struct srcu_struct *sp) * one way to indirectly wait on an SRCU grace period is to acquire * a mutex that is held elsewhere while calling synchronize_srcu() or * synchronize_srcu_expedited(). + * + * Note that srcu_read_lock() and the matching srcu_read_unlock() must + * occur in the same context, for example, it is illegal to invoke + * srcu_read_unlock() in an irq handler if the matching srcu_read_lock() + * was invoked in process context. */ static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp) { -- cgit v1.2.2 From 7cb92499000e3c86dae653077b1465458a039ef6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Nov 2011 12:28:34 -0800 Subject: rcu: Permit dyntick-idle with callbacks pending The current implementation of RCU_FAST_NO_HZ prevents CPUs from entering dyntick-idle state if they have RCU callbacks pending. Unfortunately, this has the side-effect of often preventing them from entering this state, especially if at least one other CPU is not in dyntick-idle state. However, the resulting per-tick wakeup is wasteful in many cases: if the CPU has already fully responded to the current RCU grace period, there will be nothing for it to do until this grace period ends, which will frequently take several jiffies. This commit therefore permits a CPU that has done everything that the current grace period has asked of it (rcu_pending() == 0) to enter dyntick-idle mode even if it still has RCU callbacks pending. However, such a CPU posts a timer to wake it up several jiffies later (6 jiffies, based on experience with grace-period lengths). This wakeup is required to handle situations that can result in all CPUs being in dyntick-idle mode, thus failing to ever complete the current grace period. If a CPU wakes up before the timer goes off, then it cancels that timer, thus avoiding spurious wakeups. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney --- include/trace/events/rcu.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 8dd6fcb9494..c75418c3ccb 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -288,9 +288,10 @@ TRACE_EVENT(rcu_dyntick, * "No callbacks": Nothing to do, no callbacks on this CPU. * "In holdoff": Nothing to do, holding off after unsuccessful attempt. * "Begin holdoff": Attempt failed, don't retry until next jiffy. + * "Dyntick with callbacks": Entering dyntick-idle despite callbacks. * "More callbacks": Still more callbacks, try again to clear them out. * "Callbacks drained": All callbacks processed, off to dyntick idle! - * "CPU awakened at GP end": + * "Timer": Timer fired to cause CPU to continue processing callbacks. */ TRACE_EVENT(rcu_prep_idle, -- cgit v1.2.2 From 2987557f52b97f679f0c324d8f51b8d66e1f2084 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sat, 3 Dec 2011 13:06:50 -0800 Subject: driver-core/cpu: Expose hotpluggability to the rest of the kernel When architectures register CPUs, they indicate whether the CPU allows hotplugging; notably, x86 and ARM don't allow hotplugging CPU 0. Userspace can easily query the hotpluggability of a CPU via sysfs; however, the kernel has no convenient way of accessing that property in an architecture-independent way. While the kernel can simply try it and see, some code needs to distinguish between "hotplug failed" and "hotplug has no hope of working on this CPU"; for example, rcutorture's CPU hotplug tests want to avoid drowning out real hotplug failures with expected failures. Expose this property via a new cpu_is_hotpluggable function, so that the rest of the kernel can access it in an architecture-independent way. Signed-off-by: Josh Triplett Signed-off-by: Paul E. McKenney --- include/linux/cpu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 6cb60fd2ea8..305c263021e 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -27,6 +27,7 @@ struct cpu { extern int register_cpu(struct cpu *cpu, int num); extern struct sys_device *get_cpu_sysdev(unsigned cpu); +extern bool cpu_is_hotpluggable(unsigned cpu); extern int cpu_add_sysdev_attr(struct sysdev_attribute *attr); extern void cpu_remove_sysdev_attr(struct sysdev_attribute *attr); -- cgit v1.2.2 From 4968c300e1fa5389fdf1f1ebd8b8e4aec9aa4a9e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Dec 2011 16:32:40 -0800 Subject: rcu: Augment rcu_batch_end tracing for idle and callback state The current rcu_batch_end event trace records only the name of the RCU flavor and the total number of callbacks that remain queued on the current CPU. This is insufficient for testing and tuning the new dyntick-idle RCU_FAST_NO_HZ code, so this commit adds idle state along with whether or not any of the callbacks that were ready to invoke at the beginning of rcu_do_batch() are still queued. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney --- include/trace/events/rcu.h | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index c75418c3ccb..d2d88bed891 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -461,27 +461,46 @@ TRACE_EVENT(rcu_invoke_kfree_callback, /* * Tracepoint for exiting rcu_do_batch after RCU callbacks have been - * invoked. The first argument is the name of the RCU flavor and - * the second argument is number of callbacks actually invoked. + * invoked. The first argument is the name of the RCU flavor, + * the second argument is number of callbacks actually invoked, + * the third argument (cb) is whether or not any of the callbacks that + * were ready to invoke at the beginning of this batch are still + * queued, the fourth argument (nr) is the return value of need_resched(), + * the fifth argument (iit) is 1 if the current task is the idle task, + * and the sixth argument (risk) is the return value from + * rcu_is_callbacks_kthread(). */ TRACE_EVENT(rcu_batch_end, - TP_PROTO(char *rcuname, int callbacks_invoked), + TP_PROTO(char *rcuname, int callbacks_invoked, + bool cb, bool nr, bool iit, bool risk), - TP_ARGS(rcuname, callbacks_invoked), + TP_ARGS(rcuname, callbacks_invoked, cb, nr, iit, risk), TP_STRUCT__entry( __field(char *, rcuname) __field(int, callbacks_invoked) + __field(bool, cb) + __field(bool, nr) + __field(bool, iit) + __field(bool, risk) ), TP_fast_assign( __entry->rcuname = rcuname; __entry->callbacks_invoked = callbacks_invoked; - ), - - TP_printk("%s CBs-invoked=%d", - __entry->rcuname, __entry->callbacks_invoked) + __entry->cb = cb; + __entry->nr = nr; + __entry->iit = iit; + __entry->risk = risk; + ), + + TP_printk("%s CBs-invoked=%d idle=%c%c%c%c", + __entry->rcuname, __entry->callbacks_invoked, + __entry->cb ? 'C' : '.', + __entry->nr ? 'S' : '.', + __entry->iit ? 'I' : '.', + __entry->risk ? 'R' : '.') ); /* @@ -524,7 +543,8 @@ TRACE_EVENT(rcu_torture_read, #define trace_rcu_batch_start(rcuname, qlen, blimit) do { } while (0) #define trace_rcu_invoke_callback(rcuname, rhp) do { } while (0) #define trace_rcu_invoke_kfree_callback(rcuname, rhp, offset) do { } while (0) -#define trace_rcu_batch_end(rcuname, callbacks_invoked) do { } while (0) +#define trace_rcu_batch_end(rcuname, callbacks_invoked, cb, nr, iit, risk) \ + do { } while (0) #define trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) #endif /* #else #ifdef CONFIG_RCU_TRACE */ -- cgit v1.2.2
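Given the argument descriptions above, the call site at the end of rcu_do_batch() would look roughly like the sketch below; the local names rsp, count, and list are assumptions for illustration, not quoted from the patch:

	trace_rcu_batch_end(rsp->name, count,		/* flavor name, CBs invoked */
			    !!list,			/* cb: ready callbacks still queued? */
			    need_resched(),		/* nr: reschedule needed? */
			    is_idle_task(current),	/* iit: running in the idle task? */
			    rcu_is_callbacks_kthread());	/* risk: RCU-callbacks kthread? */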