sched: Implement lockless wake-queues

This is useful for locking primitives that can effect multiple wakeups per operation and want to avoid internal lock contention by delaying the wakeups until we've released the lock's internal locks. Alternatively, it can be used to avoid issuing multiple wakeups, and thus save a few cycles, in packet processing. Queue all target tasks and wake up once you've processed all packets. That way you avoid waking the target task multiple times if there were multiple packets for the same task. Properties of a wake_q are: - Lockless, as the queue head must reside on the stack. - Being a queue, it maintains the wakeup order passed by the callers. This can be important, as otherwise highly contended locks could undermine any reliance on lock fairness. - A queued task cannot be added again until it is woken up. This patch adds the needed infrastructure into the scheduler code and uses the new wake_q to delay the futex wakeups until after we've released the hash bucket locks. Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> [tweaks, adjustments, comments, etc.] Signed-off-by: Davidlohr Bueso <dbueso@suse.de> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Acked-by: Thomas Gleixner <tglx@linutronix.de> Cc: Borislav Petkov <bp@alien8.de> Cc: Chris Mason <clm@fb.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: George Spelvin <linux@horizon.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Manfred Spraul <manfred@colorfullife.com> Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Cc: Steven Rostedt <rostedt@goodmis.org> Link: http://lkml.kernel.org/r/1430494072-30283-2-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar <mingo@kernel.org>
author: Peter Zijlstra <peterz@infradead.org> 2015-05-01 11:27:50 -0400
committer: Ingo Molnar <mingo@kernel.org> 2015-05-08 06:20:45 -0400
commit: 7675104990ed255b9315a82ae827ff312a2a88a2 (patch)
tree: c0ad064ff6e7ee2e15132caeece2851f6ce2bd4f /include/linux/sched.h
parent: 7110744516276e906f9197e2857d026eb2343393 (diff)
1 files changed, 46 insertions, 0 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4adc536a3b03..254d88e80f65 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -921,6 +921,50 @@ enum cpu_idle_type {
 #define SCHED_CAPACITY_SCALE    (1L << SCHED_CAPACITY_SHIFT)
 /*
+ * Wake-queues are lists of tasks with a pending wakeup, whose
+ * callers have already marked the task as woken internally,
+ * and can thus carry on. A common use case is being able to
+ * do the wakeups once the corresponding user lock has been
+ * released.
+ *
+ * We hold a reference to each task in the list across the wakeup,
+ * thus guaranteeing that the memory is still valid by the time
+ * the actual wakeups are performed in wake_up_q().
+ *
+ * One per task suffices, because there's never a need for a task to be
+ * in two wake queues simultaneously; it is forbidden to abandon a task
+ * in a wake queue (a call to wake_up_q() _must_ follow), so if a task is
+ * already in a wake queue, the wakeup will happen soon and the second
+ * waker can just skip it.
+ *
+ * The WAKE_Q macro declares and initializes the list head.
+ * wake_up_q() does NOT reinitialize the list; it's expected to be
+ * called near the end of a function, where the fact that the queue is
+ * not used again will be easy to see by inspection.
+ *
+ * Note that this can cause spurious wakeups. schedule() callers
+ * must ensure the call is done inside a loop, confirming that the
+ * wakeup condition has in fact occurred.
+ */
+struct wake_q_node {
+        struct wake_q_node *next;
+};
+struct wake_q_head {
+        struct wake_q_node *first;
+        struct wake_q_node **lastp;
+};
+#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01)
+#define WAKE_Q(name)                                    \
+        struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
+extern void wake_q_add(struct wake_q_head *head,
+                       struct task_struct *task);
+extern void wake_up_q(struct wake_q_head *head);
+/*
 * sched-domains (multiprocessor balancing) declarations:
 */
 #ifdef CONFIG_SMP
@@ -1532,6 +1576,8 @@ struct task_struct {
        /* Protection of the PI data structures: */
        raw_spinlock_t pi_lock;
+        struct wake_q_node wake_q;
 #ifdef CONFIG_RT_MUTEXES
        /* PI waiters blocked on a rt_mutex held by this task */
        struct rb_root pi_waiters;
author	Peter Zijlstra <peterz@infradead.org>	2015-05-01 11:27:50 -0400
committer	Ingo Molnar <mingo@kernel.org>	2015-05-08 06:20:45 -0400
commit	7675104990ed255b9315a82ae827ff312a2a88a2 (patch)
tree	c0ad064ff6e7ee2e15132caeece2851f6ce2bd4f /include/linux/sched.h
parent	7110744516276e906f9197e2857d026eb2343393 (diff)

diff --git a/include/linux/sched.h b/include/linux/sched.h index 4adc536a3b03..254d88e80f65 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h
@@ -921,6 +921,50 @@ enum cpu_idle_type {
921	#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)	921	#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)
922		922
923	/*	923	/*
		924	* Wake-queues are lists of tasks with a pending wakeup, whose
		925	* callers have already marked the task as woken internally,
		926	* and can thus carry on. A common use case is being able to
		927	* do the wakeups once the corresponding user lock has been
		928	* released.
		929	*
		930	* We hold a reference to each task in the list across the wakeup,
		931	* thus guaranteeing that the memory is still valid by the time
		932	* the actual wakeups are performed in wake_up_q().
		933	*
		934	* One per task suffices, because there's never a need for a task to be
		935	* in two wake queues simultaneously; it is forbidden to abandon a task
		936	* in a wake queue (a call to wake_up_q() _must_ follow), so if a task is
		937	* already in a wake queue, the wakeup will happen soon and the second
		938	* waker can just skip it.
		939	*
		940	* The WAKE_Q macro declares and initializes the list head.
		941	* wake_up_q() does NOT reinitialize the list; it's expected to be
		942	* called near the end of a function, where the fact that the queue is
		943	* not used again will be easy to see by inspection.
		944	*
		945	* Note that this can cause spurious wakeups. schedule() callers
		946	* must ensure the call is done inside a loop, confirming that the
		947	* wakeup condition has in fact occurred.
		948	*/
		949	struct wake_q_node {
		950	struct wake_q_node *next;
		951	};
		952
		953	struct wake_q_head {
		954	struct wake_q_node *first;
		955	struct wake_q_node **lastp;
		956	};
		957
		958	#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01)
		959
		960	#define WAKE_Q(name) \
		961	struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
		962
		963	extern void wake_q_add(struct wake_q_head *head,
		964	struct task_struct *task);
		965	extern void wake_up_q(struct wake_q_head *head);
		966
		967	/*
924	* sched-domains (multiprocessor balancing) declarations:	968	* sched-domains (multiprocessor balancing) declarations:
925	*/	969	*/
926	#ifdef CONFIG_SMP	970	#ifdef CONFIG_SMP
@@ -1532,6 +1576,8 @@ struct task_struct {
1532	/* Protection of the PI data structures: */	1576	/* Protection of the PI data structures: */
1533	raw_spinlock_t pi_lock;	1577	raw_spinlock_t pi_lock;
1534		1578
		1579	struct wake_q_node wake_q;
		1580
1535	#ifdef CONFIG_RT_MUTEXES	1581	#ifdef CONFIG_RT_MUTEXES
1536	/* PI waiters blocked on a rt_mutex held by this task */	1582	/* PI waiters blocked on a rt_mutex held by this task */
1537	struct rb_root pi_waiters;	1583	struct rb_root pi_waiters;