sched/api: Introduce task_rcu_dereference() and try_get_task_struct()

Generally task_struct is only protected by RCU if it was found on a
RCU protected list (say, for_each_process() or find_task_by_vpid()).

As Kirill pointed out rq->curr isn't protected by RCU, the scheduler
drops the (potentially) last reference without RCU gp, this means
that we need to fix the code which uses foreign_rq->curr under
rcu_read_lock().

Add a new helper which can be used to dereference rq->curr or any
other pointer to task_struct assuming that it should be cleared or
updated before the final put_task_struct(). It returns non-NULL
only if this task can't go away before rcu_read_unlock().

( Also add try_get_task_struct() to make it easier to use this API
  correctly. )

Suggested-by: Kirill Tkhai <ktkhai@parallels.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
[ Updated comments; added try_get_task_struct()]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Kirill Tkhai <tkhai@yandex.ru>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Link: http://lkml.kernel.org/r/20160518170218.GY3192@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
author: Oleg Nesterov <oleg@redhat.com> 2016-05-18 13:02:18 -0400
committer: Ingo Molnar <mingo@kernel.org> 2016-06-03 03:18:57 -0400
commit: 150593bf869393d10a79f6bd3df2585ecc20a9bb (patch)
tree: cbe9c8bbf903315c0b07397f18a6a97294bab0e7
parent: df55f462b905f3b2d40ec3fb865891382a6ebfb1 (diff)
2 files changed, 79 insertions, 0 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6e42ada26345..dee41bf59e6b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2139,6 +2139,9 @@ static inline void put_task_struct(struct task_struct *t)
                __put_task_struct(t);
 }
+struct task_struct *task_rcu_dereference(struct task_struct **ptask); /* defined in kernel/exit.c; NULL or a task whose ->usage stays >0 for the RCU section */
+struct task_struct *try_get_task_struct(struct task_struct **ptask); /* defined in kernel/exit.c; as above, but calls get_task_struct() on a non-NULL result */
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 extern void task_cputime(struct task_struct *t,
                         cputime_t *utime, cputime_t *stime);
diff --git a/kernel/exit.c b/kernel/exit.c
index 9e6e1356e6bb..2fb4d44c51b1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -211,6 +211,82 @@ repeat:
 }
 /*
+ * task_rcu_dereference - sample a task_struct pointer from a location that
+ * is not itself RCU protected (e.g. rq->curr) while inside an RCU read-side
+ * critical section.
+ *
+ * @ptask: address of the task pointer to sample. The scheme assumes this
+ *         pointer is cleared or updated before the final put_task_struct()
+ *         of the task it referred to.
+ *
+ * Returns NULL if *ptask is NULL or if the sampled task->sighand is NULL;
+ * otherwise returns the sampled task pointer.
+ *
+ * Note that if this function returns a valid task_struct pointer (!NULL)
+ * task->usage must remain >0 for the duration of the RCU critical section.
+ */
+struct task_struct *task_rcu_dereference(struct task_struct **ptask)
+{
+        struct sighand_struct *sighand;
+        struct task_struct *task;
+        /*
+         * We need to verify that release_task() was not called and thus
+         * delayed_put_task_struct() can't run and drop the last reference
+         * before rcu_read_unlock(). We check task->sighand != NULL,
+         * but we can read the already freed and reused memory.
+         *
+         * That is why task->sighand is read with probe_kernel_address()
+         * instead of a plain load.
+         *
+         * NOTE(review): the result of probe_kernel_address() is ignored, so
+         * "sighand" may be left unset if the read fails. Presumably a failed
+         * read implies *ptask has already changed, in which case the
+         * re-check below retries before "sighand" is used - confirm.
+         */
+retry:
+        task = rcu_dereference(*ptask);
+        if (!task)
+                return NULL;
+        probe_kernel_address(&task->sighand, sighand);
+        /*
+         * Pairs with atomic_dec_and_test() in put_task_struct(). If this task
+         * was already freed we can not miss the preceding update of this
+         * pointer.
+         *
+         * The barrier orders the ->sighand read above before the re-read of
+         * *ptask below; a changed pointer means the sample is stale and we
+         * start over.
+         */
+        smp_rmb();
+        if (unlikely(task != READ_ONCE(*ptask)))
+                goto retry;
+        /*
+         * We've re-checked that "task == *ptask", now we have two different
+         * cases:
+         *
+         * 1. This is actually the same task/task_struct. In this case
+         *    sighand != NULL tells us it is still alive.
+         *
+         * 2. This is another task which got the same memory for task_struct.
+         *    We can't know this of course, and we can not trust
+         *    sighand != NULL.
+         *
+         *    In this case we actually return a random value, but this is
+         *    correct.
+         *
+         *    If we return NULL - we can pretend that we actually noticed that
+         *    *ptask was updated when the previous task has exited. Or pretend
+         *    that probe_kernel_address(&task->sighand, sighand) reads NULL.
+         *
+         *    If we return the new task (because sighand is not NULL for any
+         *    reason) - this is fine too. This (new) task can't go away before
+         *    another gp pass.
+         *
+         *    And note: We could even eliminate the spurious NULL return if we
+         *    re-read task->sighand once again. But this case is very unlikely
+         *    so we don't care.
+         */
+        if (!sighand)
+                return NULL;
+        return task;
+}
+struct task_struct *try_get_task_struct(struct task_struct **ptask)
+{
+        struct task_struct *task;       /* result: NULL, or the task sampled from *ptask */
+        rcu_read_lock();                /* task_rcu_dereference() only vouches for the task inside the RCU section */
+        task = task_rcu_dereference(ptask);
+        if (task)
+                get_task_struct(task);  /* take our own reference while ->usage is still known to be >0 */
+        rcu_read_unlock();
+        return task;                    /* non-NULL result carries the reference taken above; presumably the caller drops it with put_task_struct() */
+}
+/*
 * Determine if a process group is "orphaned", according to the POSIX
 * definition in 2.2.2.52.  Orphaned process groups are not to be affected
 * by terminal-generated stop signals.  Newly orphaned process groups are
author	Oleg Nesterov <oleg@redhat.com>	2016-05-18 13:02:18 -0400
committer	Ingo Molnar <mingo@kernel.org>	2016-06-03 03:18:57 -0400
commit	150593bf869393d10a79f6bd3df2585ecc20a9bb (patch)
tree	cbe9c8bbf903315c0b07397f18a6a97294bab0e7
parent	df55f462b905f3b2d40ec3fb865891382a6ebfb1 (diff)

diff --git a/include/linux/sched.h b/include/linux/sched.h index 6e42ada26345..dee41bf59e6b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h
@@ -2139,6 +2139,9 @@ static inline void put_task_struct(struct task_struct *t)
2139	__put_task_struct(t);	2139	__put_task_struct(t);
2140	}	2140	}
2141		2141
		2142	struct task_struct *task_rcu_dereference(struct task_struct **ptask);
		2143	struct task_struct *try_get_task_struct(struct task_struct **ptask);
		2144
2142	#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN	2145	#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
2143	extern void task_cputime(struct task_struct *t,	2146	extern void task_cputime(struct task_struct *t,
2144	cputime_t *utime, cputime_t *stime);	2147	cputime_t *utime, cputime_t *stime);


diff --git a/kernel/exit.c b/kernel/exit.c index 9e6e1356e6bb..2fb4d44c51b1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c
@@ -211,6 +211,82 @@ repeat:
211	}	211	}
212		212
213	/*	213	/*
		214	* Note that if this function returns a valid task_struct pointer (!NULL)
		215	* task->usage must remain >0 for the duration of the RCU critical section.
		216	*/
		217	struct task_struct *task_rcu_dereference(struct task_struct **ptask)
		218	{
		219	struct sighand_struct *sighand;
		220	struct task_struct *task;
		221
		222	/*
		223	* We need to verify that release_task() was not called and thus
		224	* delayed_put_task_struct() can't run and drop the last reference
		225	* before rcu_read_unlock(). We check task->sighand != NULL,
		226	* but we can read the already freed and reused memory.
		227	*/
		228	retry:
		229	task = rcu_dereference(*ptask);
		230	if (!task)
		231	return NULL;
		232
		233	probe_kernel_address(&task->sighand, sighand);
		234
		235	/*
		236	* Pairs with atomic_dec_and_test() in put_task_struct(). If this task
		237	* was already freed we can not miss the preceding update of this
		238	* pointer.
		239	*/
		240	smp_rmb();
		241	if (unlikely(task != READ_ONCE(*ptask)))
		242	goto retry;
		243
		244	/*
		245	* We've re-checked that "task == *ptask", now we have two different
		246	* cases:
		247	*
		248	* 1. This is actually the same task/task_struct. In this case
		249	* sighand != NULL tells us it is still alive.
		250	*
		251	* 2. This is another task which got the same memory for task_struct.
		252	* We can't know this of course, and we can not trust
		253	* sighand != NULL.
		254	*
		255	* In this case we actually return a random value, but this is
		256	* correct.
		257	*
		258	* If we return NULL - we can pretend that we actually noticed that
		259	* *ptask was updated when the previous task has exited. Or pretend
		260	* that probe_slab_address(&sighand) reads NULL.
		261	*
		262	* If we return the new task (because sighand is not NULL for any
		263	* reason) - this is fine too. This (new) task can't go away before
		264	* another gp pass.
		265	*
		266	* And note: We could even eliminate the false positive if re-read
		267	* task->sighand once again to avoid the falsely NULL. But this case
		268	* is very unlikely so we don't care.
		269	*/
		270	if (!sighand)
		271	return NULL;
		272
		273	return task;
		274	}
		275
		276	struct task_struct *try_get_task_struct(struct task_struct **ptask)
		277	{
		278	struct task_struct *task;
		279
		280	rcu_read_lock();
		281	task = task_rcu_dereference(ptask);
		282	if (task)
		283	get_task_struct(task);
		284	rcu_read_unlock();
		285
		286	return task;
		287	}
		288
		289	/*
214	* Determine if a process group is "orphaned", according to the POSIX	290	* Determine if a process group is "orphaned", according to the POSIX
215	* definition in 2.2.2.52. Orphaned process groups are not to be affected	291	* definition in 2.2.2.52. Orphaned process groups are not to be affected
216	* by terminal-generated stop signals. Newly orphaned process groups are	292	* by terminal-generated stop signals. Newly orphaned process groups are