diff options
author | Oleg Nesterov <oleg@redhat.com> | 2016-05-18 13:02:18 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2016-06-03 03:18:57 -0400 |
commit | 150593bf869393d10a79f6bd3df2585ecc20a9bb (patch) | |
tree | cbe9c8bbf903315c0b07397f18a6a97294bab0e7 | |
parent | df55f462b905f3b2d40ec3fb865891382a6ebfb1 (diff) |
sched/api: Introduce task_rcu_dereference() and try_get_task_struct()
Generally, task_struct is only protected by RCU if it was found on an
RCU-protected list (say, via for_each_process() or find_task_by_vpid()).
As Kirill pointed out, rq->curr isn't protected by RCU: the scheduler
drops the (potentially) last reference without waiting for an RCU grace
period (gp). This means that we need to fix the code which uses
foreign_rq->curr under rcu_read_lock().
Add a new helper which can be used to dereference rq->curr or any
other pointer to task_struct assuming that it should be cleared or
updated before the final put_task_struct(). It returns non-NULL
only if this task can't go away before rcu_read_unlock().
( Also add try_get_task_struct() to make it easier to use this API
correctly. )
Suggested-by: Kirill Tkhai <ktkhai@parallels.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
[ Updated comments; added try_get_task_struct()]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Kirill Tkhai <tkhai@yandex.ru>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Link: http://lkml.kernel.org/r/20160518170218.GY3192@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r-- | include/linux/sched.h | 3 | ||||
-rw-r--r-- | kernel/exit.c | 76 |
2 files changed, 79 insertions, 0 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h index 6e42ada26345..dee41bf59e6b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -2139,6 +2139,9 @@ static inline void put_task_struct(struct task_struct *t) | |||
2139 | __put_task_struct(t); | 2139 | __put_task_struct(t); |
2140 | } | 2140 | } |
2141 | 2141 | ||
2142 | struct task_struct *task_rcu_dereference(struct task_struct **ptask); | ||
2143 | struct task_struct *try_get_task_struct(struct task_struct **ptask); | ||
2144 | |||
2142 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN | 2145 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN |
2143 | extern void task_cputime(struct task_struct *t, | 2146 | extern void task_cputime(struct task_struct *t, |
2144 | cputime_t *utime, cputime_t *stime); | 2147 | cputime_t *utime, cputime_t *stime); |
diff --git a/kernel/exit.c b/kernel/exit.c index 9e6e1356e6bb..2fb4d44c51b1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -211,6 +211,82 @@ repeat: | |||
211 | } | 211 | } |
212 | 212 | ||
213 | /* | 213 | /* |
214 | * Note that if this function returns a valid task_struct pointer (!NULL) | ||
215 | * task->usage must remain >0 for the duration of the RCU critical section. | ||
216 | */ | ||
217 | struct task_struct *task_rcu_dereference(struct task_struct **ptask) | ||
218 | { | ||
219 | struct sighand_struct *sighand; | ||
220 | struct task_struct *task; | ||
221 | |||
222 | /* | ||
223 | * We need to verify that release_task() was not called and thus | ||
224 | * delayed_put_task_struct() can't run and drop the last reference | ||
225 | * before rcu_read_unlock(). We check task->sighand != NULL, | ||
226 | * but we can read the already freed and reused memory. | ||
227 | */ | ||
228 | retry: | ||
229 | task = rcu_dereference(*ptask); | ||
230 | if (!task) | ||
231 | return NULL; | ||
232 | |||
233 | probe_kernel_address(&task->sighand, sighand); | ||
234 | |||
235 | /* | ||
236 | * Pairs with atomic_dec_and_test() in put_task_struct(). If this task | ||
237 | * was already freed we can not miss the preceding update of this | ||
238 | * pointer. | ||
239 | */ | ||
240 | smp_rmb(); | ||
241 | if (unlikely(task != READ_ONCE(*ptask))) | ||
242 | goto retry; | ||
243 | |||
244 | /* | ||
245 | * We've re-checked that "task == *ptask", now we have two different | ||
246 | * cases: | ||
247 | * | ||
248 | * 1. This is actually the same task/task_struct. In this case | ||
249 | * sighand != NULL tells us it is still alive. | ||
250 | * | ||
251 | * 2. This is another task which got the same memory for task_struct. | ||
252 | * We can't know this of course, and we can not trust | ||
253 | * sighand != NULL. | ||
254 | * | ||
255 | * In this case we actually return a random value, but this is | ||
256 | * correct. | ||
257 | * | ||
258 | * If we return NULL - we can pretend that we actually noticed that | ||
259 | * *ptask was updated when the previous task has exited. Or pretend | ||
260 | * that probe_kernel_address(&task->sighand) reads NULL. | ||
261 | * | ||
262 | * If we return the new task (because sighand is not NULL for any | ||
263 | * reason) - this is fine too. This (new) task can't go away before | ||
264 | * another gp pass. | ||
265 | * | ||
266 | * And note: We could even eliminate the false positive if we re-read | ||
267 | * task->sighand once again to avoid a false NULL. But this case | ||
268 | * is very unlikely, so we don't care. | ||
269 | */ | ||
270 | if (!sighand) | ||
271 | return NULL; | ||
272 | |||
273 | return task; | ||
274 | } | ||
275 | |||
276 | struct task_struct *try_get_task_struct(struct task_struct **ptask) | ||
277 | { | ||
278 | struct task_struct *task; | ||
279 | |||
280 | rcu_read_lock(); | ||
281 | task = task_rcu_dereference(ptask); | ||
282 | if (task) | ||
283 | get_task_struct(task); | ||
284 | rcu_read_unlock(); | ||
285 | |||
286 | return task; | ||
287 | } | ||
288 | |||
289 | /* | ||
214 | * Determine if a process group is "orphaned", according to the POSIX | 290 | * Determine if a process group is "orphaned", according to the POSIX |
215 | * definition in 2.2.2.52. Orphaned process groups are not to be affected | 291 | * definition in 2.2.2.52. Orphaned process groups are not to be affected |
216 | * by terminal-generated stop signals. Newly orphaned process groups are | 292 | * by terminal-generated stop signals. Newly orphaned process groups are |