aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2017-03-16 16:54:24 -0400
committerTejun Heo <tj@kernel.org>2017-03-17 10:18:47 -0400
commit77f88796cee819b9c4562b0b6b44691b3b7755b1 (patch)
treebbb9d1150a48c4634897638f3cb273799f6c135c
parentb6a6759daf55dade2b65089957832759d502acfb (diff)
cgroup, kthread: close race window where new kthreads can be migrated to non-root cgroups
Creation of a kthread goes through a couple of interlocked stages between the kthread itself and its creator. Once the new kthread starts running, it initializes itself and wakes up the creator. The creator then can further configure the kthread and then let it start doing its job by waking it up. In this configuration-by-creator stage, the creator is the only one that can wake it up but the kthread is visible to userland. When altering the kthread's attributes from userland is allowed, this is fine; however, for cases where CPU affinity is critical, kthread_bind() is used to first disable affinity changes from userland and then set the affinity. This also prevents the kthread from being migrated into non-root cgroups as that can affect the CPU affinity and many other things. Unfortunately, the cgroup side of protection is racy. While the PF_NO_SETAFFINITY flag prevents further migrations, userland can win the race before the creator sets the flag with kthread_bind() and put the kthread in a non-root cgroup, which can lead to all sorts of problems including incorrect CPU affinity and starvation. This bug got triggered by userland which periodically tries to migrate all processes in the root cpuset cgroup to a non-root one. Per-cpu workqueue workers got caught while being created and ended up with incorrect CPU affinity, breaking concurrency management and sometimes stalling workqueue execution. This patch adds task->no_cgroup_migration which disallows the task to be migrated by userland. kthreadd starts with the flag set making every child kthread start in the root cgroup with migration disallowed. The flag is cleared after the kthread finishes initialization by which time PF_NO_SETAFFINITY is set if the kthread should stay in the root cgroup. It'd be better to wait for the initialization instead of failing but I couldn't think of a way of implementing that without adding either a new PF flag, or sleeping and retrying from the waiting side. 
Even if userland depends on changing cgroup membership of a kthread, it either has to be synchronized with kthread_create() or periodically repeat, so it's unlikely that this would break anything. v2: Switch to a simpler implementation using a new task_struct bit field suggested by Oleg. Signed-off-by: Tejun Heo <tj@kernel.org> Suggested-by: Oleg Nesterov <oleg@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Reported-and-debugged-by: Chris Mason <clm@fb.com> Cc: stable@vger.kernel.org # v4.3+ (we can't close the race on < v4.3) Signed-off-by: Tejun Heo <tj@kernel.org>
-rw-r--r--include/linux/cgroup.h21
-rw-r--r--include/linux/sched.h4
-rw-r--r--kernel/cgroup/cgroup.c9
-rw-r--r--kernel/kthread.c3
4 files changed, 33 insertions, 4 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index f6b43fbb141c..af9c86e958bd 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -570,6 +570,25 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
570 pr_cont_kernfs_path(cgrp->kn); 570 pr_cont_kernfs_path(cgrp->kn);
571} 571}
572 572
573static inline void cgroup_init_kthreadd(void)
574{
575 /*
576 * kthreadd is inherited by all kthreads, keep it in the root so
577 * that the new kthreads are guaranteed to stay in the root until
578 * initialization is finished.
579 */
580 current->no_cgroup_migration = 1;
581}
582
583static inline void cgroup_kthread_ready(void)
584{
585 /*
586 * This kthread finished initialization. The creator should have
587 * set PF_NO_SETAFFINITY if this kthread should stay in the root.
588 */
589 current->no_cgroup_migration = 0;
590}
591
573#else /* !CONFIG_CGROUPS */ 592#else /* !CONFIG_CGROUPS */
574 593
575struct cgroup_subsys_state; 594struct cgroup_subsys_state;
@@ -590,6 +609,8 @@ static inline void cgroup_free(struct task_struct *p) {}
590 609
591static inline int cgroup_init_early(void) { return 0; } 610static inline int cgroup_init_early(void) { return 0; }
592static inline int cgroup_init(void) { return 0; } 611static inline int cgroup_init(void) { return 0; }
612static inline void cgroup_init_kthreadd(void) {}
613static inline void cgroup_kthread_ready(void) {}
593 614
594static inline bool task_under_cgroup_hierarchy(struct task_struct *task, 615static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
595 struct cgroup *ancestor) 616 struct cgroup *ancestor)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d67eee84fd43..4cf9a59a4d08 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -604,6 +604,10 @@ struct task_struct {
604#ifdef CONFIG_COMPAT_BRK 604#ifdef CONFIG_COMPAT_BRK
605 unsigned brk_randomized:1; 605 unsigned brk_randomized:1;
606#endif 606#endif
607#ifdef CONFIG_CGROUPS
608 /* disallow userland-initiated cgroup migration */
609 unsigned no_cgroup_migration:1;
610#endif
607 611
608 unsigned long atomic_flags; /* Flags requiring atomic access. */ 612 unsigned long atomic_flags; /* Flags requiring atomic access. */
609 613
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 0125589c7428..638ef7568495 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2425,11 +2425,12 @@ ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
2425 tsk = tsk->group_leader; 2425 tsk = tsk->group_leader;
2426 2426
2427 /* 2427 /*
2428 * Workqueue threads may acquire PF_NO_SETAFFINITY and become 2428 * kthreads may acquire PF_NO_SETAFFINITY during initialization.
2429 * trapped in a cpuset, or RT worker may be born in a cgroup 2429 * If userland migrates such a kthread to a non-root cgroup, it can
2430 * with no rt_runtime allocated. Just say no. 2430 * become trapped in a cpuset, or RT kthread may be born in a
2431 * cgroup with no rt_runtime allocated. Just say no.
2431 */ 2432 */
2432 if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { 2433 if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
2433 ret = -EINVAL; 2434 ret = -EINVAL;
2434 goto out_unlock_rcu; 2435 goto out_unlock_rcu;
2435 } 2436 }
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 2f26adea0f84..26db528c1d88 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -20,6 +20,7 @@
20#include <linux/freezer.h> 20#include <linux/freezer.h>
21#include <linux/ptrace.h> 21#include <linux/ptrace.h>
22#include <linux/uaccess.h> 22#include <linux/uaccess.h>
23#include <linux/cgroup.h>
23#include <trace/events/sched.h> 24#include <trace/events/sched.h>
24 25
25static DEFINE_SPINLOCK(kthread_create_lock); 26static DEFINE_SPINLOCK(kthread_create_lock);
@@ -225,6 +226,7 @@ static int kthread(void *_create)
225 226
226 ret = -EINTR; 227 ret = -EINTR;
227 if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { 228 if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
229 cgroup_kthread_ready();
228 __kthread_parkme(self); 230 __kthread_parkme(self);
229 ret = threadfn(data); 231 ret = threadfn(data);
230 } 232 }
@@ -538,6 +540,7 @@ int kthreadd(void *unused)
538 set_mems_allowed(node_states[N_MEMORY]); 540 set_mems_allowed(node_states[N_MEMORY]);
539 541
540 current->flags |= PF_NOFREEZE; 542 current->flags |= PF_NOFREEZE;
543 cgroup_init_kthreadd();
541 544
542 for (;;) { 545 for (;;) {
543 set_current_state(TASK_INTERRUPTIBLE); 546 set_current_state(TASK_INTERRUPTIBLE);