author		Peter Zijlstra <peterz@infradead.org>	2016-06-16 07:29:28 -0400
committer	Ingo Molnar <mingo@kernel.org>		2016-06-27 06:17:53 -0400
commit		7dc603c9028ea5d4354e0e317e8481df99b06d7e
tree		d0bb30751e4e0a13b4bdb2af705eb329378c99e2
parent		ea86cb4b7621e1298a37197005bf0abcc86348d4
sched/fair: Fix PELT integrity for new tasks
Vincent and Yuyang found another few scenarios in which entity
tracking goes wobbly.

The scenarios are basically due to the fact that new tasks are not
immediately attached and thereby differ from the normal situation -- a
task is always attached to a cfs_rq load average (such that it includes
its blocked contribution) and is explicitly detached/attached on
migration to another cfs_rq.

Scenario 1: switch to fair class

  p->sched_class = fair_class;

  if (queued)
    enqueue_task(p);
      ...
        enqueue_entity()
          enqueue_entity_load_avg()
            migrated = !sa->last_update_time (true)
            if (migrated)
              attach_entity_load_avg()

  check_class_changed()
    switched_from() (!fair)
    switched_to()   (fair)
      switched_to_fair()
        attach_entity_load_avg()

If @p is a new task that hasn't been fair before, it will have
!last_update_time and, per the above, end up in attach_entity_load_avg()
_twice_.

Scenario 2: change between cgroups

  sched_move_group(p)
    if (queued)
      dequeue_task()
    task_move_group_fair()
      detach_task_cfs_rq()
        detach_entity_load_avg()
      set_task_rq()
      attach_task_cfs_rq()
        attach_entity_load_avg()
    if (queued)
      enqueue_task();
        ...
          enqueue_entity()
            enqueue_entity_load_avg()
              migrated = !sa->last_update_time (true)
              if (migrated)
                attach_entity_load_avg()

As with scenario 1, if @p is a new task it will have !last_update_time
and we'll end up in attach_entity_load_avg() _twice_. Furthermore,
notice how we do a detach_entity_load_avg() on something that wasn't
attached to begin with.

As stated above, the problem is that the new task isn't yet attached to
the load tracking and thereby violates the invariant assumption.

This patch remedies this by ensuring a new task is indeed properly
attached to the load tracking on creation, through
post_init_entity_util_avg().

Of course, this isn't entirely as straightforward as one might think,
since the task is hashed before we call wake_up_new_task() and thus can
be poked at. We avoid this by adding TASK_NEW and teaching
cpu_cgroup_can_attach() to refuse such tasks.

Reported-by: Yuyang Du <yuyang.du@intel.com>
Reported-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
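[Editor's illustration] To make the accounting failure concrete, the sketch below is a small userspace C model of the attach invariant, not kernel code: cfs_rq and sched_entity are reduced to plain counters, last_update_time doubles as the "already attached" marker, and the helpers are simplified stand-ins for the functions in the traces above. It shows how an entity attached both by the enqueue path and by switched_to_fair() is counted twice, and how attaching it once at creation (the post_init_entity_util_avg() approach this patch takes) keeps the cfs_rq sum consistent.

#include <assert.h>
#include <stdio.h>

struct cfs_rq { long load_avg; };
struct sched_entity { long load; unsigned long long last_update_time; };

static void attach(struct cfs_rq *cfs_rq, struct sched_entity *se,
                   unsigned long long now)
{
        cfs_rq->load_avg += se->load;
        se->last_update_time = now;     /* mark as attached */
}

/* enqueue path: only attaches entities that look new/migrated */
static void enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se,
                    unsigned long long now)
{
        if (!se->last_update_time)      /* the "migrated" test from the traces above */
                attach(cfs_rq, se, now);
}

/* switched_to_fair() path: attaches unconditionally */
static void switched_to_fair(struct cfs_rq *cfs_rq, struct sched_entity *se,
                             unsigned long long now)
{
        attach(cfs_rq, se, now);
}

int main(void)
{
        /* Pre-patch scenario 1: a new, never-attached task switches to fair. */
        struct cfs_rq rq1 = { 0 };
        struct sched_entity p = { .load = 100, .last_update_time = 0 };

        enqueue(&rq1, &p, 1);           /* attaches: last_update_time was 0 */
        switched_to_fair(&rq1, &p, 1);  /* attaches again -> double count */
        printf("pre-patch  load_avg = %ld\n", rq1.load_avg);   /* 200 */

        /* Post-patch: attach once at creation; the enqueue test then fails. */
        struct cfs_rq rq2 = { 0 };
        struct sched_entity q = { .load = 100, .last_update_time = 0 };

        attach(&rq2, &q, 1);            /* post_init_entity_util_avg() analogue */
        enqueue(&rq2, &q, 2);           /* no-op: already attached */
        printf("post-patch load_avg = %ld\n", rq2.load_avg);   /* 100 */
        assert(rq2.load_avg == q.load);
        return 0;
}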
-rw-r--r--	include/linux/sched.h	5
-rw-r--r--	kernel/sched/core.c	28
-rw-r--r--	kernel/sched/fair.c	45
3 files changed, 63 insertions, 15 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b45acfd18f4e..d99218a1e043 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -219,9 +219,10 @@ extern void proc_sched_set_task(struct task_struct *p);
 #define TASK_WAKING             256
 #define TASK_PARKED             512
 #define TASK_NOLOAD             1024
-#define TASK_STATE_MAX          2048
+#define TASK_NEW                2048
+#define TASK_STATE_MAX          4096
 
-#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN"
+#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn"
 
 extern char ___assert_task_state[1 - 2*!!(
                 sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
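
[Editor's illustration] The hunk above keeps the build-time assertion satisfied: TASK_STATE_TO_CHAR_STR gains an 'n' for TASK_NEW while TASK_STATE_MAX doubles to 4096, so the string length still equals ilog2(TASK_STATE_MAX)+1. A quick userspace check of that arithmetic (a sketch using the constants from the hunk rather than the kernel headers):

#include <assert.h>
#include <string.h>

int main(void)
{
        const char *task_state_to_char_str = "RSDTtXZxKWPNn";
        unsigned int task_state_max = 4096;     /* new TASK_STATE_MAX */
        int bits = 0;                           /* ilog2(task_state_max) */

        while (task_state_max >> (bits + 1))
                bits++;

        /* one character per state, TASK_RUNNING included: ilog2(4096) + 1 == 13 */
        assert(strlen(task_state_to_char_str) == (size_t)(bits + 1));
        return 0;
}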
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3d856c46f6d8..14afa518948c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2342,11 +2342,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 
         __sched_fork(clone_flags, p);
         /*
-         * We mark the process as running here. This guarantees that
+         * We mark the process as NEW here. This guarantees that
          * nobody will actually run it, and a signal or other external
          * event cannot wake it up and insert it on the runqueue either.
          */
-        p->state = TASK_RUNNING;
+        p->state = TASK_NEW;
 
         /*
          * Make sure we do not leak PI boosting priority to the child.
@@ -2383,6 +2383,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
                 p->sched_class = &fair_sched_class;
         }
 
+        init_entity_runnable_average(&p->se);
+
         /*
          * The child is not yet in the pid-hash so no cgroup attach races,
          * and the cgroup is pinned to this child due to cgroup_fork()
@@ -2529,9 +2531,8 @@ void wake_up_new_task(struct task_struct *p)
         struct rq_flags rf;
         struct rq *rq;
 
-        /* Initialize new task's runnable average */
-        init_entity_runnable_average(&p->se);
         raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+        p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
         /*
          * Fork balancing, do it here and not earlier because:
@@ -8237,6 +8238,7 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
 {
         struct task_struct *task;
         struct cgroup_subsys_state *css;
+        int ret = 0;
 
         cgroup_taskset_for_each(task, css, tset) {
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -8247,8 +8249,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
                 if (task->sched_class != &fair_sched_class)
                         return -EINVAL;
 #endif
+                /*
+                 * Serialize against wake_up_new_task() such that if it's
+                 * running, we're sure to observe its full state.
+                 */
+                raw_spin_lock_irq(&task->pi_lock);
+                /*
+                 * Avoid calling sched_move_task() before wake_up_new_task()
+                 * has happened. This would lead to problems with PELT, due to
+                 * move wanting to detach+attach while we're not attached yet.
+                 */
+                if (task->state == TASK_NEW)
+                        ret = -EINVAL;
+                raw_spin_unlock_irq(&task->pi_lock);
+
+                if (ret)
+                        break;
         }
-        return 0;
+        return ret;
 }
 
 static void cpu_cgroup_attach(struct cgroup_taskset *tset)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 64f26bc436eb..0c21a12c0205 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -690,6 +690,10 @@ void init_entity_runnable_average(struct sched_entity *se)
 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
 }
 
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
+
 /*
  * With new tasks being created, their initial util_avgs are extrapolated
  * based on the cfs_rq's current util_avg:
@@ -720,6 +724,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
         struct sched_avg *sa = &se->avg;
         long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
+        u64 now = cfs_rq_clock_task(cfs_rq);
 
         if (cap > 0) {
                 if (cfs_rq->avg.util_avg != 0) {
@@ -733,16 +738,37 @@ void post_init_entity_util_avg(struct sched_entity *se)
                 }
                 sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
         }
+
+        if (entity_is_task(se)) {
+                struct task_struct *p = task_of(se);
+                if (p->sched_class != &fair_sched_class) {
+                        /*
+                         * For !fair tasks do:
+                         *
+                        update_cfs_rq_load_avg(now, cfs_rq, false);
+                        attach_entity_load_avg(cfs_rq, se);
+                        switched_from_fair(rq, p);
+                         *
+                         * such that the next switched_to_fair() has the
+                         * expected state.
+                         */
+                        se->avg.last_update_time = now;
+                        return;
+                }
+        }
+
+        update_cfs_rq_load_avg(now, cfs_rq, false);
+        attach_entity_load_avg(cfs_rq, se);
 }
 
-#else
+#else /* !CONFIG_SMP */
 void init_entity_runnable_average(struct sched_entity *se)
 {
 }
 void post_init_entity_util_avg(struct sched_entity *se)
 {
 }
-#endif
+#endif /* CONFIG_SMP */
 
 /*
  * Update the current task's runtime statistics.
@@ -2840,8 +2866,6 @@ void set_task_rq_fair(struct sched_entity *se,
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
-
 static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
 {
         struct rq *rq = rq_of(cfs_rq);
@@ -2951,6 +2975,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
         /*
          * If we got migrated (either between CPUs or between cgroups) we'll
          * have aged the average right before clearing @last_update_time.
+         *
+         * Or we're fresh through post_init_entity_util_avg().
          */
         if (se->avg.last_update_time) {
                 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
@@ -3056,11 +3082,14 @@ void remove_entity_load_avg(struct sched_entity *se)
         u64 last_update_time;
 
         /*
-         * Newly created task or never used group entity should not be removed
-         * from its (source) cfs_rq
+         * tasks cannot exit without having gone through wake_up_new_task() ->
+         * post_init_entity_util_avg() which will have added things to the
+         * cfs_rq, so we can remove unconditionally.
+         *
+         * Similarly for groups, they will have passed through
+         * post_init_entity_util_avg() before unregister_sched_fair_group()
+         * calls this.
          */
-        if (se->avg.last_update_time == 0)
-                return;
 
         last_update_time = cfs_rq_last_update_time(cfs_rq);
 
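
[Editor's illustration] Taken together, the core.c and fair.c changes enforce a simple ordering for new tasks. The sketch below models it in userspace C; the pi_lock serialization is reduced to a plain state check, and helpers such as can_attach_to_cgroup() and wake_up_new() are invented names for this illustration, not kernel APIs. A task forked in TASK_NEW cannot be moved between cgroups until wake_up_new_task() has attached its PELT state and set TASK_RUNNING, which is exactly the window cpu_cgroup_can_attach() now refuses.

#include <stdbool.h>
#include <stdio.h>

enum task_state { TASK_NEW, TASK_RUNNING };

struct task {
        enum task_state state;
        bool pelt_attached;     /* stands in for a populated se->avg */
};

/* sched_fork() analogue: the task is already visible but marked TASK_NEW */
static struct task fork_task(void)
{
        return (struct task){ .state = TASK_NEW, .pelt_attached = false };
}

/* cpu_cgroup_can_attach() analogue: refuse tasks that were never attached */
static int can_attach_to_cgroup(const struct task *p)
{
        return p->state == TASK_NEW ? -1 : 0;
}

/* wake_up_new_task() analogue: attach PELT state, then allow cgroup moves */
static void wake_up_new(struct task *p)
{
        p->pelt_attached = true;        /* post_init_entity_util_avg() */
        p->state = TASK_RUNNING;
}

int main(void)
{
        struct task p = fork_task();

        printf("move before wakeup: %s\n", can_attach_to_cgroup(&p) ?
               "refused (would detach/attach an unattached entity)" : "allowed");

        wake_up_new(&p);

        printf("move after wakeup:  %s\n", can_attach_to_cgroup(&p) ?
               "refused" : "allowed");
        return 0;
}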