diff options
author | Peter Zijlstra <peterz@infradead.org> | 2016-06-09 09:07:50 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2016-06-14 04:58:34 -0400 |
commit | b7fa30c9cc48c4f55663420472505d3b4f6e1705 (patch) | |
tree | 38f0c717a7a3044c2c1d285f06bd3d136cf746c4 | |
parent | db06d759d6cf903aeda8c107fd3abd366dd80200 (diff) |
sched/fair: Fix post_init_entity_util_avg() serialization
Chris Wilson reported a divide by 0 at:
post_init_entity_util_avg():
> 725 if (cfs_rq->avg.util_avg != 0) {
> 726 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
> -> 727 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
> 728
> 729 if (sa->util_avg > cap)
> 730 sa->util_avg = cap;
> 731 } else {
Which given the lack of serialization, and the code generated from
update_cfs_rq_load_avg() is entirely possible:
if (atomic_long_read(&cfs_rq->removed_load_avg)) {
s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
sa->load_avg = max_t(long, sa->load_avg - r, 0);
sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
removed_load = 1;
}
turns into:
ffffffff81087064: 49 8b 85 98 00 00 00 mov 0x98(%r13),%rax
ffffffff8108706b: 48 85 c0 test %rax,%rax
ffffffff8108706e: 74 40 je ffffffff810870b0
ffffffff81087070: 4c 89 f8 mov %r15,%rax
ffffffff81087073: 49 87 85 98 00 00 00 xchg %rax,0x98(%r13)
ffffffff8108707a: 49 29 45 70 sub %rax,0x70(%r13)
ffffffff8108707e: 4c 89 f9 mov %r15,%rcx
ffffffff81087081: bb 01 00 00 00 mov $0x1,%ebx
ffffffff81087086: 49 83 7d 70 00 cmpq $0x0,0x70(%r13)
ffffffff8108708b: 49 0f 49 4d 70 cmovns 0x70(%r13),%rcx
Which you'll note ends up with 'sa->load_avg - r' in memory at
ffffffff8108707a.
By calling post_init_entity_util_avg() under rq->lock we're sure to be
fully serialized against PELT updates and cannot observe intermediate
state like this.
Reported-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yuyang Du <yuyang.du@intel.com>
Cc: bsegall@google.com
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: steve.muckle@linaro.org
Fixes: 2b8c41daba32 ("sched/fair: Initiate a new task's util avg to a bounded value")
Link: http://lkml.kernel.org/r/20160609130750.GQ30909@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r-- | kernel/sched/core.c | 3 | ||||
-rw-r--r-- | kernel/sched/fair.c | 8 |
2 files changed, 8 insertions, 3 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 017d5394f5dc..13d0896aff87 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -2535,10 +2535,9 @@ void wake_up_new_task(struct task_struct *p) | |||
2535 | */ | 2535 | */ |
2536 | set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); | 2536 | set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); |
2537 | #endif | 2537 | #endif |
2538 | /* Post initialize new task's util average when its cfs_rq is set */ | 2538 | rq = __task_rq_lock(p, &rf); |
2539 | post_init_entity_util_avg(&p->se); | 2539 | post_init_entity_util_avg(&p->se); |
2540 | 2540 | ||
2541 | rq = __task_rq_lock(p, &rf); | ||
2542 | activate_task(rq, p, 0); | 2541 | activate_task(rq, p, 0); |
2543 | p->on_rq = TASK_ON_RQ_QUEUED; | 2542 | p->on_rq = TASK_ON_RQ_QUEUED; |
2544 | trace_sched_wakeup_new(p); | 2543 | trace_sched_wakeup_new(p); |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 218f8e83db73..4e33ad12bb68 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -8496,8 +8496,9 @@ void free_fair_sched_group(struct task_group *tg) | |||
8496 | 8496 | ||
8497 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | 8497 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) |
8498 | { | 8498 | { |
8499 | struct cfs_rq *cfs_rq; | ||
8500 | struct sched_entity *se; | 8499 | struct sched_entity *se; |
8500 | struct cfs_rq *cfs_rq; | ||
8501 | struct rq *rq; | ||
8501 | int i; | 8502 | int i; |
8502 | 8503 | ||
8503 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); | 8504 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); |
@@ -8512,6 +8513,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8512 | init_cfs_bandwidth(tg_cfs_bandwidth(tg)); | 8513 | init_cfs_bandwidth(tg_cfs_bandwidth(tg)); |
8513 | 8514 | ||
8514 | for_each_possible_cpu(i) { | 8515 | for_each_possible_cpu(i) { |
8516 | rq = cpu_rq(i); | ||
8517 | |||
8515 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), | 8518 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), |
8516 | GFP_KERNEL, cpu_to_node(i)); | 8519 | GFP_KERNEL, cpu_to_node(i)); |
8517 | if (!cfs_rq) | 8520 | if (!cfs_rq) |
@@ -8525,7 +8528,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8525 | init_cfs_rq(cfs_rq); | 8528 | init_cfs_rq(cfs_rq); |
8526 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); | 8529 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); |
8527 | init_entity_runnable_average(se); | 8530 | init_entity_runnable_average(se); |
8531 | |||
8532 | raw_spin_lock_irq(&rq->lock); | ||
8528 | post_init_entity_util_avg(se); | 8533 | post_init_entity_util_avg(se); |
8534 | raw_spin_unlock_irq(&rq->lock); | ||
8529 | } | 8535 | } |
8530 | 8536 | ||
8531 | return 1; | 8537 | return 1; |