| author | Peter Zijlstra <peterz@infradead.org> | 2013-11-18 12:27:06 -0500 |
| --- | --- | --- |
| committer | Ingo Molnar <mingo@kernel.org> | 2013-12-11 09:52:35 -0500 |
| commit | 9dbdb155532395ba000c5d5d187658b0e17e529f (patch) | |
| tree | a01fb4066fbad520f364abc0bb7b68f45aab3100 /kernel | |
| parent | be5e610c0fd6ef772cafb9e0bd4128134804aef3 (diff) | |
sched/fair: Rework sched_fair time accounting
Christian suffers from a bad BIOS that wrecks his i5's TSC sync. This
results in him occasionally seeing time going backwards - which
crashes the scheduler ...
Most of our time accounting can actually handle that, except for the most
common one: the tick time update of sched_fair.
There is a further problem with that code: previously we assumed that,
because we get a tick every TICK_NSEC, our time delta could never exceed
32 bits, which kept the math simple.
However, ever since Frederic managed to get NO_HZ_FULL merged, this is no
longer the case: a task can now run for a long time indeed without getting
a tick, and it only takes ~4.3 seconds to overflow a u32 counting
nanoseconds.
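(As a sanity check of that bound, not part of the patch: 2^32 ns is about
4.29 s. A trivial user-space calculation confirms it:)

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            /* Largest nanosecond delta a u32 can hold before wrapping: 2^32 ns. */
            uint64_t limit_ns = 1ULL << 32;

            printf("u32 nanoseconds overflow after %.3f s\n", limit_ns / 1e9); /* ~4.295 */
            return 0;
    }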
This means we not only need to deal better with time going backwards, but
also need to be able to handle large deltas.
This patch reworks the entire code and uses mul_u64_u32_shr() as
proposed by Andy a long while ago.
We express our virtual time scale factor as a u32 multiplier and shift
pair; the 32-bit mul_u64_u32_shr() implementation then reduces to a single
32x32->64 multiply when the time delta is still short (the common case).
On 64-bit, a 64x64->128 multiply can be used instead when
ARCH_SUPPORTS_INT128 is available.
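(For context: mul_u64_u32_shr() itself comes from the parent commit. The
sketch below only illustrates the idea behind such a helper, assuming
shift <= 32, and is not claimed to be the exact kernel code; with
ARCH_SUPPORTS_INT128 the whole helper can instead be a one-line
(unsigned __int128) multiply and shift.)

    #include <stdint.h>

    typedef uint64_t u64;   /* kernel-style aliases so the sketch stands alone */
    typedef uint32_t u32;

    /* Roughly (a * mul) >> shift without a 64x64->128 multiply: split 'a' into
     * 32-bit halves, so each product is a 32x32->64 multiply. Assumes shift <= 32.
     */
    static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
    {
            u32 ah = a >> 32, al = a;
            u64 ret;

            ret = ((u64)al * mul) >> shift;                 /* low half */
            if (ah)
                    ret += ((u64)ah * mul) << (32 - shift); /* high half, shifted back */

            return ret;
    }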
Reported-and-Tested-by: Christian Engelmayer <cengelma@gmx.at>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: fweisbec@gmail.com
Cc: Paul Turner <pjt@google.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/20131118172706.GI3866@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/sched/fair.c | 144 |
1 file changed, 64 insertions(+), 80 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fd773ade1a31..9030da7bcb15 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -178,59 +178,61 @@ void sched_init_granularity(void)
         update_sysctl();
 }
 
-#if BITS_PER_LONG == 32
-# define WMULT_CONST (~0UL)
-#else
-# define WMULT_CONST (1UL << 32)
-#endif
-
+#define WMULT_CONST (~0U)
 #define WMULT_SHIFT 32
 
-/*
- * Shift right and round:
- */
-#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+static void __update_inv_weight(struct load_weight *lw)
+{
+        unsigned long w;
+
+        if (likely(lw->inv_weight))
+                return;
+
+        w = scale_load_down(lw->weight);
+
+        if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
+                lw->inv_weight = 1;
+        else if (unlikely(!w))
+                lw->inv_weight = WMULT_CONST;
+        else
+                lw->inv_weight = WMULT_CONST / w;
+}
 
 /*
- * delta *= weight / lw
+ * delta_exec * weight / lw.weight
+ *   OR
+ * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
+ *
+ * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
+ * we're guaranteed shift stays positive because inv_weight is guaranteed to
+ * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
+ *
+ * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
+ * weight/lw.weight <= 1, and therefore our shift will also be positive.
  */
-static unsigned long
-calc_delta_mine(unsigned long delta_exec, unsigned long weight,
-                struct load_weight *lw)
+static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
 {
-        u64 tmp;
+        u64 fact = scale_load_down(weight);
+        int shift = WMULT_SHIFT;
 
-        /*
-         * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
-         * entities since MIN_SHARES = 2. Treat weight as 1 if less than
-         * 2^SCHED_LOAD_RESOLUTION.
-         */
-        if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
-                tmp = (u64)delta_exec * scale_load_down(weight);
-        else
-                tmp = (u64)delta_exec;
+        __update_inv_weight(lw);
 
-        if (!lw->inv_weight) {
-                unsigned long w = scale_load_down(lw->weight);
-
-                if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
-                        lw->inv_weight = 1;
-                else if (unlikely(!w))
-                        lw->inv_weight = WMULT_CONST;
-                else
-                        lw->inv_weight = WMULT_CONST / w;
+        if (unlikely(fact >> 32)) {
+                while (fact >> 32) {
+                        fact >>= 1;
+                        shift--;
+                }
         }
 
-        /*
-         * Check whether we'd overflow the 64-bit multiplication:
-         */
-        if (unlikely(tmp > WMULT_CONST))
-                tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
-                        WMULT_SHIFT/2);
-        else
-                tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
+        /* hint to use a 32x32->64 mul */
+        fact = (u64)(u32)fact * lw->inv_weight;
+
+        while (fact >> 32) {
+                fact >>= 1;
+                shift--;
+        }
 
-        return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
+        return mul_u64_u32_shr(delta_exec, fact, shift);
 }
 
 
@@ -443,7 +445,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 
 /**************************************************************
  * Scheduling class tree data structure manipulation methods:
@@ -612,11 +614,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 /*
  * delta /= w
  */
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
 {
         if (unlikely(se->load.weight != NICE_0_LOAD))
-                delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
+                delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
 
         return delta;
 }
@@ -665,7 +666,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
                         update_load_add(&lw, se->load.weight);
                         load = &lw;
                 }
-                slice = calc_delta_mine(slice, se->load.weight, load);
+                slice = __calc_delta(slice, se->load.weight, load);
         }
         return slice;
 }
@@ -703,47 +704,32 @@ void init_task_runnable_average(struct task_struct *p)
 #endif
 
 /*
- * Update the current task's runtime statistics. Skip current tasks that
- * are not in our scheduling class.
+ * Update the current task's runtime statistics.
  */
-static inline void
-__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
-              unsigned long delta_exec)
-{
-        unsigned long delta_exec_weighted;
-
-        schedstat_set(curr->statistics.exec_max,
-                      max((u64)delta_exec, curr->statistics.exec_max));
-
-        curr->sum_exec_runtime += delta_exec;
-        schedstat_add(cfs_rq, exec_clock, delta_exec);
-        delta_exec_weighted = calc_delta_fair(delta_exec, curr);
-
-        curr->vruntime += delta_exec_weighted;
-        update_min_vruntime(cfs_rq);
-}
-
 static void update_curr(struct cfs_rq *cfs_rq)
 {
         struct sched_entity *curr = cfs_rq->curr;
         u64 now = rq_clock_task(rq_of(cfs_rq));
-        unsigned long delta_exec;
+        u64 delta_exec;
 
         if (unlikely(!curr))
                 return;
 
-        /*
-         * Get the amount of time the current task was running
-         * since the last time we changed load (this cannot
-         * overflow on 32 bits):
-         */
-        delta_exec = (unsigned long)(now - curr->exec_start);
-        if (!delta_exec)
+        delta_exec = now - curr->exec_start;
+        if (unlikely((s64)delta_exec <= 0))
                 return;
 
-        __update_curr(cfs_rq, curr, delta_exec);
         curr->exec_start = now;
 
+        schedstat_set(curr->statistics.exec_max,
+                      max(delta_exec, curr->statistics.exec_max));
+
+        curr->sum_exec_runtime += delta_exec;
+        schedstat_add(cfs_rq, exec_clock, delta_exec);
+
+        curr->vruntime += calc_delta_fair(delta_exec, curr);
+        update_min_vruntime(cfs_rq);
+
         if (entity_is_task(curr)) {
                 struct task_struct *curtask = task_of(curr);
 
@@ -3015,8 +3001,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
         }
 }
 
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-                                     unsigned long delta_exec)
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
         /* dock delta_exec before expiring quota (as it could span periods) */
         cfs_rq->runtime_remaining -= delta_exec;
@@ -3034,7 +3019,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 }
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
         if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
                 return;
@@ -3574,8 +3559,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
         return rq_clock_task(rq_of(cfs_rq));
 }
 
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-                                     unsigned long delta_exec) {}
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
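For readers who want to poke at the new arithmetic outside the kernel, here is
a stand-alone sketch (all names and values below are illustrative, not taken
from the patch) of the two behaviours update_curr() and __calc_delta() now
implement: a signed check that simply discards a clock delta that went
backwards, and the multiplier/shift weighting driven by mul_u64_u32_shr():

    /* Stand-alone illustration (not kernel code). calc_delta() and the weights
     * used in main() are made up for this example.
     */
    #include <stdio.h>
    #include <stdint.h>

    typedef uint64_t u64;
    typedef uint32_t u32;

    #define WMULT_CONST (~0U)
    #define WMULT_SHIFT 32
    #define NICE_0_LOAD 1024UL      /* stand-in for the kernel's nice-0 weight */

    /* Stand-in for the kernel helper; gcc/clang only (__int128). */
    static u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
    {
            return (u64)(((unsigned __int128)a * mul) >> shift);
    }

    /* delta_exec * weight / lw_weight, as (delta_exec * (weight * inv)) >> shift. */
    static u64 calc_delta(u64 delta_exec, unsigned long weight, unsigned long lw_weight)
    {
            u64 fact = weight;
            u32 inv_weight = lw_weight ? WMULT_CONST / lw_weight : WMULT_CONST;
            int shift = WMULT_SHIFT;

            while (fact >> 32) {            /* keep the multiplier within 32 bits */
                    fact >>= 1;
                    shift--;
            }

            fact = (u64)(u32)fact * inv_weight;

            while (fact >> 32) {
                    fact >>= 1;
                    shift--;
            }

            return mul_u64_u32_shr(delta_exec, fact, shift);
    }

    int main(void)
    {
            u64 exec_start = 2000, now = 1000;      /* pretend the clock went backwards */
            u64 delta_exec = now - exec_start;

            if ((int64_t)delta_exec <= 0)
                    printf("backwards/zero delta: ignored\n");

            /* A nice-0 entity on a runqueue of total weight 3072: ~10 s of runtime
             * is credited as roughly a third of that in vruntime.
             */
            printf("weighted delta: %llu ns\n",
                   (unsigned long long)calc_delta(10000000000ULL, NICE_0_LOAD, 3072));
            return 0;
    }

Built with gcc or clang (the __int128 stand-in is compiler-specific), the
negative delta is ignored rather than wrapping into a huge unsigned value, and
the nice-0 entity on a runqueue of weight 3072 is credited roughly a third of
its wall-clock runtime as vruntime, which is the effect of the multiplier and
shift pair described in the commit message.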