author		Peter Zijlstra <peterz@infradead.org>	2016-06-16 04:50:40 -0400
committer	Ingo Molnar <mingo@kernel.org>		2016-06-20 05:29:09 -0400
commit		8974189222159154c55f24ddad33e3613960521a
tree		98bbcf7ab79eff29656f22b950f6bf5f75549240 /kernel/sched
parent		57675cb976eff977aefb428e68e4e0236d48a9ff
sched/fair: Fix cfs_rq avg tracking underflow
As per commit:

  b7fa30c9cc48 ("sched/fair: Fix post_init_entity_util_avg() serialization")

> the code generated from update_cfs_rq_load_avg():
>
>	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
>		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
>		sa->load_avg = max_t(long, sa->load_avg - r, 0);
>		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
>		removed_load = 1;
>	}
>
> turns into:
>
>	ffffffff81087064:	49 8b 85 98 00 00 00	mov    0x98(%r13),%rax
>	ffffffff8108706b:	48 85 c0		test   %rax,%rax
>	ffffffff8108706e:	74 40			je     ffffffff810870b0 <update_blocked_averages+0xc0>
>	ffffffff81087070:	4c 89 f8		mov    %r15,%rax
>	ffffffff81087073:	49 87 85 98 00 00 00	xchg   %rax,0x98(%r13)
>	ffffffff8108707a:	49 29 45 70		sub    %rax,0x70(%r13)
>	ffffffff8108707e:	4c 89 f9		mov    %r15,%rcx
>	ffffffff81087081:	bb 01 00 00 00		mov    $0x1,%ebx
>	ffffffff81087086:	49 83 7d 70 00		cmpq   $0x0,0x70(%r13)
>	ffffffff8108708b:	49 0f 49 4d 70		cmovns 0x70(%r13),%rcx
>
> Which you'll note ends up with sa->load_avg -= r in memory at
> ffffffff8108707a.

So I _should_ have looked at other unserialized users of ->load_avg,
but alas. Luckily nikbor reported a similar divide-by-zero (/0) from
task_h_load() which instantly triggered recollection of this here
problem.

Aside from the intermediate value hitting memory and causing problems,
there's another problem: the underflow detection relies on the signed
bit. This reduces the effective width of the variables; IOW, it's
effectively the same as having these variables be of signed type.

This patch changes to a different means of unsigned underflow detection
that does not rely on the signed bit. This allows the variables to use
the 'full' unsigned range. And it does so with an explicit LOAD - STORE
to ensure any intermediate value will never be visible in memory,
allowing these unserialized loads.

Note: GCC generates crap code for this, might warrant a look later.

Note2: I say 'full' above; if we end up at U*_MAX we'll still explode,
so maybe we should do clamping on add too.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yuyang Du <yuyang.du@intel.com>
Cc: bsegall@google.com
Cc: kernel@kyup.com
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: steve.muckle@linaro.org
Fixes: 9d89c257dfb9 ("sched/fair: Rewrite runnable load and utilization average tracking")
Link: http://lkml.kernel.org/r/20160617091948.GJ30927@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
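To make the signed-bit point concrete, here is a minimal userspace sketch; it is not part of the patch, the values and variable names are illustrative only, and it assumes a 64-bit unsigned long with GCC's usual two's-complement conversion. The max_t(long, ...) style clamp treats any value with the top bit set as an underflow, while the unsigned wrap check only clamps when the subtraction actually wrapped:

	#include <stdio.h>

	int main(void)
	{
		unsigned long var = 0x8000000000000001UL;	/* valid unsigned value, top bit set */
		unsigned long val = 1;

		/* Old style: clamp based on the sign bit, as max_t(long, var - val, 0) does. */
		long s = (long)(var - val);			/* 0x8000000000000000 is negative as long */
		unsigned long old_res = s > 0 ? (unsigned long)s : 0;

		/* New style: unsigned wrap check, as sub_positive() does. */
		unsigned long new_res = var - val;
		if (new_res > var)				/* only true if the subtraction wrapped */
			new_res = 0;

		/* old: 0 (falsely clamped), new: 9223372036854775808 (the real difference) */
		printf("old: %lu new: %lu\n", old_res, new_res);
		return 0;
	}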
Diffstat (limited to 'kernel/sched')
-rw-r--r--	kernel/sched/fair.c	33
1 file changed, 25 insertions(+), 8 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a2348deab7a3..2ae68f0e3bf5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2904,6 +2904,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
 	}
 }
 
+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do {				\
+	typeof(_ptr) ptr = (_ptr);				\
+	typeof(*ptr) val = (_val);				\
+	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
+	res = var - val;					\
+	if (res > var)						\
+		res = 0;					\
+	WRITE_ONCE(*ptr, res);					\
+} while (0)
+
 /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
 static inline int
 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
@@ -2913,15 +2930,15 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 
 	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
 		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
-		sa->load_avg = max_t(long, sa->load_avg - r, 0);
-		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+		sub_positive(&sa->load_avg, r);
+		sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
 		removed_load = 1;
 	}
 
 	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
 		long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
-		sa->util_avg = max_t(long, sa->util_avg - r, 0);
-		sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+		sub_positive(&sa->util_avg, r);
+		sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
 		removed_util = 1;
 	}
 
@@ -2994,10 +3011,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		&se->avg, se->on_rq * scale_load_down(se->load.weight),
 		cfs_rq->curr == se, NULL);
 
-	cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
-	cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
-	cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
-	cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+	sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
+	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
+	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
 
 	cfs_rq_util_change(cfs_rq);
 }
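For a quick look at the clamping behaviour of the new macro in isolation, the following standalone sketch copies sub_positive() from the patch into a userspace program. It is not part of the patch: READ_ONCE()/WRITE_ONCE() are replaced by simple volatile stand-ins and the values are made up; build with GCC, since typeof is a GNU extension.

	#include <stdio.h>

	/* Userspace stand-ins for the kernel's READ_ONCE()/WRITE_ONCE(); plain
	 * volatile accesses are good enough for this illustration. */
	#define READ_ONCE(x)		(*(volatile typeof(x) *)&(x))
	#define WRITE_ONCE(x, v)	(*(volatile typeof(x) *)&(x) = (v))

	/* Copied from the patch above. */
	#define sub_positive(_ptr, _val) do {				\
		typeof(_ptr) ptr = (_ptr);				\
		typeof(*ptr) val = (_val);				\
		typeof(*ptr) res, var = READ_ONCE(*ptr);		\
		res = var - val;					\
		if (res > var)						\
			res = 0;					\
		WRITE_ONCE(*ptr, res);					\
	} while (0)

	int main(void)
	{
		unsigned long load_avg = 100;

		sub_positive(&load_avg, 40UL);		/* 100 - 40 = 60 */
		printf("%lu\n", load_avg);

		sub_positive(&load_avg, 200UL);		/* would underflow, clamps to 0 */
		printf("%lu\n", load_avg);

		return 0;
	}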