author    Peter Zijlstra <peterz@infradead.org>    2013-11-18 12:27:06 -0500
committer Ingo Molnar <mingo@kernel.org>           2013-12-11 09:52:35 -0500
commit    9dbdb155532395ba000c5d5d187658b0e17e529f
tree      a01fb4066fbad520f364abc0bb7b68f45aab3100
parent    be5e610c0fd6ef772cafb9e0bd4128134804aef3
sched/fair: Rework sched_fair time accounting
Christian suffers from a bad BIOS that wrecks his i5's TSC sync. This
results in him occasionally seeing time going backwards - which crashes
the scheduler ...

Most of our time accounting can actually handle that, except the most
common one: the tick time update of sched_fair.

There is a further problem with that code; previously we assumed that
because we get a tick every TICK_NSEC our time delta could never exceed
32 bits, and the math was simpler.

However, ever since Frederic managed to get NO_HZ_FULL merged, this is no
longer the case, since a task can now run for a long time indeed without
getting a tick. It only takes about ~4.2 seconds to overflow our u32 in
nanoseconds.

This means we not only need to better deal with time going backwards, but
also need to be able to deal with large deltas.

This patch reworks the entire code and uses mul_u64_u32_shr() as proposed
by Andy a long while ago. We express our virtual time scale factor as a
u32 multiplier and shift right, and the 32bit mul_u64_u32_shr()
implementation reduces to a single 32x32->64 multiply if the time delta
is still short (the common case). For 64bit, a 64x64->128 multiply can be
used if ARCH_SUPPORTS_INT128.

Reported-and-Tested-by: Christian Engelmayer <cengelma@gmx.at>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: fweisbec@gmail.com
Cc: Paul Turner <pjt@google.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/20131118172706.GI3866@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
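(For the arithmetic above: 2^32 ns is roughly 4.29 s, which is where the "~4.2 seconds" figure comes from.) mul_u64_u32_shr() itself is provided by the parent commit listed above (be5e610c0fd6). As a rough sketch - reconstructed here purely for illustration, not part of this patch - its generic fallback for the !ARCH_SUPPORTS_INT128 case computes (a * mul) >> shift using only 32x32->64 multiplies:

/*
 * Illustrative sketch of the generic mul_u64_u32_shr() fallback
 * (cf. include/linux/math64.h); exact kernel formatting may differ.
 */
static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
{
        u32 ah, al;
        u64 ret;

        al = a;                 /* low 32 bits of the delta */
        ah = a >> 32;           /* high 32 bits; zero for deltas under ~4.29 s */

        ret = ((u64)al * mul) >> shift;
        if (ah)                 /* only needed for large NO_HZ_FULL deltas */
                ret += ((u64)ah * mul) << (32 - shift);

        return ret;
}

In the common case ah is zero, so the whole thing is the single 32x32->64 multiply mentioned in the changelog.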
Diffstat (limited to 'kernel')
-rw-r--r--   kernel/sched/fair.c   144
1 file changed, 64 insertions(+), 80 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fd773ade1a31..9030da7bcb15 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -178,59 +178,61 @@ void sched_init_granularity(void)
         update_sysctl();
 }
 
-#if BITS_PER_LONG == 32
-# define WMULT_CONST    (~0UL)
-#else
-# define WMULT_CONST    (1UL << 32)
-#endif
-
+#define WMULT_CONST     (~0U)
 #define WMULT_SHIFT     32
 
-/*
- * Shift right and round:
- */
-#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+static void __update_inv_weight(struct load_weight *lw)
+{
+        unsigned long w;
+
+        if (likely(lw->inv_weight))
+                return;
+
+        w = scale_load_down(lw->weight);
+
+        if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
+                lw->inv_weight = 1;
+        else if (unlikely(!w))
+                lw->inv_weight = WMULT_CONST;
+        else
+                lw->inv_weight = WMULT_CONST / w;
+}
 
 /*
- * delta *= weight / lw
+ * delta_exec * weight / lw.weight
+ *   OR
+ * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
+ *
+ * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
+ * we're guaranteed shift stays positive because inv_weight is guaranteed to
+ * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
+ *
+ * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
+ * weight/lw.weight <= 1, and therefore our shift will also be positive.
  */
-static unsigned long
-calc_delta_mine(unsigned long delta_exec, unsigned long weight,
-                struct load_weight *lw)
+static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
 {
-        u64 tmp;
+        u64 fact = scale_load_down(weight);
+        int shift = WMULT_SHIFT;
 
-        /*
-         * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
-         * entities since MIN_SHARES = 2. Treat weight as 1 if less than
-         * 2^SCHED_LOAD_RESOLUTION.
-         */
-        if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
-                tmp = (u64)delta_exec * scale_load_down(weight);
-        else
-                tmp = (u64)delta_exec;
+        __update_inv_weight(lw);
 
-        if (!lw->inv_weight) {
-                unsigned long w = scale_load_down(lw->weight);
-
-                if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
-                        lw->inv_weight = 1;
-                else if (unlikely(!w))
-                        lw->inv_weight = WMULT_CONST;
-                else
-                        lw->inv_weight = WMULT_CONST / w;
+        if (unlikely(fact >> 32)) {
+                while (fact >> 32) {
+                        fact >>= 1;
+                        shift--;
+                }
         }
 
-        /*
-         * Check whether we'd overflow the 64-bit multiplication:
-         */
-        if (unlikely(tmp > WMULT_CONST))
-                tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
-                        WMULT_SHIFT/2);
-        else
-                tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
+        /* hint to use a 32x32->64 mul */
+        fact = (u64)(u32)fact * lw->inv_weight;
 
-        return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
+        while (fact >> 32) {
+                fact >>= 1;
+                shift--;
+        }
+
+        return mul_u64_u32_shr(delta_exec, fact, shift);
 }
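As a worked example of the new fixed-point path (illustrative numbers, and assuming the extra load resolution is compiled out so that scale_load_down() is the identity): a sched_slice()-style call for a nice-0 entity on a runqueue holding three nice-0 tasks gives

        weight        = 1024                    (NICE_0_LOAD, scaled down)
        lw.weight     = 3072                    (3 * 1024)
        lw.inv_weight = WMULT_CONST / 3072      = 1398101, roughly 2^32 / 3072
        fact          = 1024 * 1398101          = 1431655424, which still fits
                                                  in 32 bits, so shift stays 32
        result        = mul_u64_u32_shr(delta_exec, 1431655424, 32)
                      ~= delta_exec / 3         (i.e. delta_exec * 1024 / 3072)

Because fact fits in 32 bits here, the common case is the single 32x32->64 multiply described in the changelog; the shift-reduction loops only kick in when an intermediate product would overflow 32 bits.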
@@ -443,7 +445,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 
 /**************************************************************
  * Scheduling class tree data structure manipulation methods:
@@ -612,11 +614,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 /*
  * delta /= w
  */
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
 {
         if (unlikely(se->load.weight != NICE_0_LOAD))
-                delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
+                delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
 
         return delta;
 }
@@ -665,7 +666,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
                         update_load_add(&lw, se->load.weight);
                         load = &lw;
                 }
-                slice = calc_delta_mine(slice, se->load.weight, load);
+                slice = __calc_delta(slice, se->load.weight, load);
         }
         return slice;
 }
@@ -703,47 +704,32 @@ void init_task_runnable_average(struct task_struct *p)
 #endif
 
 /*
- * Update the current task's runtime statistics. Skip current tasks that
- * are not in our scheduling class.
+ * Update the current task's runtime statistics.
  */
-static inline void
-__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
-              unsigned long delta_exec)
-{
-        unsigned long delta_exec_weighted;
-
-        schedstat_set(curr->statistics.exec_max,
-                        max((u64)delta_exec, curr->statistics.exec_max));
-
-        curr->sum_exec_runtime += delta_exec;
-        schedstat_add(cfs_rq, exec_clock, delta_exec);
-        delta_exec_weighted = calc_delta_fair(delta_exec, curr);
-
-        curr->vruntime += delta_exec_weighted;
-        update_min_vruntime(cfs_rq);
-}
-
 static void update_curr(struct cfs_rq *cfs_rq)
 {
         struct sched_entity *curr = cfs_rq->curr;
         u64 now = rq_clock_task(rq_of(cfs_rq));
-        unsigned long delta_exec;
+        u64 delta_exec;
 
         if (unlikely(!curr))
                 return;
 
-        /*
-         * Get the amount of time the current task was running
-         * since the last time we changed load (this cannot
-         * overflow on 32 bits):
-         */
-        delta_exec = (unsigned long)(now - curr->exec_start);
-        if (!delta_exec)
+        delta_exec = now - curr->exec_start;
+        if (unlikely((s64)delta_exec <= 0))
                 return;
 
-        __update_curr(cfs_rq, curr, delta_exec);
         curr->exec_start = now;
 
+        schedstat_set(curr->statistics.exec_max,
+                      max(delta_exec, curr->statistics.exec_max));
+
+        curr->sum_exec_runtime += delta_exec;
+        schedstat_add(cfs_rq, exec_clock, delta_exec);
+
+        curr->vruntime += calc_delta_fair(delta_exec, curr);
+        update_min_vruntime(cfs_rq);
+
         if (entity_is_task(curr)) {
                 struct task_struct *curtask = task_of(curr);
 
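This hunk is where the time-going-backwards case from the changelog is absorbed: delta_exec is now kept as a full u64 and validated through a signed cast rather than truncated to 32 bits. With hypothetical clock values:

        now              = 1000                 (ns, illustrative only)
        curr->exec_start = 1500
        delta_exec       = now - curr->exec_start
                         = 0xfffffffffffffe0c   (u64 wrap-around)
        (s64)delta_exec  = -500, which is <= 0, so update_curr() returns early
                           and the bogus delta never reaches sum_exec_runtime
                           or vruntime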
@@ -3015,8 +3001,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
         }
 }
 
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-                                     unsigned long delta_exec)
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
         /* dock delta_exec before expiring quota (as it could span periods) */
         cfs_rq->runtime_remaining -= delta_exec;
@@ -3034,7 +3019,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 }
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
         if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
                 return;
@@ -3574,8 +3559,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
         return rq_clock_task(rq_of(cfs_rq));
 }
 
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-                                   unsigned long delta_exec) {}
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}