author    Frederic Weisbecker <fweisbec@gmail.com>  2013-02-20 12:54:55 -0500
committer Frederic Weisbecker <fweisbec@gmail.com>  2013-03-13 13:18:14 -0400
commit    d9a3c9823a2e6a543eb7807fb3d15d8233817ec5 (patch)
tree      067dbf815802efc7f36ac0d82a5027b8bfd97ed6 /kernel/sched
parent    f792685006274a850e6cc0ea9ade275ccdfc90bc (diff)
sched: Lower chances of cputime scaling overflow
Some users have reported that after running a process with hundreds of
threads on intensive CPU-bound loads, the cputime of the group started
to freeze after a few days.

This is due to how we scale the tick-based cputime against the
scheduler's precise execution time value: we add up the values of all
threads in the group and multiply that sum by the sum of the scheduler
exec runtime of the whole group. This easily overflows after a few
days/weeks of execution.

A proposed solution was to compute that multiplication on stime instead
of utime:

    62188451f0d63add7ad0cd2a1ae269d600c1663d
    ("cputime: Avoid multiplication overflow on utime scaling")

The rationale was that it's easy for a thread to spend most of its time
in userspace under an intensive CPU-bound workload, but much harder to
sustain a long CPU-bound run in the kernel.

That postulate was defeated when a user recently reported he was still
seeing cputime freezes after the above patch. The workload that
triggers this issue involves intensive networking, where most of the
cputime is consumed in the kernel.

To further reduce the opportunities for multiplication overflow, let's
reduce the multiplication factors to the quotient and remainder of the
division between the sched exec runtime and the cputime. Assuming the
difference between these shouldn't ever be that large, this should work
in many situations.

This gives the same results as the upstream scaling code except for a
small difference: the upstream code always rounds the result down to
the nearest integer not greater than the precise value, while the new
code may round to the nearest integer on either side. In practice this
difference probably shouldn't matter, but it's worth mentioning.

If this solution turns out not to be enough in the end, we'll need to
partly revert to the behaviour prior to commit

    0cf55e1ec08bb5a22e068309e2d8ba1180ab4239
    ("sched, cputime: Introduce thread_group_times()")

Back then, the scaling was done at exit() time, before adding the
cputime of an exiting thread to the signal struct, and we would then
need to scale the live threads' cputime one by one in
thread_group_cputime(). The drawback may be slightly slower code at
exit time.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
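For illustration, here is a minimal userspace sketch (not part of the patch;
the helper names, the sample values and the use of the gcc/clang __int128
extension are assumptions made for the demo) comparing the old
multiply-then-divide scaling with the quotient/remainder scaling on group
times large enough to overflow 64 bits:

/*
 * Minimal userspace sketch (not kernel code): compare the old
 * multiply-then-divide scaling with the quotient/remainder scaling
 * introduced by this patch.  The numbers below are hypothetical group
 * times in jiffies (HZ=1000 assumed); __int128 (gcc/clang extension)
 * provides the exact reference value.
 */
#include <stdio.h>
#include <stdint.h>

/* Old approach: the 64-bit product stime * rtime can wrap around. */
static uint64_t scale_stime_old(uint64_t stime, uint64_t rtime, uint64_t total)
{
	return stime * rtime / total;
}

/* New approach: keep the factors small by splitting rtime/total
 * (or total/rtime) into quotient and remainder, as scale_stime() does.
 */
static uint64_t scale_stime_new(uint64_t stime, uint64_t rtime, uint64_t total)
{
	uint64_t res, rem, scaled;

	if (rtime >= total) {
		res = rtime / total;
		rem = rtime % total;
		scaled = stime * res + stime * rem / total;
	} else {
		res = total / rtime;
		rem = total % rtime;
		scaled = stime / res;
		scaled -= scaled * rem / total;
	}
	return scaled;
}

int main(void)
{
	/* ~58 CPU-days of group runtime: a few days of wall time for a
	 * group with hundreds of busy threads.
	 */
	uint64_t utime = 1000000000ULL;		/* tick-based user time   */
	uint64_t stime = 4000000000ULL;		/* tick-based system time */
	uint64_t total = utime + stime;
	uint64_t rtime = 5100000000ULL;		/* scheduler exec runtime */

	unsigned __int128 exact = (unsigned __int128)stime * rtime / total;

	printf("exact: %llu\n", (unsigned long long)exact);
	printf("old  : %llu (product wrapped past 2^64)\n",
	       (unsigned long long)scale_stime_old(stime, rtime, total));
	printf("new  : %llu (largest intermediate is stime * rem)\n",
	       (unsigned long long)scale_stime_new(stime, rtime, total));
	return 0;
}

With these inputs stime * rtime is about 2.0e19, just past 2^64, so the old
code wraps and reports a far too small stime (~3.9e8 instead of 4.08e9),
while the new path never multiplies anything larger than stime * rem (~4e17).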
Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/cputime.c | 46
1 file changed, 34 insertions(+), 12 deletions(-)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 024fe1998ad5..699d59756ece 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -521,18 +521,36 @@ void account_idle_ticks(unsigned long ticks)
 	account_idle_time(jiffies_to_cputime(ticks));
 }
 
-static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total)
+/*
+ * Perform (stime * rtime) / total with reduced chances
+ * of multiplication overflows by using smaller factors
+ * like quotient and remainders of divisions between
+ * rtime and total.
+ */
+static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
 {
-	u64 temp = (__force u64) rtime;
+	u64 rem, res, scaled;
 
-	temp *= (__force u64) stime;
-
-	if (sizeof(cputime_t) == 4)
-		temp = div_u64(temp, (__force u32) total);
-	else
-		temp = div64_u64(temp, (__force u64) total);
+	if (rtime >= total) {
+		/*
+		 * Scale up to rtime / total then add
+		 * the remainder scaled to stime / total.
+		 */
+		res = div64_u64_rem(rtime, total, &rem);
+		scaled = stime * res;
+		scaled += div64_u64(stime * rem, total);
+	} else {
+		/*
+		 * Same in reverse: scale down to total / rtime
+		 * then substract that result scaled to
+		 * to the remaining part.
+		 */
+		res = div64_u64_rem(total, rtime, &rem);
+		scaled = div64_u64(stime, res);
+		scaled -= div64_u64(scaled * rem, total);
+	}
 
-	return (__force cputime_t) temp;
+	return (__force cputime_t) scaled;
 }
 
 /*
@@ -566,10 +584,14 @@ static void cputime_adjust(struct task_cputime *curr,
 	 */
 	rtime = nsecs_to_cputime(curr->sum_exec_runtime);
 
-	if (total)
-		stime = scale_stime(stime, rtime, total);
-	else
+	if (!rtime) {
+		stime = 0;
+	} else if (!total) {
 		stime = rtime;
+	} else {
+		stime = scale_stime((__force u64)stime,
+				    (__force u64)rtime, (__force u64)total);
+	}
 
 	/*
 	 * If the tick based count grows faster than the scheduler one,
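
To make the rounding remark in the changelog concrete, here is a hypothetical
small-number check (plain userspace C, not part of the patch): the upstream
code always returns the floor of the exact result, while the quotient/remainder
path can land on either side of it.

#include <stdio.h>
#include <stdint.h>

/*
 * Illustrative only: with stime = 3, rtime = 7, total = 10 the exact
 * result is 2.1.  The upstream code floors it to 2; the rtime < total
 * branch of the new code returns 3, i.e. it may round past the exact
 * value, as noted in the changelog.
 */
int main(void)
{
	uint64_t stime = 3, rtime = 7, total = 10;

	uint64_t old = stime * rtime / total;		/* floor(2.1) = 2 */

	uint64_t res = total / rtime;			/* 1 */
	uint64_t rem = total % rtime;			/* 3 */
	uint64_t scaled = stime / res;			/* 3 */
	scaled -= scaled * rem / total;			/* 3 - 0 = 3 */

	printf("exact = 2.1, upstream = %llu, patch = %llu\n",
	       (unsigned long long)old, (unsigned long long)scaled);
	return 0;
}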