author	Peter Zijlstra <peterz@infradead.org>	2015-06-30 05:30:54 -0400
committer	Ingo Molnar <mingo@kernel.org>	2015-08-03 06:21:21 -0400
commit	9d7fb04276481c59610983362d8e023d262b58ca (patch)
tree	2f6a6d497d0e088bd984876a34845c64afdffdbe /kernel/sched
parent	781b0203423c228b100aaaf169c77b2b556f8a49 (diff)
sched/cputime: Guarantee stime + utime == rtime
While the current code guarantees monotonicity for stime and utime
independently of one another, it does not guarantee that the sum of
both is equal to the total time we started out with.
This confuses tools (and people) that look at this sum, like top, which
will report >100% usage followed by a matching period of 0%.
Rework the code to provide both individual monotonicity and a coherent
sum.
Suggested-by: Fredrik Markstrom <fredrik.markstrom@gmail.com>
Reported-by: Fredrik Markstrom <fredrik.markstrom@gmail.com>
Tested-by: Fredrik Markstrom <fredrik.markstrom@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: jason.low2@hp.com
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel/sched')
-rw-r--r--	kernel/sched/cputime.c	101
1 file changed, 60 insertions(+), 41 deletions(-)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f5a64ffad176..8cbc3db671df 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -555,48 +555,43 @@ drop_precision:
 }
 
 /*
- * Atomically advance counter to the new value. Interrupts, vcpu
- * scheduling, and scaling inaccuracies can cause cputime_advance
- * to be occasionally called with a new value smaller than counter.
- * Let's enforce atomicity.
+ * Adjust tick based cputime random precision against scheduler runtime
+ * accounting.
  *
- * Normally a caller will only go through this loop once, or not
- * at all in case a previous caller updated counter the same jiffy.
- */
-static void cputime_advance(cputime_t *counter, cputime_t new)
-{
-	cputime_t old;
-
-	while (new > (old = READ_ONCE(*counter)))
-		cmpxchg_cputime(counter, old, new);
-}
-
-/*
- * Adjust tick based cputime random precision against scheduler
- * runtime accounting.
+ * Tick based cputime accounting depend on random scheduling timeslices of a
+ * task to be interrupted or not by the timer. Depending on these
+ * circumstances, the number of these interrupts may be over or
+ * under-optimistic, matching the real user and system cputime with a variable
+ * precision.
+ *
+ * Fix this by scaling these tick based values against the total runtime
+ * accounted by the CFS scheduler.
+ *
+ * This code provides the following guarantees:
+ *
+ *   stime + utime == rtime
+ *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
+ *
+ * Assuming that rtime_i+1 >= rtime_i.
  */
 static void cputime_adjust(struct task_cputime *curr,
-			   struct cputime *prev,
+			   struct prev_cputime *prev,
 			   cputime_t *ut, cputime_t *st)
 {
 	cputime_t rtime, stime, utime;
+	unsigned long flags;
 
-	/*
-	 * Tick based cputime accounting depend on random scheduling
-	 * timeslices of a task to be interrupted or not by the timer.
-	 * Depending on these circumstances, the number of these interrupts
-	 * may be over or under-optimistic, matching the real user and system
-	 * cputime with a variable precision.
-	 *
-	 * Fix this by scaling these tick based values against the total
-	 * runtime accounted by the CFS scheduler.
-	 */
+	/* Serialize concurrent callers such that we can honour our guarantees */
+	raw_spin_lock_irqsave(&prev->lock, flags);
 	rtime = nsecs_to_cputime(curr->sum_exec_runtime);
 
 	/*
-	 * Update userspace visible utime/stime values only if actual execution
-	 * time is bigger than already exported. Note that can happen, that we
-	 * provided bigger values due to scaling inaccuracy on big numbers.
+	 * This is possible under two circumstances:
+	 *  - rtime isn't monotonic after all (a bug);
+	 *  - we got reordered by the lock.
+	 *
+	 * In both cases this acts as a filter such that the rest of the code
+	 * can assume it is monotonic regardless of anything else.
 	 */
 	if (prev->stime + prev->utime >= rtime)
 		goto out;
@@ -606,22 +601,46 @@ static void cputime_adjust(struct task_cputime *curr,
 
 	if (utime == 0) {
 		stime = rtime;
-	} else if (stime == 0) {
-		utime = rtime;
-	} else {
-		cputime_t total = stime + utime;
+		goto update;
+	}
 
-		stime = scale_stime((__force u64)stime,
-				    (__force u64)rtime, (__force u64)total);
-		utime = rtime - stime;
+	if (stime == 0) {
+		utime = rtime;
+		goto update;
 	}
 
-	cputime_advance(&prev->stime, stime);
-	cputime_advance(&prev->utime, utime);
+	stime = scale_stime((__force u64)stime, (__force u64)rtime,
+			    (__force u64)(stime + utime));
+
+	/*
+	 * Make sure stime doesn't go backwards; this preserves monotonicity
+	 * for utime because rtime is monotonic.
+	 *
+	 *  utime_i+1 = rtime_i+1 - stime_i
+	 *            = rtime_i+1 - (rtime_i - utime_i)
+	 *            = (rtime_i+1 - rtime_i) + utime_i
+	 *            >= utime_i
+	 */
+	if (stime < prev->stime)
+		stime = prev->stime;
+	utime = rtime - stime;
+
+	/*
+	 * Make sure utime doesn't go backwards; this still preserves
+	 * monotonicity for stime, analogous argument to above.
+	 */
+	if (utime < prev->utime) {
+		utime = prev->utime;
+		stime = rtime - utime;
+	}
 
+update:
+	prev->stime = stime;
+	prev->utime = utime;
 out:
 	*ut = prev->utime;
 	*st = prev->stime;
+	raw_spin_unlock_irqrestore(&prev->lock, flags);
 }
 
 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
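For illustration only (not part of the commit): the sketch below mirrors the new adjustment logic in plain user-space C, using a straightforward multiply/divide in place of the kernel's overflow-safe scale_stime() and a bare struct instead of prev_cputime with its lock. The function name cputime_adjust_sketch and the sample values are made up. It demonstrates the guarantees stated in the new comment block: the reported sum always equals rtime, and neither stime nor utime ever goes backwards.

/*
 * Standalone sketch of the cputime_adjust() clamping logic.
 * Simplifications: plain (stime * rtime) / total scaling, no locking.
 */
#include <stdint.h>
#include <stdio.h>

struct prev_sample { uint64_t utime, stime; };

static void cputime_adjust_sketch(uint64_t utime, uint64_t stime, uint64_t rtime,
				  struct prev_sample *prev)
{
	/* Filter out a non-monotonic rtime so the rest can assume monotonicity. */
	if (prev->stime + prev->utime >= rtime)
		goto out;

	if (utime == 0) {
		stime = rtime;
		goto update;
	}
	if (stime == 0) {
		utime = rtime;
		goto update;
	}

	/* Scale the tick based stime against the scheduler runtime. */
	stime = stime * rtime / (stime + utime);

	/* Clamp so stime never regresses; utime then inherits monotonicity. */
	if (stime < prev->stime)
		stime = prev->stime;
	utime = rtime - stime;

	/* Same clamp for utime, analogous argument. */
	if (utime < prev->utime) {
		utime = prev->utime;
		stime = rtime - utime;
	}

update:
	prev->stime = stime;
	prev->utime = utime;
out:
	printf("stime=%llu utime=%llu sum=%llu (rtime=%llu)\n",
	       (unsigned long long)prev->stime, (unsigned long long)prev->utime,
	       (unsigned long long)(prev->stime + prev->utime),
	       (unsigned long long)rtime);
}

int main(void)
{
	struct prev_sample prev = { 0, 0 };

	/* Hypothetical tick samples: the sum tracks rtime exactly and
	 * the third call exercises the stime clamp. */
	cputime_adjust_sketch(40, 60, 110, &prev);
	cputime_adjust_sketch(45, 61, 115, &prev);
	cputime_adjust_sketch(80, 61, 150, &prev);
	return 0;
}

With these made-up inputs the third sample scales stime below its previous value, so the clamp holds stime at 66 and pushes the growth into utime, exactly the behaviour the patch's comment derives.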