author		Peter Zijlstra <peterz@infradead.org>	2015-06-30 05:30:54 -0400
committer	Ingo Molnar <mingo@kernel.org>	2015-08-03 06:21:21 -0400
commit		9d7fb04276481c59610983362d8e023d262b58ca (patch)
tree		2f6a6d497d0e088bd984876a34845c64afdffdbe /kernel/sched
parent		781b0203423c228b100aaaf169c77b2b556f8a49 (diff)
sched/cputime: Guarantee stime + utime == rtime
While the current code guarantees monotonicity for stime and utime
independently of one another, it does not guarantee that the sum of
both is equal to the total time we started out with. This confuses
things (and people) who look at this sum, like top, and will report
>100% usage followed by a matching period of 0%.

Rework the code to provide both individual monotonicity and a coherent
sum.

Suggested-by: Fredrik Markstrom <fredrik.markstrom@gmail.com>
Reported-by: Fredrik Markstrom <fredrik.markstrom@gmail.com>
Tested-by: Fredrik Markstrom <fredrik.markstrom@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: jason.low2@hp.com
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
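As an illustration of the adjustment described above, here is a minimal userspace model of the patched function. It is a sketch only, not the kernel code: it assumes plain uint64_t counters instead of cputime_t, drops the locking and the nsecs_to_cputime() conversion, relies on the GCC/Clang unsigned __int128 extension for the scaling step, and the names scale_stime_model(), cputime_adjust_model() and struct prev_times are invented for this example.

/*
 * Illustrative sketch only: models the stime/utime adjustment with plain
 * 64-bit counters, no locking, no cputime_t.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct prev_times {		/* stands in for struct prev_cputime */
	uint64_t stime;
	uint64_t utime;
};

/* Scale stime so the stime:utime ratio is kept while the sum becomes rtime. */
static uint64_t scale_stime_model(uint64_t stime, uint64_t rtime, uint64_t total)
{
	return (uint64_t)(((unsigned __int128)stime * rtime) / total);
}

static void cputime_adjust_model(uint64_t stime, uint64_t utime, uint64_t rtime,
				 struct prev_times *prev,
				 uint64_t *ut, uint64_t *st)
{
	/* Filter out non-growing rtime; the rest may assume it is monotonic. */
	if (prev->stime + prev->utime >= rtime)
		goto out;

	if (utime == 0) {
		stime = rtime;
		goto update;
	}
	if (stime == 0) {
		utime = rtime;
		goto update;
	}

	stime = scale_stime_model(stime, rtime, stime + utime);

	/* Keep stime monotonic; utime stays monotonic because rtime is. */
	if (stime < prev->stime)
		stime = prev->stime;
	utime = rtime - stime;

	/* Keep utime monotonic; stime stays monotonic by the same argument. */
	if (utime < prev->utime) {
		utime = prev->utime;
		stime = rtime - utime;
	}

update:
	prev->stime = stime;
	prev->utime = utime;
out:
	*ut = prev->utime;
	*st = prev->stime;
}

int main(void)
{
	struct prev_times prev = { 0, 0 };
	uint64_t ut, st;

	/* First sample: 600/400 tick units against rtime 1050. */
	cputime_adjust_model(600, 400, 1050, &prev, &ut, &st);
	printf("st=%llu ut=%llu sum=%llu\n",	/* st=630 ut=420 sum=1050 */
	       (unsigned long long)st, (unsigned long long)ut,
	       (unsigned long long)(st + ut));

	/* Second sample: raw scaling would shrink stime to 594; the clamp keeps 630. */
	cputime_adjust_model(600, 470, 1060, &prev, &ut, &st);
	printf("st=%llu ut=%llu sum=%llu\n",	/* st=630 ut=430 sum=1060 */
	       (unsigned long long)st, (unsigned long long)ut,
	       (unsigned long long)(st + ut));

	assert(st + ut == 1060);
	return 0;
}

The second call shows the clamp at work: the freshly scaled stime (594) would have gone backwards, so the previous value is kept and the difference is pushed into utime, which still grows, while the reported sum continues to match rtime.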
Diffstat (limited to 'kernel/sched')
-rw-r--r--	kernel/sched/cputime.c	101
1 file changed, 60 insertions(+), 41 deletions(-)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f5a64ffad176..8cbc3db671df 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -555,48 +555,43 @@ drop_precision:
 }
 
 /*
- * Atomically advance counter to the new value. Interrupts, vcpu
- * scheduling, and scaling inaccuracies can cause cputime_advance
- * to be occasionally called with a new value smaller than counter.
- * Let's enforce atomicity.
- *
- * Normally a caller will only go through this loop once, or not
- * at all in case a previous caller updated counter the same jiffy.
- */
-static void cputime_advance(cputime_t *counter, cputime_t new)
-{
-	cputime_t old;
-
-	while (new > (old = READ_ONCE(*counter)))
-		cmpxchg_cputime(counter, old, new);
-}
-
-/*
- * Adjust tick based cputime random precision against scheduler
- * runtime accounting.
+ * Adjust tick based cputime random precision against scheduler runtime
+ * accounting.
+ *
+ * Tick based cputime accounting depend on random scheduling timeslices of a
+ * task to be interrupted or not by the timer. Depending on these
+ * circumstances, the number of these interrupts may be over or
+ * under-optimistic, matching the real user and system cputime with a variable
+ * precision.
+ *
+ * Fix this by scaling these tick based values against the total runtime
+ * accounted by the CFS scheduler.
+ *
+ * This code provides the following guarantees:
+ *
+ *   stime + utime == rtime
+ *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
+ *
+ * Assuming that rtime_i+1 >= rtime_i.
  */
 static void cputime_adjust(struct task_cputime *curr,
-			   struct cputime *prev,
+			   struct prev_cputime *prev,
 			   cputime_t *ut, cputime_t *st)
 {
 	cputime_t rtime, stime, utime;
+	unsigned long flags;
 
-	/*
-	 * Tick based cputime accounting depend on random scheduling
-	 * timeslices of a task to be interrupted or not by the timer.
-	 * Depending on these circumstances, the number of these interrupts
-	 * may be over or under-optimistic, matching the real user and system
-	 * cputime with a variable precision.
-	 *
-	 * Fix this by scaling these tick based values against the total
-	 * runtime accounted by the CFS scheduler.
-	 */
+	/* Serialize concurrent callers such that we can honour our guarantees */
+	raw_spin_lock_irqsave(&prev->lock, flags);
 	rtime = nsecs_to_cputime(curr->sum_exec_runtime);
 
 	/*
-	 * Update userspace visible utime/stime values only if actual execution
-	 * time is bigger than already exported. Note that can happen, that we
-	 * provided bigger values due to scaling inaccuracy on big numbers.
+	 * This is possible under two circumstances:
+	 *  - rtime isn't monotonic after all (a bug);
+	 *  - we got reordered by the lock.
+	 *
+	 * In both cases this acts as a filter such that the rest of the code
+	 * can assume it is monotonic regardless of anything else.
 	 */
 	if (prev->stime + prev->utime >= rtime)
 		goto out;
@@ -606,22 +601,46 @@ static void cputime_adjust(struct task_cputime *curr,
 
 	if (utime == 0) {
 		stime = rtime;
-	} else if (stime == 0) {
-		utime = rtime;
-	} else {
-		cputime_t total = stime + utime;
+		goto update;
+	}
 
-		stime = scale_stime((__force u64)stime,
-				    (__force u64)rtime, (__force u64)total);
-		utime = rtime - stime;
+	if (stime == 0) {
+		utime = rtime;
+		goto update;
 	}
 
-	cputime_advance(&prev->stime, stime);
-	cputime_advance(&prev->utime, utime);
+	stime = scale_stime((__force u64)stime, (__force u64)rtime,
+			    (__force u64)(stime + utime));
+
+	/*
+	 * Make sure stime doesn't go backwards; this preserves monotonicity
+	 * for utime because rtime is monotonic.
+	 *
+	 *  utime_i+1 = rtime_i+1 - stime_i
+	 *            = rtime_i+1 - (rtime_i - utime_i)
+	 *            = (rtime_i+1 - rtime_i) + utime_i
+	 *            >= utime_i
+	 */
+	if (stime < prev->stime)
+		stime = prev->stime;
+	utime = rtime - stime;
+
+	/*
+	 * Make sure utime doesn't go backwards; this still preserves
+	 * monotonicity for stime, analogous argument to above.
+	 */
+	if (utime < prev->utime) {
+		utime = prev->utime;
+		stime = rtime - utime;
+	}
 
+update:
+	prev->stime = stime;
+	prev->utime = utime;
 out:
 	*ut = prev->utime;
 	*st = prev->stime;
+	raw_spin_unlock_irqrestore(&prev->lock, flags);
 }
 
 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
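The structural change visible in the diff: the old cputime_advance() helper enforced monotonicity per counter with a lockless cmpxchg loop, so stime and utime could each only move forward but their sum could drift away from rtime. The reworked code drops that helper, serializes callers on the new prev_cputime lock, and always derives one of the two values as rtime minus the other, which keeps the sum coherent while the two clamps preserve per-field monotonicity.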