author		Frederic Weisbecker <fweisbec@gmail.com>	2013-01-26 11:19:42 -0500
committer	Ingo Molnar <mingo@kernel.org>	2013-01-27 08:04:44 -0500
commit		62188451f0d63add7ad0cd2a1ae269d600c1663d
tree		7ed03f5c0a4a10cdde5f72bb0c81a29f1667ce1d /kernel/sched
parent		57d2aa00dcec67afa52478730f2b524521af14fb
cputime: Avoid multiplication overflow on utime scaling
We scale stime and utime values based on rtime (sum_exec_runtime converted to jiffies). During scaling we multiply rtime * utime, which seems fine since both values are converted to u64, but it is not.

Let's assume HZ is 1000, i.e. a 1 ms tick. A process consists of 64 threads, runs for 1 day, and its threads utilize 100% CPU in user space. The machine has 64 CPUs.

The process rtime = utime will be 64 * 24 * 60 * 60 * 1000 jiffies, which is 0x149970000. The multiplication rtime * utime yields 0x1a855771100000000, which cannot be represented in 64 bits.

The result of the overflow is that the utime values visible in user space (prev_utime in the kernel) stall, even though the application still consumes a lot of CPU time.

One way to solve this is to perform the multiplication on stime instead of utime. It's easy to grow the utime value quickly, for example with a CPU-bound thread in user space. We assume that doing so with stime is much harder: in most cases a task shouldn't spend much time in kernel space, as it tends to sleep while waiting for jobs to complete when they take long to achieve. IO is the typical example of that. Hence scaling the cputime by performing the multiplication on stime instead of utime should considerably reduce the chances of an overflow on most workloads.

This is largely inspired by a patch from Stanislaw Gruszka:
http://lkml.kernel.org/r/20130107113144.GA7544@redhat.com

Inspired-by: Stanislaw Gruszka <sgruszka@redhat.com>
Reported-by: Stanislaw Gruszka <sgruszka@redhat.com>
Acked-by: Stanislaw Gruszka <sgruszka@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/1359217182-25184-1-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
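For illustration, a minimal user-space sketch (hypothetical, not part of the patch; it relies on the GCC/Clang __uint128_t extension) reproducing the overflow with the exact figures from the changelog:

/*
 * Hypothetical user-space reproduction of the overflow described
 * above; __uint128_t (a GCC/Clang extension) shows the exact product.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* 64 threads * 24h * 60m * 60s * 1000 ticks/s = 0x149970000 */
	uint64_t rtime = 64ULL * 24 * 60 * 60 * 1000;
	uint64_t utime = rtime;	/* 100% of the time in user space */

	__uint128_t exact = (__uint128_t)rtime * utime;
	uint64_t wrapped = rtime * utime;	/* wraps modulo 2^64 */

	printf("rtime         = %#llx\n", (unsigned long long)rtime);
	printf("exact product = %#llx%016llx\n",	/* 0x1a855771100000000 */
	       (unsigned long long)(exact >> 64),
	       (unsigned long long)(uint64_t)exact);
	printf("u64 product   = %#018llx (overflowed)\n",
	       (unsigned long long)wrapped);
	return 0;
}

The wrapped u64 product is what the pre-patch scale_utime() would have divided by total, producing a bogus, smaller utime; combined with the monotonicity clamp below, that is why prev_utime stalls.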
Diffstat (limited to 'kernel/sched')
-rw-r--r--	kernel/sched/cputime.c	18
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 293b202fcf79..825a956ccdb6 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -509,11 +509,11 @@ EXPORT_SYMBOL_GPL(vtime_account);
 # define nsecs_to_cputime(__nsecs)	nsecs_to_jiffies(__nsecs)
 #endif
 
-static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
+static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total)
 {
 	u64 temp = (__force u64) rtime;
 
-	temp *= (__force u64) utime;
+	temp *= (__force u64) stime;
 
 	if (sizeof(cputime_t) == 4)
 		temp = div_u64(temp, (__force u32) total);
@@ -531,10 +531,10 @@ static void cputime_adjust(struct task_cputime *curr,
 		    struct cputime *prev,
 		    cputime_t *ut, cputime_t *st)
 {
-	cputime_t rtime, utime, total;
+	cputime_t rtime, stime, total;
 
-	utime = curr->utime;
-	total = utime + curr->stime;
+	stime = curr->stime;
+	total = stime + curr->utime;
 
 	/*
 	 * Tick based cputime accounting depend on random scheduling
@@ -549,17 +549,17 @@ static void cputime_adjust(struct task_cputime *curr,
 	rtime = nsecs_to_cputime(curr->sum_exec_runtime);
 
 	if (total)
-		utime = scale_utime(utime, rtime, total);
+		stime = scale_stime(stime, rtime, total);
 	else
-		utime = rtime;
+		stime = rtime;
 
 	/*
 	 * If the tick based count grows faster than the scheduler one,
 	 * the result of the scaling may go backward.
 	 * Let's enforce monotonicity.
 	 */
-	prev->utime = max(prev->utime, utime);
-	prev->stime = max(prev->stime, rtime - prev->utime);
+	prev->stime = max(prev->stime, stime);
+	prev->utime = max(prev->utime, rtime - prev->stime);
 
 	*ut = prev->utime;
 	*st = prev->stime;
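To see why the reordering helps, a hedged user-space sketch (hypothetical; u64 stands in for cputime_t and plain division for div_u64) mirroring the patched flow, with the same one-day, 64-thread workload but only ~1% of the ticks spent in kernel space. The intermediate product rtime * stime is then about 3e17, comfortably within 64 bits:

/*
 * Hypothetical user-space mirror of the patched logic: u64 stands in
 * for cputime_t, plain division for div_u64.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t scale_stime(uint64_t stime, uint64_t rtime, uint64_t total)
{
	uint64_t temp = rtime;

	temp *= stime;		/* ~3e17 here: no overflow */
	return temp / total;
}

int main(void)
{
	/* One day on 64 CPUs at HZ=1000, ~1% of ticks in kernel space */
	uint64_t utime = 64ULL * 24 * 60 * 60 * 990;
	uint64_t stime = 64ULL * 24 * 60 * 60 * 10;
	uint64_t total = utime + stime;
	uint64_t rtime = total;	/* scheduler clock agrees with ticks */

	stime = total ? scale_stime(stime, rtime, total) : rtime;

	/* The kernel then clamps for monotonicity; utime is the rest */
	printf("scaled stime = %llu jiffies, utime = %llu jiffies\n",
	       (unsigned long long)stime,
	       (unsigned long long)(rtime - stime));
	return 0;
}

Since utime is now derived as rtime - prev->stime, it keeps progressing for long-running CPU-bound tasks instead of stalling when the multiplication would have overflowed.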