aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>2009-12-02 03:28:07 -0500
committerIngo Molnar <mingo@elte.hu>2009-12-02 11:32:40 -0500
commit0cf55e1ec08bb5a22e068309e2d8ba1180ab4239 (patch)
tree6102662a9594d51155bee11666fe8517fcbe6039
parentd99ca3b977fc5a93141304f571475c2af9e6c1c5 (diff)
sched, cputime: Introduce thread_group_times()
This is a real fix for problem of utime/stime values decreasing described in the thread: http://lkml.org/lkml/2009/11/3/522 Now cputime is accounted in the following way: - {u,s}time in task_struct are increased every time when the thread is interrupted by a tick (timer interrupt). - When a thread exits, its {u,s}time are added to signal->{u,s}time, after adjusted by task_times(). - When all threads in a thread_group exits, accumulated {u,s}time (and also c{u,s}time) in signal struct are added to c{u,s}time in signal struct of the group's parent. So {u,s}time in task struct are "raw" tick count, while {u,s}time and c{u,s}time in signal struct are "adjusted" values. And accounted values are used by: - task_times(), to get cputime of a thread: This function returns adjusted values that originates from raw {u,s}time and scaled by sum_exec_runtime that accounted by CFS. - thread_group_cputime(), to get cputime of a thread group: This function returns sum of all {u,s}time of living threads in the group, plus {u,s}time in the signal struct that is sum of adjusted cputimes of all exited threads belonged to the group. The problem is the return value of thread_group_cputime(), because it is mixed sum of "raw" value and "adjusted" value: group's {u,s}time = foreach(thread){{u,s}time} + exited({u,s}time) This misbehavior can break {u,s}time monotonicity. Assume that if there is a thread that have raw values greater than adjusted values (e.g. interrupted by 1000Hz ticks 50 times but only runs 45ms) and if it exits, cputime will decrease (e.g. -5ms). To fix this, we could do: group's {u,s}time = foreach(t){task_times(t)} + exited({u,s}time) But task_times() contains hard divisions, so applying it for every thread should be avoided. This patch fixes the above problem in the following way: - Modify thread's exit (= __exit_signal()) not to use task_times(). It means {u,s}time in signal struct accumulates raw values instead of adjusted values. As the result it makes thread_group_cputime() to return pure sum of "raw" values. - Introduce a new function thread_group_times(*task, *utime, *stime) that converts "raw" values of thread_group_cputime() to "adjusted" values, in same calculation procedure as task_times(). - Modify group's exit (= wait_task_zombie()) to use this introduced thread_group_times(). It make c{u,s}time in signal struct to have adjusted values like before this patch. - Replace some thread_group_cputime() by thread_group_times(). This replacements are only applied where conveys the "adjusted" cputime to users, and where already uses task_times() near by it. (i.e. sys_times(), getrusage(), and /proc/<PID>/stat.) This patch have a positive side effect: - Before this patch, if a group contains many short-life threads (e.g. runs 0.9ms and not interrupted by ticks), the group's cputime could be invisible since thread's cputime was accumulated after adjusted: imagine adjustment function as adj(ticks, runtime), {adj(0, 0.9) + adj(0, 0.9) + ....} = {0 + 0 + ....} = 0. After this patch it will not happen because the adjustment is applied after accumulated. v2: - remove if()s, put new variables into signal_struct. Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> Acked-by: Peter Zijlstra <peterz@infradead.org> Cc: Spencer Candland <spencer@bluehost.com> Cc: Americo Wang <xiyou.wangcong@gmail.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Stanislaw Gruszka <sgruszka@redhat.com> LKML-Reference: <4B162517.8040909@jp.fujitsu.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--fs/proc/array.c5
-rw-r--r--include/linux/sched.h4
-rw-r--r--kernel/exit.c23
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/sched.c41
-rw-r--r--kernel/sys.c18
6 files changed, 69 insertions, 25 deletions
diff --git a/fs/proc/array.c b/fs/proc/array.c
index ca61a88aed66..2571da43c736 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -506,7 +506,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
506 506
507 /* add up live thread stats at the group level */ 507 /* add up live thread stats at the group level */
508 if (whole) { 508 if (whole) {
509 struct task_cputime cputime;
510 struct task_struct *t = task; 509 struct task_struct *t = task;
511 do { 510 do {
512 min_flt += t->min_flt; 511 min_flt += t->min_flt;
@@ -517,9 +516,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
517 516
518 min_flt += sig->min_flt; 517 min_flt += sig->min_flt;
519 maj_flt += sig->maj_flt; 518 maj_flt += sig->maj_flt;
520 thread_group_cputime(task, &cputime); 519 thread_group_times(task, &utime, &stime);
521 utime = cputime.utime;
522 stime = cputime.stime;
523 gtime = cputime_add(gtime, sig->gtime); 520 gtime = cputime_add(gtime, sig->gtime);
524 } 521 }
525 522
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dff85e58264e..34238bd10ebf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -624,6 +624,9 @@ struct signal_struct {
624 cputime_t utime, stime, cutime, cstime; 624 cputime_t utime, stime, cutime, cstime;
625 cputime_t gtime; 625 cputime_t gtime;
626 cputime_t cgtime; 626 cputime_t cgtime;
627#ifndef CONFIG_VIRT_CPU_ACCOUNTING
628 cputime_t prev_utime, prev_stime;
629#endif
627 unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; 630 unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
628 unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; 631 unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
629 unsigned long inblock, oublock, cinblock, coublock; 632 unsigned long inblock, oublock, cinblock, coublock;
@@ -1723,6 +1726,7 @@ static inline void put_task_struct(struct task_struct *t)
1723} 1726}
1724 1727
1725extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st); 1728extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
1729extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
1726 1730
1727/* 1731/*
1728 * Per process flags 1732 * Per process flags
diff --git a/kernel/exit.c b/kernel/exit.c
index 2eaf68b634e3..b221ad65fd20 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -91,8 +91,6 @@ static void __exit_signal(struct task_struct *tsk)
91 if (atomic_dec_and_test(&sig->count)) 91 if (atomic_dec_and_test(&sig->count))
92 posix_cpu_timers_exit_group(tsk); 92 posix_cpu_timers_exit_group(tsk);
93 else { 93 else {
94 cputime_t utime, stime;
95
96 /* 94 /*
97 * If there is any task waiting for the group exit 95 * If there is any task waiting for the group exit
98 * then notify it: 96 * then notify it:
@@ -112,9 +110,8 @@ static void __exit_signal(struct task_struct *tsk)
112 * We won't ever get here for the group leader, since it 110 * We won't ever get here for the group leader, since it
113 * will have been the last reference on the signal_struct. 111 * will have been the last reference on the signal_struct.
114 */ 112 */
115 task_times(tsk, &utime, &stime); 113 sig->utime = cputime_add(sig->utime, tsk->utime);
116 sig->utime = cputime_add(sig->utime, utime); 114 sig->stime = cputime_add(sig->stime, tsk->stime);
117 sig->stime = cputime_add(sig->stime, stime);
118 sig->gtime = cputime_add(sig->gtime, tsk->gtime); 115 sig->gtime = cputime_add(sig->gtime, tsk->gtime);
119 sig->min_flt += tsk->min_flt; 116 sig->min_flt += tsk->min_flt;
120 sig->maj_flt += tsk->maj_flt; 117 sig->maj_flt += tsk->maj_flt;
@@ -1208,6 +1205,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1208 struct signal_struct *psig; 1205 struct signal_struct *psig;
1209 struct signal_struct *sig; 1206 struct signal_struct *sig;
1210 unsigned long maxrss; 1207 unsigned long maxrss;
1208 cputime_t tgutime, tgstime;
1211 1209
1212 /* 1210 /*
1213 * The resource counters for the group leader are in its 1211 * The resource counters for the group leader are in its
@@ -1223,20 +1221,23 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1223 * need to protect the access to parent->signal fields, 1221 * need to protect the access to parent->signal fields,
1224 * as other threads in the parent group can be right 1222 * as other threads in the parent group can be right
1225 * here reaping other children at the same time. 1223 * here reaping other children at the same time.
1224 *
1225 * We use thread_group_times() to get times for the thread
1226 * group, which consolidates times for all threads in the
1227 * group including the group leader.
1226 */ 1228 */
1229 thread_group_times(p, &tgutime, &tgstime);
1227 spin_lock_irq(&p->real_parent->sighand->siglock); 1230 spin_lock_irq(&p->real_parent->sighand->siglock);
1228 psig = p->real_parent->signal; 1231 psig = p->real_parent->signal;
1229 sig = p->signal; 1232 sig = p->signal;
1230 psig->cutime = 1233 psig->cutime =
1231 cputime_add(psig->cutime, 1234 cputime_add(psig->cutime,
1232 cputime_add(p->utime, 1235 cputime_add(tgutime,
1233 cputime_add(sig->utime, 1236 sig->cutime));
1234 sig->cutime)));
1235 psig->cstime = 1237 psig->cstime =
1236 cputime_add(psig->cstime, 1238 cputime_add(psig->cstime,
1237 cputime_add(p->stime, 1239 cputime_add(tgstime,
1238 cputime_add(sig->stime, 1240 sig->cstime));
1239 sig->cstime)));
1240 psig->cgtime = 1241 psig->cgtime =
1241 cputime_add(psig->cgtime, 1242 cputime_add(psig->cgtime,
1242 cputime_add(p->gtime, 1243 cputime_add(p->gtime,
diff --git a/kernel/fork.c b/kernel/fork.c
index ad7cb6d1193c..3d6f121bbe8a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -884,6 +884,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
884 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; 884 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
885 sig->gtime = cputime_zero; 885 sig->gtime = cputime_zero;
886 sig->cgtime = cputime_zero; 886 sig->cgtime = cputime_zero;
887#ifndef CONFIG_VIRT_CPU_ACCOUNTING
888 sig->prev_utime = sig->prev_stime = cputime_zero;
889#endif
887 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 890 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
888 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 891 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
889 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 892 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index 17e2c1db2bde..e6ba726941ae 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5187,6 +5187,16 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5187 *ut = p->utime; 5187 *ut = p->utime;
5188 *st = p->stime; 5188 *st = p->stime;
5189} 5189}
5190
5191void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5192{
5193 struct task_cputime cputime;
5194
5195 thread_group_cputime(p, &cputime);
5196
5197 *ut = cputime.utime;
5198 *st = cputime.stime;
5199}
5190#else 5200#else
5191 5201
5192#ifndef nsecs_to_cputime 5202#ifndef nsecs_to_cputime
@@ -5220,6 +5230,37 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5220 *ut = p->prev_utime; 5230 *ut = p->prev_utime;
5221 *st = p->prev_stime; 5231 *st = p->prev_stime;
5222} 5232}
5233
5234/*
5235 * Must be called with siglock held.
5236 */
5237void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5238{
5239 struct signal_struct *sig = p->signal;
5240 struct task_cputime cputime;
5241 cputime_t rtime, utime, total;
5242
5243 thread_group_cputime(p, &cputime);
5244
5245 total = cputime_add(cputime.utime, cputime.stime);
5246 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
5247
5248 if (total) {
5249 u64 temp;
5250
5251 temp = (u64)(rtime * cputime.utime);
5252 do_div(temp, total);
5253 utime = (cputime_t)temp;
5254 } else
5255 utime = rtime;
5256
5257 sig->prev_utime = max(sig->prev_utime, utime);
5258 sig->prev_stime = max(sig->prev_stime,
5259 cputime_sub(rtime, sig->prev_utime));
5260
5261 *ut = sig->prev_utime;
5262 *st = sig->prev_stime;
5263}
5223#endif 5264#endif
5224 5265
5225/* 5266/*
diff --git a/kernel/sys.c b/kernel/sys.c
index bbdfce0d4347..9968c5fb55b9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -911,16 +911,15 @@ change_okay:
911 911
912void do_sys_times(struct tms *tms) 912void do_sys_times(struct tms *tms)
913{ 913{
914 struct task_cputime cputime; 914 cputime_t tgutime, tgstime, cutime, cstime;
915 cputime_t cutime, cstime;
916 915
917 thread_group_cputime(current, &cputime);
918 spin_lock_irq(&current->sighand->siglock); 916 spin_lock_irq(&current->sighand->siglock);
917 thread_group_times(current, &tgutime, &tgstime);
919 cutime = current->signal->cutime; 918 cutime = current->signal->cutime;
920 cstime = current->signal->cstime; 919 cstime = current->signal->cstime;
921 spin_unlock_irq(&current->sighand->siglock); 920 spin_unlock_irq(&current->sighand->siglock);
922 tms->tms_utime = cputime_to_clock_t(cputime.utime); 921 tms->tms_utime = cputime_to_clock_t(tgutime);
923 tms->tms_stime = cputime_to_clock_t(cputime.stime); 922 tms->tms_stime = cputime_to_clock_t(tgstime);
924 tms->tms_cutime = cputime_to_clock_t(cutime); 923 tms->tms_cutime = cputime_to_clock_t(cutime);
925 tms->tms_cstime = cputime_to_clock_t(cstime); 924 tms->tms_cstime = cputime_to_clock_t(cstime);
926} 925}
@@ -1338,8 +1337,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1338{ 1337{
1339 struct task_struct *t; 1338 struct task_struct *t;
1340 unsigned long flags; 1339 unsigned long flags;
1341 cputime_t utime, stime; 1340 cputime_t tgutime, tgstime, utime, stime;
1342 struct task_cputime cputime;
1343 unsigned long maxrss = 0; 1341 unsigned long maxrss = 0;
1344 1342
1345 memset((char *) r, 0, sizeof *r); 1343 memset((char *) r, 0, sizeof *r);
@@ -1372,9 +1370,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1372 break; 1370 break;
1373 1371
1374 case RUSAGE_SELF: 1372 case RUSAGE_SELF:
1375 thread_group_cputime(p, &cputime); 1373 thread_group_times(p, &tgutime, &tgstime);
1376 utime = cputime_add(utime, cputime.utime); 1374 utime = cputime_add(utime, tgutime);
1377 stime = cputime_add(stime, cputime.stime); 1375 stime = cputime_add(stime, tgstime);
1378 r->ru_nvcsw += p->signal->nvcsw; 1376 r->ru_nvcsw += p->signal->nvcsw;
1379 r->ru_nivcsw += p->signal->nivcsw; 1377 r->ru_nivcsw += p->signal->nivcsw;
1380 r->ru_minflt += p->signal->min_flt; 1378 r->ru_minflt += p->signal->min_flt;