diff options
author | Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> | 2009-12-02 03:28:07 -0500 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2009-12-02 11:32:40 -0500 |
commit | 0cf55e1ec08bb5a22e068309e2d8ba1180ab4239 (patch) | |
tree | 6102662a9594d51155bee11666fe8517fcbe6039 | |
parent | d99ca3b977fc5a93141304f571475c2af9e6c1c5 (diff) |
sched, cputime: Introduce thread_group_times()
This is a real fix for the problem of decreasing utime/stime values
described in the thread:
http://lkml.org/lkml/2009/11/3/522
Now cputime is accounted in the following way:
- {u,s}time in task_struct are increased every time the thread
is interrupted by a tick (timer interrupt).
- When a thread exits, its {u,s}time are added to signal->{u,s}time,
after being adjusted by task_times().
- When all threads in a thread_group exit, the accumulated {u,s}time
(and also c{u,s}time) in signal struct are added to c{u,s}time
in signal struct of the group's parent.
So {u,s}time in task struct are "raw" tick counts, while
{u,s}time and c{u,s}time in signal struct are "adjusted" values.
And the accounted values are used by:
- task_times(), to get cputime of a thread:
This function returns adjusted values that originate from raw
{u,s}time, scaled by sum_exec_runtime as accounted by CFS.
- thread_group_cputime(), to get cputime of a thread group:
This function returns the sum of all {u,s}time of living threads in
the group, plus {u,s}time in the signal struct, which is the sum of
adjusted cputimes of all exited threads that belonged to the group.
The problem is the return value of thread_group_cputime(),
because it is a mixed sum of "raw" values and "adjusted" values:
group's {u,s}time = foreach(thread){{u,s}time} + exited({u,s}time)
This misbehavior can break {u,s}time monotonicity.
Assume there is a thread whose raw values are greater than its
adjusted values (e.g. interrupted by 1000Hz ticks 50 times
but only ran for 45ms); when it exits, cputime will decrease (e.g.
-5ms).
To fix this, we could do:
group's {u,s}time = foreach(t){task_times(t)} + exited({u,s}time)
But task_times() contains hard divisions, so applying it for
every thread should be avoided.
This patch fixes the above problem in the following way:
- Modify thread's exit (= __exit_signal()) not to use task_times().
This means {u,s}time in signal struct accumulates raw values instead
of adjusted values. As a result, thread_group_cputime() returns
a pure sum of "raw" values.
- Introduce a new function thread_group_times(*task, *utime, *stime)
that converts "raw" values of thread_group_cputime() to "adjusted"
values, using the same calculation procedure as task_times().
- Modify group's exit (= wait_task_zombie()) to use the newly introduced
thread_group_times(). This makes c{u,s}time in signal struct
hold adjusted values, as before this patch.
- Replace some thread_group_cputime() calls with thread_group_times().
These replacements are applied only where the "adjusted" cputime is
conveyed to users, and where task_times() is already used nearby
(i.e. sys_times(), getrusage(), and /proc/<PID>/stat).
This patch has a positive side effect:
- Before this patch, if a group contained many short-lived threads
(e.g. each runs 0.9ms and is not interrupted by ticks), the group's
cputime could be invisible since each thread's cputime was accumulated
after being adjusted: imagine the adjustment function as adj(ticks, runtime),
{adj(0, 0.9) + adj(0, 0.9) + ....} = {0 + 0 + ....} = 0.
After this patch this will not happen because the adjustment is
applied after accumulation.
v2:
- remove if()s, put new variables into signal_struct.
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Spencer Candland <spencer@bluehost.com>
Cc: Americo Wang <xiyou.wangcong@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
LKML-Reference: <4B162517.8040909@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r-- | fs/proc/array.c | 5 | ||||
-rw-r--r-- | include/linux/sched.h | 4 | ||||
-rw-r--r-- | kernel/exit.c | 23 | ||||
-rw-r--r-- | kernel/fork.c | 3 | ||||
-rw-r--r-- | kernel/sched.c | 41 | ||||
-rw-r--r-- | kernel/sys.c | 18 |
6 files changed, 69 insertions, 25 deletions
diff --git a/fs/proc/array.c b/fs/proc/array.c index ca61a88aed66..2571da43c736 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c | |||
@@ -506,7 +506,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, | |||
506 | 506 | ||
507 | /* add up live thread stats at the group level */ | 507 | /* add up live thread stats at the group level */ |
508 | if (whole) { | 508 | if (whole) { |
509 | struct task_cputime cputime; | ||
510 | struct task_struct *t = task; | 509 | struct task_struct *t = task; |
511 | do { | 510 | do { |
512 | min_flt += t->min_flt; | 511 | min_flt += t->min_flt; |
@@ -517,9 +516,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, | |||
517 | 516 | ||
518 | min_flt += sig->min_flt; | 517 | min_flt += sig->min_flt; |
519 | maj_flt += sig->maj_flt; | 518 | maj_flt += sig->maj_flt; |
520 | thread_group_cputime(task, &cputime); | 519 | thread_group_times(task, &utime, &stime); |
521 | utime = cputime.utime; | ||
522 | stime = cputime.stime; | ||
523 | gtime = cputime_add(gtime, sig->gtime); | 520 | gtime = cputime_add(gtime, sig->gtime); |
524 | } | 521 | } |
525 | 522 | ||
diff --git a/include/linux/sched.h b/include/linux/sched.h index dff85e58264e..34238bd10ebf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -624,6 +624,9 @@ struct signal_struct { | |||
624 | cputime_t utime, stime, cutime, cstime; | 624 | cputime_t utime, stime, cutime, cstime; |
625 | cputime_t gtime; | 625 | cputime_t gtime; |
626 | cputime_t cgtime; | 626 | cputime_t cgtime; |
627 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | ||
628 | cputime_t prev_utime, prev_stime; | ||
629 | #endif | ||
627 | unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; | 630 | unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; |
628 | unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; | 631 | unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; |
629 | unsigned long inblock, oublock, cinblock, coublock; | 632 | unsigned long inblock, oublock, cinblock, coublock; |
@@ -1723,6 +1726,7 @@ static inline void put_task_struct(struct task_struct *t) | |||
1723 | } | 1726 | } |
1724 | 1727 | ||
1725 | extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st); | 1728 | extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st); |
1729 | extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st); | ||
1726 | 1730 | ||
1727 | /* | 1731 | /* |
1728 | * Per process flags | 1732 | * Per process flags |
diff --git a/kernel/exit.c b/kernel/exit.c index 2eaf68b634e3..b221ad65fd20 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -91,8 +91,6 @@ static void __exit_signal(struct task_struct *tsk) | |||
91 | if (atomic_dec_and_test(&sig->count)) | 91 | if (atomic_dec_and_test(&sig->count)) |
92 | posix_cpu_timers_exit_group(tsk); | 92 | posix_cpu_timers_exit_group(tsk); |
93 | else { | 93 | else { |
94 | cputime_t utime, stime; | ||
95 | |||
96 | /* | 94 | /* |
97 | * If there is any task waiting for the group exit | 95 | * If there is any task waiting for the group exit |
98 | * then notify it: | 96 | * then notify it: |
@@ -112,9 +110,8 @@ static void __exit_signal(struct task_struct *tsk) | |||
112 | * We won't ever get here for the group leader, since it | 110 | * We won't ever get here for the group leader, since it |
113 | * will have been the last reference on the signal_struct. | 111 | * will have been the last reference on the signal_struct. |
114 | */ | 112 | */ |
115 | task_times(tsk, &utime, &stime); | 113 | sig->utime = cputime_add(sig->utime, tsk->utime); |
116 | sig->utime = cputime_add(sig->utime, utime); | 114 | sig->stime = cputime_add(sig->stime, tsk->stime); |
117 | sig->stime = cputime_add(sig->stime, stime); | ||
118 | sig->gtime = cputime_add(sig->gtime, tsk->gtime); | 115 | sig->gtime = cputime_add(sig->gtime, tsk->gtime); |
119 | sig->min_flt += tsk->min_flt; | 116 | sig->min_flt += tsk->min_flt; |
120 | sig->maj_flt += tsk->maj_flt; | 117 | sig->maj_flt += tsk->maj_flt; |
@@ -1208,6 +1205,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1208 | struct signal_struct *psig; | 1205 | struct signal_struct *psig; |
1209 | struct signal_struct *sig; | 1206 | struct signal_struct *sig; |
1210 | unsigned long maxrss; | 1207 | unsigned long maxrss; |
1208 | cputime_t tgutime, tgstime; | ||
1211 | 1209 | ||
1212 | /* | 1210 | /* |
1213 | * The resource counters for the group leader are in its | 1211 | * The resource counters for the group leader are in its |
@@ -1223,20 +1221,23 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1223 | * need to protect the access to parent->signal fields, | 1221 | * need to protect the access to parent->signal fields, |
1224 | * as other threads in the parent group can be right | 1222 | * as other threads in the parent group can be right |
1225 | * here reaping other children at the same time. | 1223 | * here reaping other children at the same time. |
1224 | * | ||
1225 | * We use thread_group_times() to get times for the thread | ||
1226 | * group, which consolidates times for all threads in the | ||
1227 | * group including the group leader. | ||
1226 | */ | 1228 | */ |
1229 | thread_group_times(p, &tgutime, &tgstime); | ||
1227 | spin_lock_irq(&p->real_parent->sighand->siglock); | 1230 | spin_lock_irq(&p->real_parent->sighand->siglock); |
1228 | psig = p->real_parent->signal; | 1231 | psig = p->real_parent->signal; |
1229 | sig = p->signal; | 1232 | sig = p->signal; |
1230 | psig->cutime = | 1233 | psig->cutime = |
1231 | cputime_add(psig->cutime, | 1234 | cputime_add(psig->cutime, |
1232 | cputime_add(p->utime, | 1235 | cputime_add(tgutime, |
1233 | cputime_add(sig->utime, | 1236 | sig->cutime)); |
1234 | sig->cutime))); | ||
1235 | psig->cstime = | 1237 | psig->cstime = |
1236 | cputime_add(psig->cstime, | 1238 | cputime_add(psig->cstime, |
1237 | cputime_add(p->stime, | 1239 | cputime_add(tgstime, |
1238 | cputime_add(sig->stime, | 1240 | sig->cstime)); |
1239 | sig->cstime))); | ||
1240 | psig->cgtime = | 1241 | psig->cgtime = |
1241 | cputime_add(psig->cgtime, | 1242 | cputime_add(psig->cgtime, |
1242 | cputime_add(p->gtime, | 1243 | cputime_add(p->gtime, |
diff --git a/kernel/fork.c b/kernel/fork.c index ad7cb6d1193c..3d6f121bbe8a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -884,6 +884,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
884 | sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; | 884 | sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; |
885 | sig->gtime = cputime_zero; | 885 | sig->gtime = cputime_zero; |
886 | sig->cgtime = cputime_zero; | 886 | sig->cgtime = cputime_zero; |
887 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | ||
888 | sig->prev_utime = sig->prev_stime = cputime_zero; | ||
889 | #endif | ||
887 | sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; | 890 | sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; |
888 | sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; | 891 | sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; |
889 | sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; | 892 | sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; |
diff --git a/kernel/sched.c b/kernel/sched.c index 17e2c1db2bde..e6ba726941ae 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -5187,6 +5187,16 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
5187 | *ut = p->utime; | 5187 | *ut = p->utime; |
5188 | *st = p->stime; | 5188 | *st = p->stime; |
5189 | } | 5189 | } |
5190 | |||
5191 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
5192 | { | ||
5193 | struct task_cputime cputime; | ||
5194 | |||
5195 | thread_group_cputime(p, &cputime); | ||
5196 | |||
5197 | *ut = cputime.utime; | ||
5198 | *st = cputime.stime; | ||
5199 | } | ||
5190 | #else | 5200 | #else |
5191 | 5201 | ||
5192 | #ifndef nsecs_to_cputime | 5202 | #ifndef nsecs_to_cputime |
@@ -5220,6 +5230,37 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
5220 | *ut = p->prev_utime; | 5230 | *ut = p->prev_utime; |
5221 | *st = p->prev_stime; | 5231 | *st = p->prev_stime; |
5222 | } | 5232 | } |
5233 | |||
5234 | /* | ||
5235 | * Must be called with siglock held. | ||
5236 | */ | ||
5237 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
5238 | { | ||
5239 | struct signal_struct *sig = p->signal; | ||
5240 | struct task_cputime cputime; | ||
5241 | cputime_t rtime, utime, total; | ||
5242 | |||
5243 | thread_group_cputime(p, &cputime); | ||
5244 | |||
5245 | total = cputime_add(cputime.utime, cputime.stime); | ||
5246 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); | ||
5247 | |||
5248 | if (total) { | ||
5249 | u64 temp; | ||
5250 | |||
5251 | temp = (u64)(rtime * cputime.utime); | ||
5252 | do_div(temp, total); | ||
5253 | utime = (cputime_t)temp; | ||
5254 | } else | ||
5255 | utime = rtime; | ||
5256 | |||
5257 | sig->prev_utime = max(sig->prev_utime, utime); | ||
5258 | sig->prev_stime = max(sig->prev_stime, | ||
5259 | cputime_sub(rtime, sig->prev_utime)); | ||
5260 | |||
5261 | *ut = sig->prev_utime; | ||
5262 | *st = sig->prev_stime; | ||
5263 | } | ||
5223 | #endif | 5264 | #endif |
5224 | 5265 | ||
5225 | /* | 5266 | /* |
diff --git a/kernel/sys.c b/kernel/sys.c index bbdfce0d4347..9968c5fb55b9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -911,16 +911,15 @@ change_okay: | |||
911 | 911 | ||
912 | void do_sys_times(struct tms *tms) | 912 | void do_sys_times(struct tms *tms) |
913 | { | 913 | { |
914 | struct task_cputime cputime; | 914 | cputime_t tgutime, tgstime, cutime, cstime; |
915 | cputime_t cutime, cstime; | ||
916 | 915 | ||
917 | thread_group_cputime(current, &cputime); | ||
918 | spin_lock_irq(&current->sighand->siglock); | 916 | spin_lock_irq(&current->sighand->siglock); |
917 | thread_group_times(current, &tgutime, &tgstime); | ||
919 | cutime = current->signal->cutime; | 918 | cutime = current->signal->cutime; |
920 | cstime = current->signal->cstime; | 919 | cstime = current->signal->cstime; |
921 | spin_unlock_irq(&current->sighand->siglock); | 920 | spin_unlock_irq(&current->sighand->siglock); |
922 | tms->tms_utime = cputime_to_clock_t(cputime.utime); | 921 | tms->tms_utime = cputime_to_clock_t(tgutime); |
923 | tms->tms_stime = cputime_to_clock_t(cputime.stime); | 922 | tms->tms_stime = cputime_to_clock_t(tgstime); |
924 | tms->tms_cutime = cputime_to_clock_t(cutime); | 923 | tms->tms_cutime = cputime_to_clock_t(cutime); |
925 | tms->tms_cstime = cputime_to_clock_t(cstime); | 924 | tms->tms_cstime = cputime_to_clock_t(cstime); |
926 | } | 925 | } |
@@ -1338,8 +1337,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1338 | { | 1337 | { |
1339 | struct task_struct *t; | 1338 | struct task_struct *t; |
1340 | unsigned long flags; | 1339 | unsigned long flags; |
1341 | cputime_t utime, stime; | 1340 | cputime_t tgutime, tgstime, utime, stime; |
1342 | struct task_cputime cputime; | ||
1343 | unsigned long maxrss = 0; | 1341 | unsigned long maxrss = 0; |
1344 | 1342 | ||
1345 | memset((char *) r, 0, sizeof *r); | 1343 | memset((char *) r, 0, sizeof *r); |
@@ -1372,9 +1370,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1372 | break; | 1370 | break; |
1373 | 1371 | ||
1374 | case RUSAGE_SELF: | 1372 | case RUSAGE_SELF: |
1375 | thread_group_cputime(p, &cputime); | 1373 | thread_group_times(p, &tgutime, &tgstime); |
1376 | utime = cputime_add(utime, cputime.utime); | 1374 | utime = cputime_add(utime, tgutime); |
1377 | stime = cputime_add(stime, cputime.stime); | 1375 | stime = cputime_add(stime, tgstime); |
1378 | r->ru_nvcsw += p->signal->nvcsw; | 1376 | r->ru_nvcsw += p->signal->nvcsw; |
1379 | r->ru_nivcsw += p->signal->nivcsw; | 1377 | r->ru_nivcsw += p->signal->nivcsw; |
1380 | r->ru_minflt += p->signal->min_flt; | 1378 | r->ru_minflt += p->signal->min_flt; |