diff options
author | Rik van Riel <riel@redhat.com> | 2014-08-16 13:40:10 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2014-09-08 02:17:01 -0400 |
commit | e78c3496790ee8a36522a838b59b388e8a709e65 (patch) | |
tree | 0473b9ea676754d50b19eb1a862ac16fdffacbeb /kernel | |
parent | 90ed9cbe765ad358b3151a12b8bf889a3cbcd573 (diff) |
time, signal: Protect resource use statistics with seqlock
Both times() and clock_gettime(CLOCK_PROCESS_CPUTIME_ID) have scalability
issues on large systems, due to both functions being serialized with a
lock.
The lock protects against reporting a wrong value when a thread in the
task group exits and its statistics are rolled up into the signal struct:
without the lock, that exited task's statistics could be counted twice
(or not at all).
Protecting that with a lock results in times() and clock_gettime() being
completely serialized on large systems.
This can be fixed by using a seqlock around the events that gather and
propagate statistics. As an additional benefit, the protection code can
be moved into thread_group_cputime(), slightly simplifying the calling
functions.
In the case of posix_cpu_clock_get_task() things can be simplified a
lot, because the calling function already ensures that the task sticks
around, and the rest is now taken care of in thread_group_cputime().
This way the statistics reporting code can run lockless.
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alex Thorlton <athorlton@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Daeseok Youn <daeseok.youn@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guillaume Morin <guillaume@morinfr.org>
Cc: Ionut Alexa <ionut.m.alexa@gmail.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Michal Schmidt <mschmidt@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: umgwanakikbuti@gmail.com
Cc: fweisbec@gmail.com
Cc: srao@redhat.com
Cc: lwoodman@redhat.com
Cc: atheurer@redhat.com
Link: http://lkml.kernel.org/r/20140816134010.26a9b572@annuminas.surriel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/exit.c | 4 | ||||
-rw-r--r-- | kernel/fork.c | 1 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 33 | ||||
-rw-r--r-- | kernel/sys.c | 2 | ||||
-rw-r--r-- | kernel/time/posix-cpu-timers.c | 14 |
5 files changed, 25 insertions(+), 29 deletions(-)
diff --git a/kernel/exit.c b/kernel/exit.c index b93d46dab6fc..fa09b86609db 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -127,6 +127,7 @@ static void __exit_signal(struct task_struct *tsk) | |||
127 | * the signal_struct. | 127 | * the signal_struct. |
128 | */ | 128 | */ |
129 | task_cputime(tsk, &utime, &stime); | 129 | task_cputime(tsk, &utime, &stime); |
130 | write_seqlock(&sig->stats_lock); | ||
130 | sig->utime += utime; | 131 | sig->utime += utime; |
131 | sig->stime += stime; | 132 | sig->stime += stime; |
132 | sig->gtime += task_gtime(tsk); | 133 | sig->gtime += task_gtime(tsk); |
@@ -140,6 +141,7 @@ static void __exit_signal(struct task_struct *tsk) | |||
140 | sig->sum_sched_runtime += tsk->se.sum_exec_runtime; | 141 | sig->sum_sched_runtime += tsk->se.sum_exec_runtime; |
141 | sig->nr_threads--; | 142 | sig->nr_threads--; |
142 | __unhash_process(tsk, group_dead); | 143 | __unhash_process(tsk, group_dead); |
144 | write_sequnlock(&sig->stats_lock); | ||
143 | 145 | ||
144 | /* | 146 | /* |
145 | * Do this under ->siglock, we can race with another thread | 147 | * Do this under ->siglock, we can race with another thread |
@@ -1042,6 +1044,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1042 | spin_lock_irq(&p->real_parent->sighand->siglock); | 1044 | spin_lock_irq(&p->real_parent->sighand->siglock); |
1043 | psig = p->real_parent->signal; | 1045 | psig = p->real_parent->signal; |
1044 | sig = p->signal; | 1046 | sig = p->signal; |
1047 | write_seqlock(&psig->stats_lock); | ||
1045 | psig->cutime += tgutime + sig->cutime; | 1048 | psig->cutime += tgutime + sig->cutime; |
1046 | psig->cstime += tgstime + sig->cstime; | 1049 | psig->cstime += tgstime + sig->cstime; |
1047 | psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; | 1050 | psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; |
@@ -1064,6 +1067,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1064 | psig->cmaxrss = maxrss; | 1067 | psig->cmaxrss = maxrss; |
1065 | task_io_accounting_add(&psig->ioac, &p->ioac); | 1068 | task_io_accounting_add(&psig->ioac, &p->ioac); |
1066 | task_io_accounting_add(&psig->ioac, &sig->ioac); | 1069 | task_io_accounting_add(&psig->ioac, &sig->ioac); |
1070 | write_sequnlock(&psig->stats_lock); | ||
1067 | spin_unlock_irq(&p->real_parent->sighand->siglock); | 1071 | spin_unlock_irq(&p->real_parent->sighand->siglock); |
1068 | } | 1072 | } |
1069 | 1073 | ||
diff --git a/kernel/fork.c b/kernel/fork.c index 0cf9cdb6e491..9387ae8ab048 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -1068,6 +1068,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
1068 | sig->curr_target = tsk; | 1068 | sig->curr_target = tsk; |
1069 | init_sigpending(&sig->shared_pending); | 1069 | init_sigpending(&sig->shared_pending); |
1070 | INIT_LIST_HEAD(&sig->posix_timers); | 1070 | INIT_LIST_HEAD(&sig->posix_timers); |
1071 | seqlock_init(&sig->stats_lock); | ||
1071 | 1072 | ||
1072 | hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 1073 | hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
1073 | sig->real_timer.function = it_real_fn; | 1074 | sig->real_timer.function = it_real_fn; |
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 3e52836359ba..49b7cfe98f7a 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
@@ -288,18 +288,28 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
288 | struct signal_struct *sig = tsk->signal; | 288 | struct signal_struct *sig = tsk->signal; |
289 | cputime_t utime, stime; | 289 | cputime_t utime, stime; |
290 | struct task_struct *t; | 290 | struct task_struct *t; |
291 | 291 | unsigned int seq, nextseq; | |
292 | times->utime = sig->utime; | ||
293 | times->stime = sig->stime; | ||
294 | times->sum_exec_runtime = sig->sum_sched_runtime; | ||
295 | 292 | ||
296 | rcu_read_lock(); | 293 | rcu_read_lock(); |
297 | for_each_thread(tsk, t) { | 294 | /* Attempt a lockless read on the first round. */ |
298 | task_cputime(t, &utime, &stime); | 295 | nextseq = 0; |
299 | times->utime += utime; | 296 | do { |
300 | times->stime += stime; | 297 | seq = nextseq; |
301 | times->sum_exec_runtime += task_sched_runtime(t); | 298 | read_seqbegin_or_lock(&sig->stats_lock, &seq); |
302 | } | 299 | times->utime = sig->utime; |
300 | times->stime = sig->stime; | ||
301 | times->sum_exec_runtime = sig->sum_sched_runtime; | ||
302 | |||
303 | for_each_thread(tsk, t) { | ||
304 | task_cputime(t, &utime, &stime); | ||
305 | times->utime += utime; | ||
306 | times->stime += stime; | ||
307 | times->sum_exec_runtime += task_sched_runtime(t); | ||
308 | } | ||
309 | /* If lockless access failed, take the lock. */ | ||
310 | nextseq = 1; | ||
311 | } while (need_seqretry(&sig->stats_lock, seq)); | ||
312 | done_seqretry(&sig->stats_lock, seq); | ||
303 | rcu_read_unlock(); | 313 | rcu_read_unlock(); |
304 | } | 314 | } |
305 | 315 | ||
@@ -611,9 +621,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
611 | cputime_adjust(&cputime, &p->prev_cputime, ut, st); | 621 | cputime_adjust(&cputime, &p->prev_cputime, ut, st); |
612 | } | 622 | } |
613 | 623 | ||
614 | /* | ||
615 | * Must be called with siglock held. | ||
616 | */ | ||
617 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | 624 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) |
618 | { | 625 | { |
619 | struct task_cputime cputime; | 626 | struct task_cputime cputime; |
diff --git a/kernel/sys.c b/kernel/sys.c index ce8129192a26..b6636643cbd1 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -862,11 +862,9 @@ void do_sys_times(struct tms *tms) | |||
862 | { | 862 | { |
863 | cputime_t tgutime, tgstime, cutime, cstime; | 863 | cputime_t tgutime, tgstime, cutime, cstime; |
864 | 864 | ||
865 | spin_lock_irq(¤t->sighand->siglock); | ||
866 | thread_group_cputime_adjusted(current, &tgutime, &tgstime); | 865 | thread_group_cputime_adjusted(current, &tgutime, &tgstime); |
867 | cutime = current->signal->cutime; | 866 | cutime = current->signal->cutime; |
868 | cstime = current->signal->cstime; | 867 | cstime = current->signal->cstime; |
869 | spin_unlock_irq(¤t->sighand->siglock); | ||
870 | tms->tms_utime = cputime_to_clock_t(tgutime); | 868 | tms->tms_utime = cputime_to_clock_t(tgutime); |
871 | tms->tms_stime = cputime_to_clock_t(tgstime); | 869 | tms->tms_stime = cputime_to_clock_t(tgstime); |
872 | tms->tms_cutime = cputime_to_clock_t(cutime); | 870 | tms->tms_cutime = cputime_to_clock_t(cutime); |
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 3b8946416a5f..492b986195d5 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c | |||
@@ -272,22 +272,8 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk, | |||
272 | if (same_thread_group(tsk, current)) | 272 | if (same_thread_group(tsk, current)) |
273 | err = cpu_clock_sample(which_clock, tsk, &rtn); | 273 | err = cpu_clock_sample(which_clock, tsk, &rtn); |
274 | } else { | 274 | } else { |
275 | unsigned long flags; | ||
276 | struct sighand_struct *sighand; | ||
277 | |||
278 | /* | ||
279 | * while_each_thread() is not yet entirely RCU safe, | ||
280 | * keep locking the group while sampling process | ||
281 | * clock for now. | ||
282 | */ | ||
283 | sighand = lock_task_sighand(tsk, &flags); | ||
284 | if (!sighand) | ||
285 | return err; | ||
286 | |||
287 | if (tsk == current || thread_group_leader(tsk)) | 275 | if (tsk == current || thread_group_leader(tsk)) |
288 | err = cpu_clock_sample_group(which_clock, tsk, &rtn); | 276 | err = cpu_clock_sample_group(which_clock, tsk, &rtn); |
289 | |||
290 | unlock_task_sighand(tsk, &flags); | ||
291 | } | 277 | } |
292 | 278 | ||
293 | if (!err) | 279 | if (!err) |