about summary refs log tree commit diff stats
path: root/kernel/sched
diff options
context:
space:
mode:
author	Rik van Riel <riel@redhat.com>	2014-08-16 13:40:10 -0400
committer	Ingo Molnar <mingo@kernel.org>	2014-09-08 02:17:01 -0400
commit	e78c3496790ee8a36522a838b59b388e8a709e65 (patch)
tree	0473b9ea676754d50b19eb1a862ac16fdffacbeb /kernel/sched
parent	90ed9cbe765ad358b3151a12b8bf889a3cbcd573 (diff)
time, signal: Protect resource use statistics with seqlock
Both times() and clock_gettime(CLOCK_PROCESS_CPUTIME_ID) have scalability issues on large systems, due to both functions being serialized with a lock.

The lock protects against reporting a wrong value, due to a thread in the task group exiting, its statistics reporting up to the signal struct, and that exited task's statistics being counted twice (or not at all).

Protecting that with a lock results in times() and clock_gettime() being completely serialized on large systems.

This can be fixed by using a seqlock around the events that gather and propagate statistics. As an additional benefit, the protection code can be moved into thread_group_cputime(), slightly simplifying the calling functions.

In the case of posix_cpu_clock_get_task() things can be simplified a lot, because the calling function already ensures that the task sticks around, and the rest is now taken care of in thread_group_cputime().

This way the statistics reporting code can run lockless.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alex Thorlton <athorlton@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Daeseok Youn <daeseok.youn@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guillaume Morin <guillaume@morinfr.org>
Cc: Ionut Alexa <ionut.m.alexa@gmail.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Michal Schmidt <mschmidt@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: umgwanakikbuti@gmail.com
Cc: fweisbec@gmail.com
Cc: srao@redhat.com
Cc: lwoodman@redhat.com
Cc: atheurer@redhat.com
Link: http://lkml.kernel.org/r/20140816134010.26a9b572@annuminas.surriel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel/sched')
-rw-r--r--	kernel/sched/cputime.c	33
1 file changed, 20 insertions(+), 13 deletions(-)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 3e52836359ba..49b7cfe98f7a 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -288,18 +288,28 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
288 struct signal_struct *sig = tsk->signal; 288 struct signal_struct *sig = tsk->signal;
289 cputime_t utime, stime; 289 cputime_t utime, stime;
290 struct task_struct *t; 290 struct task_struct *t;
291 291 unsigned int seq, nextseq;
292 times->utime = sig->utime;
293 times->stime = sig->stime;
294 times->sum_exec_runtime = sig->sum_sched_runtime;
295 292
296 rcu_read_lock(); 293 rcu_read_lock();
297 for_each_thread(tsk, t) { 294 /* Attempt a lockless read on the first round. */
298 task_cputime(t, &utime, &stime); 295 nextseq = 0;
299 times->utime += utime; 296 do {
300 times->stime += stime; 297 seq = nextseq;
301 times->sum_exec_runtime += task_sched_runtime(t); 298 read_seqbegin_or_lock(&sig->stats_lock, &seq);
302 } 299 times->utime = sig->utime;
300 times->stime = sig->stime;
301 times->sum_exec_runtime = sig->sum_sched_runtime;
302
303 for_each_thread(tsk, t) {
304 task_cputime(t, &utime, &stime);
305 times->utime += utime;
306 times->stime += stime;
307 times->sum_exec_runtime += task_sched_runtime(t);
308 }
309 /* If lockless access failed, take the lock. */
310 nextseq = 1;
311 } while (need_seqretry(&sig->stats_lock, seq));
312 done_seqretry(&sig->stats_lock, seq);
303 rcu_read_unlock(); 313 rcu_read_unlock();
304} 314}
305 315
@@ -611,9 +621,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
611 cputime_adjust(&cputime, &p->prev_cputime, ut, st); 621 cputime_adjust(&cputime, &p->prev_cputime, ut, st);
612} 622}
613 623
614/*
615 * Must be called with siglock held.
616 */
617void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) 624void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
618{ 625{
619 struct task_cputime cputime; 626 struct task_cputime cputime;