posix-cpu-timers: Cure SMP accounting oddities

David reported: Attached below is a watered-down version of rt/tst-cpuclock2.c from GLIBC. Just build it with "gcc -o test test.c -lpthread -lrt" or similar. Run it several times, and you will see cases where the main thread will measure a process clock difference before and after the nanosleep which is smaller than the cpu-burner thread's individual thread clock difference. This doesn't make any sense since the cpu-burner thread is part of the top-level process's thread group. I've reproduced this on both x86-64 and sparc64 (using both 32-bit and 64-bit binaries). For example: [davem@boricha build-x86_64-linux]$ ./test process: before(0.001221967) after(0.498624371) diff(497402404) thread: before(0.000081692) after(0.498316431) diff(498234739) self: before(0.001223521) after(0.001240219) diff(16698) [davem@boricha build-x86_64-linux]$ The diff of 'process' should always be >= the diff of 'thread'. I make sure to wrap the 'thread' clock measurements the most tightly around the nanosleep() call, and that the 'process' clock measurements are the outer-most ones. --- #include <unistd.h> #include <stdio.h> #include <stdlib.h> #include <time.h> #include <fcntl.h> #include <string.h> #include <errno.h> #include <pthread.h> static pthread_barrier_t barrier; static void *chew_cpu(void *arg) { pthread_barrier_wait(&barrier); while (1) __asm__ __volatile__("" : : : "memory"); return NULL; } int main(void) { clockid_t process_clock, my_thread_clock, th_clock; struct timespec process_before, process_after; struct timespec me_before, me_after; struct timespec th_before, th_after; struct timespec sleeptime; unsigned long diff; pthread_t th; int err; err = clock_getcpuclockid(0, &process_clock); if (err) return 1; err = pthread_getcpuclockid(pthread_self(), &my_thread_clock); if (err) return 1; pthread_barrier_init(&barrier, NULL, 2); err = pthread_create(&th, NULL, chew_cpu, NULL); if (err) return 1; err = pthread_getcpuclockid(th, &th_clock); if (err) return 1; pthread_barrier_wait(&barrier); err = clock_gettime(process_clock, &process_before); if (err) return 1; err = clock_gettime(my_thread_clock, &me_before); if (err) return 1; err = clock_gettime(th_clock, &th_before); if (err) return 1; sleeptime.tv_sec = 0; sleeptime.tv_nsec = 500000000; nanosleep(&sleeptime, NULL); err = clock_gettime(th_clock, &th_after); if (err) return 1; err = clock_gettime(my_thread_clock, &me_after); if (err) return 1; err = clock_gettime(process_clock, &process_after); if (err) return 1; diff = process_after.tv_nsec - process_before.tv_nsec; printf("process: before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n", process_before.tv_sec, process_before.tv_nsec, process_after.tv_sec, process_after.tv_nsec, diff); diff = th_after.tv_nsec - th_before.tv_nsec; printf("thread: before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n", th_before.tv_sec, th_before.tv_nsec, th_after.tv_sec, th_after.tv_nsec, diff); diff = me_after.tv_nsec - me_before.tv_nsec; printf("self: before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n", me_before.tv_sec, me_before.tv_nsec, me_after.tv_sec, me_after.tv_nsec, diff); return 0; } This is due to us using p->se.sum_exec_runtime in thread_group_cputime() where we iterate the thread group and sum all data. This does not take time since the last schedule operation (tick or otherwise) into account. We can cure this by using task_sched_runtime() at the cost of having to take locks. This also means we can (and must) do away with thread_group_sched_runtime() since the modified thread_group_cputime() is now more accurate and would deadlock when called from thread_group_sched_runtime(). Reported-by: David Miller <davem@davemloft.net> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/1314874459.7945.22.camel@twins Cc: stable@kernel.org Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
author: Peter Zijlstra <a.p.zijlstra@chello.nl> 2011-09-01 06:42:04 -0400
committer: Thomas Gleixner <tglx@linutronix.de> 2011-09-08 09:25:52 -0400
commit: e8abccb719377af63cb0f1fed289db405e3def16 (patch)
tree: 4f5292b8c170019e35016d5b5528298c877567ba /kernel
parent: 4f37a68cdaf6dea833cfdded2a3e0c47c0f006da (diff)
2 files changed, 3 insertions, 26 deletions
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 58f405b581e7..c8008dd58ef2 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -250,7 +250,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
        do {
                times->utime = cputime_add(times->utime, t->utime);
                times->stime = cputime_add(times->stime, t->stime);
-                times->sum_exec_runtime += t->se.sum_exec_runtime;
+                times->sum_exec_runtime += task_sched_runtime(t);
        } while_each_thread(tsk, t);
 out:
        rcu_read_unlock();
@@ -312,7 +312,8 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
                cpu->cpu = cputime.utime;
                break;
        case CPUCLOCK_SCHED:
-                cpu->sched = thread_group_sched_runtime(p);
+                thread_group_cputime(p, &cputime);
+                cpu->sched = cputime.sum_exec_runtime;
                break;
        }
        return 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index ccacdbdecf45..e1290ecee3c2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3725,30 +3725,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 }
 /*
- * Return sum_exec_runtime for the thread group.
- * In case the task is currently running, return the sum plus current's
- * pending runtime that have not been accounted yet.
- *
- * Note that the thread group might have other running tasks as well,
- * so the return value not includes other pending runtime that other
- * running tasks might have.
- */
-unsigned long long thread_group_sched_runtime(struct task_struct *p)
-{
-        struct task_cputime totals;
-        unsigned long flags;
-        struct rq *rq;
-        u64 ns;
-        rq = task_rq_lock(p, &flags);
-        thread_group_cputime(p, &totals);
-        ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
-        task_rq_unlock(rq, p, &flags);
-        return ns;
-}
-/*
 * Account user cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in user space since the last update
author	Peter Zijlstra <a.p.zijlstra@chello.nl>	2011-09-01 06:42:04 -0400
committer	Thomas Gleixner <tglx@linutronix.de>	2011-09-08 09:25:52 -0400
commit	e8abccb719377af63cb0f1fed289db405e3def16 (patch)
tree	4f5292b8c170019e35016d5b5528298c877567ba /kernel
parent	4f37a68cdaf6dea833cfdded2a3e0c47c0f006da (diff)

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 58f405b581e7..c8008dd58ef2 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c
@@ -250,7 +250,7 @@ void thread_group_cputime(struct task_struct tsk, struct task_cputime times)
250	do {	250	do {
251	times->utime = cputime_add(times->utime, t->utime);	251	times->utime = cputime_add(times->utime, t->utime);
252	times->stime = cputime_add(times->stime, t->stime);	252	times->stime = cputime_add(times->stime, t->stime);
253	times->sum_exec_runtime += t->se.sum_exec_runtime;	253	times->sum_exec_runtime += task_sched_runtime(t);
254	} while_each_thread(tsk, t);	254	} while_each_thread(tsk, t);
255	out:	255	out:
256	rcu_read_unlock();	256	rcu_read_unlock();
@@ -312,7 +312,8 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
312	cpu->cpu = cputime.utime;	312	cpu->cpu = cputime.utime;
313	break;	313	break;
314	case CPUCLOCK_SCHED:	314	case CPUCLOCK_SCHED:
315	cpu->sched = thread_group_sched_runtime(p);	315	thread_group_cputime(p, &cputime);
		316	cpu->sched = cputime.sum_exec_runtime;
316	break;	317	break;
317	}	318	}
318	return 0;	319	return 0;


diff --git a/kernel/sched.c b/kernel/sched.c index ccacdbdecf45..e1290ecee3c2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c
@@ -3725,30 +3725,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3725	}	3725	}
3726		3726
3727	/*	3727	/*
3728	* Return sum_exec_runtime for the thread group.
3729	* In case the task is currently running, return the sum plus current's
3730	* pending runtime that have not been accounted yet.
3731	*
3732	* Note that the thread group might have other running tasks as well,
3733	* so the return value not includes other pending runtime that other
3734	* running tasks might have.
3735	*/
3736	unsigned long long thread_group_sched_runtime(struct task_struct *p)
3737	{
3738	struct task_cputime totals;
3739	unsigned long flags;
3740	struct rq *rq;
3741	u64 ns;
3742
3743	rq = task_rq_lock(p, &flags);
3744	thread_group_cputime(p, &totals);
3745	ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3746	task_rq_unlock(rq, p, &flags);
3747
3748	return ns;
3749	}
3750
3751	/*
3752	* Account user cpu time to a process.	3728	* Account user cpu time to a process.
3753	* @p: the process that the cpu time gets accounted to	3729	* @p: the process that the cpu time gets accounted to
3754	* @cputime: the cpu time spent in user space since the last update	3730	* @cputime: the cpu time spent in user space since the last update