aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPeter Zijlstra <a.p.zijlstra@chello.nl>2011-09-01 06:42:04 -0400
committerGreg Kroah-Hartman <gregkh@suse.de>2011-10-16 17:14:51 -0400
commit249cf808ba1a0d403fe7c476a74b66e2bc0a8e53 (patch)
treee090348cc2bc2a3a1a226f35e6c6f99f76c1bebb
parent3217df8e225c8579293fd2e193ba5cee709b6eba (diff)
posix-cpu-timers: Cure SMP wobbles
commit d670ec13178d0fd8680e6742a2bc6e04f28f87d8 upstream. David reported: Attached below is a watered-down version of rt/tst-cpuclock2.c from GLIBC. Just build it with "gcc -o test test.c -lpthread -lrt" or similar. Run it several times, and you will see cases where the main thread will measure a process clock difference before and after the nanosleep which is smaller than the cpu-burner thread's individual thread clock difference. This doesn't make any sense since the cpu-burner thread is part of the top-level process's thread group. I've reproduced this on both x86-64 and sparc64 (using both 32-bit and 64-bit binaries). For example: [davem@boricha build-x86_64-linux]$ ./test process: before(0.001221967) after(0.498624371) diff(497402404) thread: before(0.000081692) after(0.498316431) diff(498234739) self: before(0.001223521) after(0.001240219) diff(16698) [davem@boricha build-x86_64-linux]$ The diff of 'process' should always be >= the diff of 'thread'. I make sure to wrap the 'thread' clock measurements the most tightly around the nanosleep() call, and that the 'process' clock measurements are the outer-most ones. --- #include <unistd.h> #include <stdio.h> #include <stdlib.h> #include <time.h> #include <fcntl.h> #include <string.h> #include <errno.h> #include <pthread.h> static pthread_barrier_t barrier; static void *chew_cpu(void *arg) { pthread_barrier_wait(&barrier); while (1) __asm__ __volatile__("" : : : "memory"); return NULL; } int main(void) { clockid_t process_clock, my_thread_clock, th_clock; struct timespec process_before, process_after; struct timespec me_before, me_after; struct timespec th_before, th_after; struct timespec sleeptime; unsigned long diff; pthread_t th; int err; err = clock_getcpuclockid(0, &process_clock); if (err) return 1; err = pthread_getcpuclockid(pthread_self(), &my_thread_clock); if (err) return 1; pthread_barrier_init(&barrier, NULL, 2); err = pthread_create(&th, NULL, chew_cpu, NULL); if (err) return 1; err = pthread_getcpuclockid(th, &th_clock); if (err) return 1; pthread_barrier_wait(&barrier); err = clock_gettime(process_clock, &process_before); if (err) return 1; err = clock_gettime(my_thread_clock, &me_before); if (err) return 1; err = clock_gettime(th_clock, &th_before); if (err) return 1; sleeptime.tv_sec = 0; sleeptime.tv_nsec = 500000000; nanosleep(&sleeptime, NULL); err = clock_gettime(th_clock, &th_after); if (err) return 1; err = clock_gettime(my_thread_clock, &me_after); if (err) return 1; err = clock_gettime(process_clock, &process_after); if (err) return 1; diff = process_after.tv_nsec - process_before.tv_nsec; printf("process: before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n", process_before.tv_sec, process_before.tv_nsec, process_after.tv_sec, process_after.tv_nsec, diff); diff = th_after.tv_nsec - th_before.tv_nsec; printf("thread: before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n", th_before.tv_sec, th_before.tv_nsec, th_after.tv_sec, th_after.tv_nsec, diff); diff = me_after.tv_nsec - me_before.tv_nsec; printf("self: before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n", me_before.tv_sec, me_before.tv_nsec, me_after.tv_sec, me_after.tv_nsec, diff); return 0; } This is due to us using p->se.sum_exec_runtime in thread_group_cputime() where we iterate the thread group and sum all data. This does not take time since the last schedule operation (tick or otherwise) into account. We can cure this by using task_sched_runtime() at the cost of having to take locks. This also means we can (and must) do away with thread_group_sched_runtime() since the modified thread_group_cputime() is now more accurate and would deadlock when called from thread_group_sched_runtime(). Aside of that it makes the function safe on 32 bit systems. The old code added t->se.sum_exec_runtime unprotected. sum_exec_runtime is a 64bit value and could be changed on another cpu at the same time. Reported-by: David Miller <davem@davemloft.net> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/1314874459.7945.22.camel@twins Tested-by: David Miller <davem@davemloft.net> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
-rw-r--r--include/linux/sched.h1
-rw-r--r--kernel/posix-cpu-timers.c5
-rw-r--r--kernel/sched.c24
3 files changed, 3 insertions, 27 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 14a6c7b545d..4ef452b93f6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1937,7 +1937,6 @@ static inline void disable_sched_clock_irqtime(void) {}
1937 1937
1938extern unsigned long long 1938extern unsigned long long
1939task_sched_runtime(struct task_struct *task); 1939task_sched_runtime(struct task_struct *task);
1940extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
1941 1940
1942/* sched_exec is called by processes performing an exec */ 1941/* sched_exec is called by processes performing an exec */
1943#ifdef CONFIG_SMP 1942#ifdef CONFIG_SMP
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 58f405b581e..c8008dd58ef 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -250,7 +250,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
250 do { 250 do {
251 times->utime = cputime_add(times->utime, t->utime); 251 times->utime = cputime_add(times->utime, t->utime);
252 times->stime = cputime_add(times->stime, t->stime); 252 times->stime = cputime_add(times->stime, t->stime);
253 times->sum_exec_runtime += t->se.sum_exec_runtime; 253 times->sum_exec_runtime += task_sched_runtime(t);
254 } while_each_thread(tsk, t); 254 } while_each_thread(tsk, t);
255out: 255out:
256 rcu_read_unlock(); 256 rcu_read_unlock();
@@ -312,7 +312,8 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
312 cpu->cpu = cputime.utime; 312 cpu->cpu = cputime.utime;
313 break; 313 break;
314 case CPUCLOCK_SCHED: 314 case CPUCLOCK_SCHED:
315 cpu->sched = thread_group_sched_runtime(p); 315 thread_group_cputime(p, &cputime);
316 cpu->sched = cputime.sum_exec_runtime;
316 break; 317 break;
317 } 318 }
318 return 0; 319 return 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index 6202e924c96..063d7a496f4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3713,30 +3713,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3713} 3713}
3714 3714
3715/* 3715/*
3716 * Return sum_exec_runtime for the thread group.
3717 * In case the task is currently running, return the sum plus current's
3718 * pending runtime that have not been accounted yet.
3719 *
3720 * Note that the thread group might have other running tasks as well,
3721 * so the return value not includes other pending runtime that other
3722 * running tasks might have.
3723 */
3724unsigned long long thread_group_sched_runtime(struct task_struct *p)
3725{
3726 struct task_cputime totals;
3727 unsigned long flags;
3728 struct rq *rq;
3729 u64 ns;
3730
3731 rq = task_rq_lock(p, &flags);
3732 thread_group_cputime(p, &totals);
3733 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3734 task_rq_unlock(rq, p, &flags);
3735
3736 return ns;
3737}
3738
3739/*
3740 * Account user cpu time to a process. 3716 * Account user cpu time to a process.
3741 * @p: the process that the cpu time gets accounted to 3717 * @p: the process that the cpu time gets accounted to
3742 * @cputime: the cpu time spent in user space since the last update 3718 * @cputime: the cpu time spent in user space since the last update