path: root/kernel
author	Stanislaw Gruszka <sgruszka@redhat.com>	2014-11-12 10:58:44 -0500
committer	Ingo Molnar <mingo@kernel.org>	2014-11-16 04:04:20 -0500
commit	6e998916dfe327e785e7c2447959b2c1a3ea4930 (patch)
tree	2f3972afec8fadbdf79aba5aaaf1fbe31b83fbeb /kernel
parent	23cfa361f3e54a3e184a5e126bbbdd95f984881a (diff)
sched/cputime: Fix clock_nanosleep()/clock_gettime() inconsistency
Commit d670ec13178d0 "posix-cpu-timers: Cure SMP wobbles" fixes one glibc
test case at the cost of breaking another one. After that commit, calling
clock_nanosleep(TIMER_ABSTIME, X) and then clock_gettime(&Y) can result
in the Y time being smaller than the X time.

The reproducer/tester can be found further below; it can be compiled and
run with:

	gcc -o tst-cpuclock2 tst-cpuclock2.c -pthread
	while ./tst-cpuclock2 ; do : ; done

This reproducer, when run on a buggy kernel, will complain about
"clock_gettime difference too small".

The issue happens because, when the cputimer starts, thread_group_cputimer()
initializes the cputimer's sum_exec_runtime with the threads' runtime that
has not yet been accounted, and then on the scheduler tick the threads'
runtime is added to the running cputimer again, making its sum_exec_runtime
bigger than the actual threads' runtime.

KOSAKI Motohiro posted a fix for this problem, but that patch was never
applied: https://lkml.org/lkml/2013/5/26/191 .

This patch takes a different approach to cure the problem. It calls
update_curr() when the cputimer starts, which assures that we have updated
stats for the running threads, and on the next scheduler tick we will
account only the runtime that elapsed from the cputimer start. It also
assures that we have a consistent state between the CPU times of individual
threads and the CPU time of the process consisting of those threads.

Full reproducer (tst-cpuclock2.c):

#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <stdio.h>
#include <time.h>
#include <pthread.h>
#include <stdint.h>
#include <inttypes.h>

/* Parameters for the Linux kernel ABI for CPU clocks.  */
#define CPUCLOCK_SCHED          2
#define MAKE_PROCESS_CPUCLOCK(pid, clock) \
	((~(clockid_t) (pid) << 3) | (clockid_t) (clock))

static pthread_barrier_t barrier;

/* Help advance the clock.  */
static void *chew_cpu(void *arg)
{
	pthread_barrier_wait(&barrier);
	while (1) ;

	return NULL;
}

/* Don't use the glibc wrapper.  */
static int do_nanosleep(int flags, const struct timespec *req)
{
	clockid_t clock_id = MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED);

	return syscall(SYS_clock_nanosleep, clock_id, flags, req, NULL);
}

static int64_t tsdiff(const struct timespec *before, const struct timespec *after)
{
	int64_t before_i = before->tv_sec * 1000000000ULL + before->tv_nsec;
	int64_t after_i = after->tv_sec * 1000000000ULL + after->tv_nsec;

	return after_i - before_i;
}

int main(void)
{
	int result = 0;
	pthread_t th;

	pthread_barrier_init(&barrier, NULL, 2);

	if (pthread_create(&th, NULL, chew_cpu, NULL) != 0) {
		perror("pthread_create");
		return 1;
	}

	pthread_barrier_wait(&barrier);

	/* The test.  */
	struct timespec before, after, sleeptimeabs;
	int64_t sleepdiff, diffabs;
	const struct timespec sleeptime = {.tv_sec = 0,.tv_nsec = 100000000 };

	/* The relative nanosleep.  Not sure why this is needed, but its presence
	   seems to make it easier to reproduce the problem.  */
	if (do_nanosleep(0, &sleeptime) != 0) {
		perror("clock_nanosleep");
		return 1;
	}

	/* Get the current time.  */
	if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &before) < 0) {
		perror("clock_gettime[2]");
		return 1;
	}

	/* Compute the absolute sleep time based on the current time.  */
	uint64_t nsec = before.tv_nsec + sleeptime.tv_nsec;
	sleeptimeabs.tv_sec = before.tv_sec + nsec / 1000000000;
	sleeptimeabs.tv_nsec = nsec % 1000000000;

	/* Sleep for the computed time.  */
	if (do_nanosleep(TIMER_ABSTIME, &sleeptimeabs) != 0) {
		perror("absolute clock_nanosleep");
		return 1;
	}

	/* Get the time after the sleep.  */
	if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &after) < 0) {
		perror("clock_gettime[3]");
		return 1;
	}

	/* The time after sleep should always be equal to or after the absolute sleep
	   time passed to clock_nanosleep.  */
	sleepdiff = tsdiff(&sleeptimeabs, &after);
	if (sleepdiff < 0) {
		printf("absolute clock_nanosleep woke too early: %" PRId64 "\n", sleepdiff);
		result = 1;

		printf("Before %llu.%09llu\n", before.tv_sec, before.tv_nsec);
		printf("After  %llu.%09llu\n", after.tv_sec, after.tv_nsec);
		printf("Sleep  %llu.%09llu\n", sleeptimeabs.tv_sec, sleeptimeabs.tv_nsec);
	}

	/* The difference between the timestamps taken before and after the
	   clock_nanosleep call should be equal to or more than the duration of the
	   sleep.  */
	diffabs = tsdiff(&before, &after);
	if (diffabs < sleeptime.tv_nsec) {
		printf("clock_gettime difference too small: %" PRId64 "\n", diffabs);
		result = 1;
	}

	pthread_cancel(th);

	return result;
}

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/20141112155843.GA24803@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
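The double accounting described above can be illustrated with a small
userspace simulation. This is not kernel code: thread_runtime, unaccounted
and cputimer below are made-up stand-ins for the thread's sum_exec_runtime,
its not-yet-accounted delta, and the group cputimer, just to show how the
process clock ends up ahead of the sum of the per-thread clocks:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Hypothetical numbers: runtime already accounted to the only thread,
	   and the delta it has run since the last accounting.  */
	uint64_t thread_runtime = 1000000;	/* ns */
	uint64_t unaccounted    = 300000;	/* ns */

	/* Buggy start: the cputimer is seeded with accounted plus unaccounted
	   time (what task_sched_runtime() used to return via
	   do_task_delta_exec()).  */
	uint64_t cputimer = thread_runtime + unaccounted;

	/* Next scheduler tick: the same delta is folded into the thread *and*
	   added to the already running cputimer a second time.  */
	thread_runtime += unaccounted;
	cputimer += unaccounted;

	/* The process clock is now ahead of the per-thread runtime, which is
	   how an absolute clock_nanosleep() can expire before a subsequent
	   clock_gettime() catches up.  */
	printf("per-thread runtime: %llu ns\n", (unsigned long long)thread_runtime);
	printf("group cputimer:     %llu ns\n", (unsigned long long)cputimer);
	return 0;
}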
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/sched/core.c	38
-rw-r--r--	kernel/sched/deadline.c	2
-rw-r--r--	kernel/sched/fair.c	7
-rw-r--r--	kernel/sched/rt.c	2
-rw-r--r--	kernel/sched/sched.h	2
5 files changed, 24 insertions, 27 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 797a6c84c48d..24beb9bb4c3e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2475,31 +2475,6 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
 
 /*
- * Return any ns on the sched_clock that have not yet been accounted in
- * @p in case that task is currently running.
- *
- * Called with task_rq_lock() held on @rq.
- */
-static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
-{
-	u64 ns = 0;
-
-	/*
-	 * Must be ->curr _and_ ->on_rq. If dequeued, we would
-	 * project cycles that may never be accounted to this
-	 * thread, breaking clock_gettime().
-	 */
-	if (task_current(rq, p) && task_on_rq_queued(p)) {
-		update_rq_clock(rq);
-		ns = rq_clock_task(rq) - p->se.exec_start;
-		if ((s64)ns < 0)
-			ns = 0;
-	}
-
-	return ns;
-}
-
-/*
  * Return accounted runtime for the task.
  * In case the task is currently running, return the runtime plus current's
  * pending runtime that have not been accounted yet.
@@ -2508,7 +2483,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 {
 	unsigned long flags;
 	struct rq *rq;
-	u64 ns = 0;
+	u64 ns;
 
 #if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
 	/*
@@ -2527,7 +2502,16 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 #endif
 
 	rq = task_rq_lock(p, &flags);
-	ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
+	/*
+	 * Must be ->curr _and_ ->on_rq. If dequeued, we would
+	 * project cycles that may never be accounted to this
+	 * thread, breaking clock_gettime().
+	 */
+	if (task_current(rq, p) && task_on_rq_queued(p)) {
+		update_rq_clock(rq);
+		p->sched_class->update_curr(rq);
+	}
+	ns = p->se.sum_exec_runtime;
 	task_rq_unlock(rq, p, &flags);
 
 	return ns;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5285332392d5..28fa9d9e9201 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1701,4 +1701,6 @@ const struct sched_class dl_sched_class = {
 	.prio_changed		= prio_changed_dl,
 	.switched_from		= switched_from_dl,
 	.switched_to		= switched_to_dl,
+
+	.update_curr		= update_curr_dl,
 };
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3af3d1e7df9b..ef2b104b254c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -726,6 +726,11 @@ static void update_curr(struct cfs_rq *cfs_rq)
 	account_cfs_rq_runtime(cfs_rq, delta_exec);
 }
 
+static void update_curr_fair(struct rq *rq)
+{
+	update_curr(cfs_rq_of(&rq->curr->se));
+}
+
 static inline void
 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
@@ -7956,6 +7961,8 @@ const struct sched_class fair_sched_class = {
 
 	.get_rr_interval	= get_rr_interval_fair,
 
+	.update_curr		= update_curr_fair,
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	.task_move_group	= task_move_group_fair,
 #endif
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d024e6ce30ba..20bca398084a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2128,6 +2128,8 @@ const struct sched_class rt_sched_class = {
 
 	.prio_changed		= prio_changed_rt,
 	.switched_to		= switched_to_rt,
+
+	.update_curr		= update_curr_rt,
 };
 
 #ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 24156c8434d1..2df8ef067cc5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1135,6 +1135,8 @@ struct sched_class {
 	unsigned int (*get_rr_interval) (struct rq *rq,
 					 struct task_struct *task);
 
+	void (*update_curr) (struct rq *rq);
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	void (*task_move_group) (struct task_struct *p, int on_rq);
 #endif
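The new ->update_curr() callback added above follows the usual struct
sched_class pattern: each scheduling class supplies its own implementation,
and core code calls it through p->sched_class without knowing which class
the task belongs to. As a rough, self-contained userspace sketch of that
dispatch (every toy_* name below is hypothetical, not a kernel identifier):

#include <stdio.h>

/* Stand-in for struct rq: just enough state to have something to update. */
struct toy_rq {
	unsigned long long clock;		/* analogue of rq_clock_task()        */
	unsigned long long curr_runtime;	/* analogue of p->se.sum_exec_runtime */
	unsigned long long curr_start;		/* analogue of p->se.exec_start       */
};

/* Analogue of struct sched_class: a table of per-class callbacks. */
struct toy_sched_class {
	void (*update_curr)(struct toy_rq *rq);
};

/* Analogue of a class's update_curr(): fold the elapsed delta into the task. */
static void toy_update_curr(struct toy_rq *rq)
{
	rq->curr_runtime += rq->clock - rq->curr_start;
	rq->curr_start = rq->clock;
}

static const struct toy_sched_class toy_class = {
	.update_curr = toy_update_curr,
};

int main(void)
{
	struct toy_rq rq = { .clock = 500, .curr_runtime = 100, .curr_start = 400 };

	/* Core code only sees the class pointer, like p->sched_class->update_curr(rq). */
	toy_class.update_curr(&rq);

	printf("runtime after update_curr: %llu\n", rq.curr_runtime); /* prints 200 */
	return 0;
}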