From 483b4ee60edbefdfbff0dd538fb81f368d9e7c0d Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 4 Feb 2009 11:59:44 -0800 Subject: sched: fix nohz load balancer on cpu offline Christian Borntraeger reports: > After a logical cpu offline, even on a complete idle system, there > is one cpu with full ticks. It turns out that nohz.cpu_mask has the > the offlined cpu still set. > > In select_nohz_load_balancer() we check if the system is completely > idle to turn of load balancing. We compare cpu_online_map with > nohz.cpu_mask. Since cpu_online_map is updated on cpu unplug, > but nohz.cpu_mask is not, the check fails and the scheduler believes > that we need an "idle load balancer" even on a fully idle system. > Since the ilb cpu does not deactivate the timer tick this breaks NOHZ. Fix the select_nohz_load_balancer() to not set the nohz.cpu_mask while a cpu is going offline. Reported-by: Christian Borntraeger Signed-off-by: Suresh Siddha Tested-by: Christian Borntraeger Signed-off-by: Ingo Molnar --- kernel/sched.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 242d0d47a70d..e1fc67d0674c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3890,19 +3890,24 @@ int select_nohz_load_balancer(int stop_tick) int cpu = smp_processor_id(); if (stop_tick) { - cpumask_set_cpu(cpu, nohz.cpu_mask); cpu_rq(cpu)->in_nohz_recently = 1; - /* - * If we are going offline and still the leader, give up! - */ - if (!cpu_active(cpu) && - atomic_read(&nohz.load_balancer) == cpu) { + if (!cpu_active(cpu)) { + if (atomic_read(&nohz.load_balancer) != cpu) + return 0; + + /* + * If we are going offline and still the leader, + * give up! + */ if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) BUG(); + return 0; } + cpumask_set_cpu(cpu, nohz.cpu_mask); + /* time for ilb owner also to sleep */ if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { if (atomic_read(&nohz.load_balancer) == cpu) -- cgit v1.2.2 From 32bd671d6cbeda60dc73be77fa2b9037d9a9bfa0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Feb 2009 12:24:15 +0100 Subject: signal: re-add dead task accumulation stats. We're going to split the process wide cpu accounting into two parts: - clocks; which can take all the time they want since they run from user context. - timers; which need constant time tracing but can affort the overhead because they're default off -- and rare. The clock readout will go back to a full sum of the thread group, for this we need to re-add the exit stats that were removed in the initial itimer rework (f06febc9: timers: fix itimer/many thread hang). Furthermore, since that full sum can be rather slow for large thread groups and we have the complete dead task stats, revert the do_notify_parent time computation. Signed-off-by: Peter Zijlstra Reviewed-by: Ingo Molnar Signed-off-by: Ingo Molnar --- kernel/exit.c | 3 +++ kernel/fork.c | 3 ++- kernel/signal.c | 8 ++++---- 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f80dec3f1875..efd30ccf3858 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -118,6 +118,8 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ + sig->utime = cputime_add(sig->utime, task_utime(tsk)); + sig->stime = cputime_add(sig->stime, task_stime(tsk)); sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; @@ -126,6 +128,7 @@ static void __exit_signal(struct task_struct *tsk) sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); + sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } diff --git a/kernel/fork.c b/kernel/fork.c index 242a706e7721..e8e854a04ad2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -851,13 +851,14 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->tty_old_pgrp = NULL; sig->tty = NULL; - sig->cutime = sig->cstime = cputime_zero; + sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; sig->gtime = cputime_zero; sig->cgtime = cputime_zero; sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; task_io_accounting_init(&sig->ioac); + sig->sum_sched_runtime = 0; taskstats_tgid_init(sig); task_lock(current->group_leader); diff --git a/kernel/signal.c b/kernel/signal.c index b6b36768b758..2a74fe87c0dd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1367,7 +1367,6 @@ int do_notify_parent(struct task_struct *tsk, int sig) struct siginfo info; unsigned long flags; struct sighand_struct *psig; - struct task_cputime cputime; int ret = sig; BUG_ON(sig == -1); @@ -1397,9 +1396,10 @@ int do_notify_parent(struct task_struct *tsk, int sig) info.si_uid = __task_cred(tsk)->uid; rcu_read_unlock(); - thread_group_cputime(tsk, &cputime); - info.si_utime = cputime_to_jiffies(cputime.utime); - info.si_stime = cputime_to_jiffies(cputime.stime); + info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, + tsk->signal->utime)); + info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime, + tsk->signal->stime)); info.si_status = tsk->exit_code & 0x7f; if (tsk->exit_code & 0x80) -- cgit v1.2.2 From 4cd4c1b40d40447fb5e7ba80746c6d7ba91d7a53 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Feb 2009 12:24:16 +0100 Subject: timers: split process wide cpu clocks/timers Change the process wide cpu timers/clocks so that we: 1) don't mess up the kernel with too many threads, 2) don't have a per-cpu allocation for each process, 3) have no impact when not used. In order to accomplish this we're going to split it into two parts: - clocks; which can take all the time they want since they run from user context -- ie. sys_clock_gettime(CLOCK_PROCESS_CPUTIME_ID) - timers; which need constant time sampling but since they're explicity used, the user can pay the overhead. The clock readout will go back to a full sum of the thread group, while the timers will run of a global 'clock' that only runs when needed, so only programs that make use of the facility pay the price. Signed-off-by: Peter Zijlstra Reviewed-by: Ingo Molnar Signed-off-by: Ingo Molnar --- kernel/itimer.c | 4 +- kernel/posix-cpu-timers.c | 95 +++++++++++++++++++++++++++++++++++++++++++++-- kernel/sched_stats.h | 45 ++++++++++++---------- 3 files changed, 119 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/itimer.c b/kernel/itimer.c index 6a5fe93dd8bd..58762f7077ec 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -62,7 +62,7 @@ int do_getitimer(int which, struct itimerval *value) struct task_cputime cputime; cputime_t utime; - thread_group_cputime(tsk, &cputime); + thread_group_cputimer(tsk, &cputime); utime = cputime.utime; if (cputime_le(cval, utime)) { /* about to fire */ cval = jiffies_to_cputime(1); @@ -82,7 +82,7 @@ int do_getitimer(int which, struct itimerval *value) struct task_cputime times; cputime_t ptime; - thread_group_cputime(tsk, ×); + thread_group_cputimer(tsk, ×); ptime = cputime_add(times.utime, times.stime); if (cputime_le(cval, ptime)) { /* about to fire */ cval = jiffies_to_cputime(1); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index fa07da94d7be..db107c9bbc05 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -230,6 +230,37 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, return 0; } +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) +{ + struct sighand_struct *sighand; + struct signal_struct *sig; + struct task_struct *t; + + *times = INIT_CPUTIME; + + rcu_read_lock(); + sighand = rcu_dereference(tsk->sighand); + if (!sighand) + goto out; + + sig = tsk->signal; + + t = tsk; + do { + times->utime = cputime_add(times->utime, t->utime); + times->stime = cputime_add(times->stime, t->stime); + times->sum_exec_runtime += t->se.sum_exec_runtime; + + t = next_thread(t); + } while (t != tsk); + + times->utime = cputime_add(times->utime, sig->utime); + times->stime = cputime_add(times->stime, sig->stime); + times->sum_exec_runtime += sig->sum_sched_runtime; +out: + rcu_read_unlock(); +} + /* * Sample a process (thread group) clock for the given group_leader task. * Must be called with tasklist_lock held for reading. @@ -475,6 +506,29 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) now); } +/* + * Enable the process wide cpu timer accounting. + * + * serialized using ->sighand->siglock + */ +static void start_process_timers(struct task_struct *tsk) +{ + tsk->signal->cputimer.running = 1; + barrier(); +} + +/* + * Release the process wide timer accounting -- timer stops ticking when + * nobody cares about it. + * + * serialized using ->sighand->siglock + */ +static void stop_process_timers(struct task_struct *tsk) +{ + tsk->signal->cputimer.running = 0; + barrier(); +} + /* * Insert the timer on the appropriate list before any timers that * expire later. This must be called with the tasklist_lock held @@ -495,6 +549,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) BUG_ON(!irqs_disabled()); spin_lock(&p->sighand->siglock); + if (!CPUCLOCK_PERTHREAD(timer->it_clock)) + start_process_timers(p); + listpos = head; if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { list_for_each_entry(next, head, entry) { @@ -987,13 +1044,15 @@ static void check_process_timers(struct task_struct *tsk, sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && list_empty(&timers[CPUCLOCK_VIRT]) && cputime_eq(sig->it_virt_expires, cputime_zero) && - list_empty(&timers[CPUCLOCK_SCHED])) + list_empty(&timers[CPUCLOCK_SCHED])) { + stop_process_timers(tsk); return; + } /* * Collect the current process totals. */ - thread_group_cputime(tsk, &cputime); + thread_group_cputimer(tsk, &cputime); utime = cputime.utime; ptime = cputime_add(utime, cputime.stime); sum_sched_runtime = cputime.sum_exec_runtime; @@ -1259,7 +1318,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk) if (!task_cputime_zero(&sig->cputime_expires)) { struct task_cputime group_sample; - thread_group_cputime(tsk, &group_sample); + thread_group_cputimer(tsk, &group_sample); if (task_cputime_expired(&group_sample, &sig->cputime_expires)) return 1; } @@ -1328,6 +1387,33 @@ void run_posix_cpu_timers(struct task_struct *tsk) } } +/* + * Sample a process (thread group) timer for the given group_leader task. + * Must be called with tasklist_lock held for reading. + */ +static int cpu_timer_sample_group(const clockid_t which_clock, + struct task_struct *p, + union cpu_time_count *cpu) +{ + struct task_cputime cputime; + + thread_group_cputimer(p, &cputime); + switch (CPUCLOCK_WHICH(which_clock)) { + default: + return -EINVAL; + case CPUCLOCK_PROF: + cpu->cpu = cputime_add(cputime.utime, cputime.stime); + break; + case CPUCLOCK_VIRT: + cpu->cpu = cputime.utime; + break; + case CPUCLOCK_SCHED: + cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); + break; + } + return 0; +} + /* * Set one of the process-wide special case CPU timers. * The tsk->sighand->siglock must be held by the caller. @@ -1341,7 +1427,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, struct list_head *head; BUG_ON(clock_idx == CPUCLOCK_SCHED); - cpu_clock_sample_group(clock_idx, tsk, &now); + start_process_timers(tsk); + cpu_timer_sample_group(clock_idx, tsk, &now); if (oldval) { if (!cputime_eq(*oldval, cputime_zero)) { diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 8ab0cef8ecab..a8f93dd374e1 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -296,19 +296,21 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) static inline void account_group_user_time(struct task_struct *tsk, cputime_t cputime) { - struct task_cputime *times; - struct signal_struct *sig; + struct thread_group_cputimer *cputimer; /* tsk == current, ensure it is safe to use ->signal */ if (unlikely(tsk->exit_state)) return; - sig = tsk->signal; - times = &sig->cputime.totals; + cputimer = &tsk->signal->cputimer; - spin_lock(×->lock); - times->utime = cputime_add(times->utime, cputime); - spin_unlock(×->lock); + if (!cputimer->running) + return; + + spin_lock(&cputimer->lock); + cputimer->cputime.utime = + cputime_add(cputimer->cputime.utime, cputime); + spin_unlock(&cputimer->lock); } /** @@ -324,19 +326,21 @@ static inline void account_group_user_time(struct task_struct *tsk, static inline void account_group_system_time(struct task_struct *tsk, cputime_t cputime) { - struct task_cputime *times; - struct signal_struct *sig; + struct thread_group_cputimer *cputimer; /* tsk == current, ensure it is safe to use ->signal */ if (unlikely(tsk->exit_state)) return; - sig = tsk->signal; - times = &sig->cputime.totals; + cputimer = &tsk->signal->cputimer; + + if (!cputimer->running) + return; - spin_lock(×->lock); - times->stime = cputime_add(times->stime, cputime); - spin_unlock(×->lock); + spin_lock(&cputimer->lock); + cputimer->cputime.stime = + cputime_add(cputimer->cputime.stime, cputime); + spin_unlock(&cputimer->lock); } /** @@ -352,7 +356,7 @@ static inline void account_group_system_time(struct task_struct *tsk, static inline void account_group_exec_runtime(struct task_struct *tsk, unsigned long long ns) { - struct task_cputime *times; + struct thread_group_cputimer *cputimer; struct signal_struct *sig; sig = tsk->signal; @@ -361,9 +365,12 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, if (unlikely(!sig)) return; - times = &sig->cputime.totals; + cputimer = &sig->cputimer; + + if (!cputimer->running) + return; - spin_lock(×->lock); - times->sum_exec_runtime += ns; - spin_unlock(×->lock); + spin_lock(&cputimer->lock); + cputimer->cputime.sum_exec_runtime += ns; + spin_unlock(&cputimer->lock); } -- cgit v1.2.2 From 3fccfd67df79c6351a156eb25a7a514e5f39c4d9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 Feb 2009 16:37:31 +0100 Subject: timers: split process wide cpu clocks/timers, fix To decrease the chance of a missed enable, always enable the timer when we sample it, we'll always disable it when we find that there are no active timers in the jiffy tick. This fixes a flood of warnings reported by Mike Galbraith. Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/posix-cpu-timers.c | 42 ++++++++++++++---------------------------- 1 file changed, 14 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index db107c9bbc05..e5d7bfdfa7d4 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -488,7 +488,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) { struct task_cputime cputime; - thread_group_cputime(tsk, &cputime); + thread_group_cputimer(tsk, &cputime); cleanup_timers(tsk->signal->cpu_timers, cputime.utime, cputime.stime, cputime.sum_exec_runtime); } @@ -506,29 +506,6 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) now); } -/* - * Enable the process wide cpu timer accounting. - * - * serialized using ->sighand->siglock - */ -static void start_process_timers(struct task_struct *tsk) -{ - tsk->signal->cputimer.running = 1; - barrier(); -} - -/* - * Release the process wide timer accounting -- timer stops ticking when - * nobody cares about it. - * - * serialized using ->sighand->siglock - */ -static void stop_process_timers(struct task_struct *tsk) -{ - tsk->signal->cputimer.running = 0; - barrier(); -} - /* * Insert the timer on the appropriate list before any timers that * expire later. This must be called with the tasklist_lock held @@ -549,9 +526,6 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) BUG_ON(!irqs_disabled()); spin_lock(&p->sighand->siglock); - if (!CPUCLOCK_PERTHREAD(timer->it_clock)) - start_process_timers(p); - listpos = head; if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { list_for_each_entry(next, head, entry) { @@ -1021,6 +995,19 @@ static void check_thread_timers(struct task_struct *tsk, } } +static void stop_process_timers(struct task_struct *tsk) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + unsigned long flags; + + if (!cputimer->running) + return; + + spin_lock_irqsave(&cputimer->lock, flags); + cputimer->running = 0; + spin_unlock_irqrestore(&cputimer->lock, flags); +} + /* * Check for any per-thread CPU timers that have fired and move them * off the tsk->*_timers list onto the firing list. Per-thread timers @@ -1427,7 +1414,6 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, struct list_head *head; BUG_ON(clock_idx == CPUCLOCK_SCHED); - start_process_timers(tsk); cpu_timer_sample_group(clock_idx, tsk, &now); if (oldval) { -- cgit v1.2.2 From 4da94d49b2ecb0a26e716a8811c3ecc542c2a65d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Feb 2009 11:30:27 +0100 Subject: timers: fix TIMER_ABSTIME for process wide cpu timers The POSIX timer interface allows for absolute time expiry values through the TIMER_ABSTIME flag, therefore we have to synchronize the timer to the clock every time we start it. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/posix-cpu-timers.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index e5d7bfdfa7d4..2313a4cc14ea 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -261,6 +261,40 @@ out: rcu_read_unlock(); } +static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) +{ + if (cputime_gt(b->utime, a->utime)) + a->utime = b->utime; + + if (cputime_gt(b->stime, a->stime)) + a->stime = b->stime; + + if (b->sum_exec_runtime > a->sum_exec_runtime) + a->sum_exec_runtime = b->sum_exec_runtime; +} + +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct task_cputime sum; + unsigned long flags; + + spin_lock_irqsave(&cputimer->lock, flags); + if (!cputimer->running) { + cputimer->running = 1; + /* + * The POSIX timer interface allows for absolute time expiry + * values through the TIMER_ABSTIME flag, therefore we have + * to synchronize the timer to the clock every time we start + * it. + */ + thread_group_cputime(tsk, &sum); + update_gt_cputime(&cputimer->cputime, &sum); + } + *times = cputimer->cputime; + spin_unlock_irqrestore(&cputimer->lock, flags); +} + /* * Sample a process (thread group) clock for the given group_leader task. * Must be called with tasklist_lock held for reading. -- cgit v1.2.2