From ca94c442535a44d508c99a77e54f21a59f4fc462 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 15 Jun 2009 17:17:47 +0200 Subject: sched: Introduce SCHED_RESET_ON_FORK scheduling policy flag This patch introduces a new flag SCHED_RESET_ON_FORK which can be passed to the kernel via sched_setscheduler(), ORed in the policy parameter. If set this will make sure that when the process forks a) the scheduling priority is reset to DEFAULT_PRIO if it was higher and b) the scheduling policy is reset to SCHED_NORMAL if it was either SCHED_FIFO or SCHED_RR. Why have this? Currently, if a process is real-time scheduled this will 'leak' to all its child processes. For security reasons it is often (always?) a good idea to make sure that if a process acquires RT scheduling this is confined to this process and only this process. More specifically this makes the per-process resource limit RLIMIT_RTTIME useful for security purposes, because it makes it impossible to use a fork bomb to circumvent the per-process RLIMIT_RTTIME accounting. This feature is also useful for tools like 'renice' which can then change the nice level of a process without having this spill to all its child processes. Why expose this via sched_setscheduler() and not other syscalls such as prctl() or sched_setparam()? prctl() does not take a pid parameter. Due to that it would be impossible to modify this flag for other processes than the current one. The struct passed to sched_setparam() can unfortunately not be extended without breaking compatibility, since sched_setparam() lacks a size parameter. How to use this from userspace? In your RT program simply replace this: sched_setscheduler(pid, SCHED_FIFO, ¶m); by this: sched_setscheduler(pid, SCHED_FIFO|SCHED_RESET_ON_FORK, ¶m); Signed-off-by: Lennart Poettering Acked-by: Peter Zijlstra LKML-Reference: <20090615152714.GA29092@tango.0pointer.de> Signed-off-by: Ingo Molnar --- kernel/sched.c | 49 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 9 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 8ec9d13140be..32e6ede85255 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2613,12 +2613,28 @@ void sched_fork(struct task_struct *p, int clone_flags) set_task_cpu(p, cpu); /* - * Make sure we do not leak PI boosting priority to the child: + * Revert to default priority/policy on fork if requested. Make sure we + * do not leak PI boosting priority to the child. */ - p->prio = current->normal_prio; + if (current->sched_reset_on_fork && + (p->policy == SCHED_FIFO || p->policy == SCHED_RR)) + p->policy = SCHED_NORMAL; + + if (current->sched_reset_on_fork && + (current->normal_prio < DEFAULT_PRIO)) + p->prio = DEFAULT_PRIO; + else + p->prio = current->normal_prio; + if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; + /* + * We don't need the reset flag anymore after the fork. It has + * fulfilled its duty: + */ + p->sched_reset_on_fork = 0; + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -6094,17 +6110,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy, unsigned long flags; const struct sched_class *prev_class = p->sched_class; struct rq *rq; + int reset_on_fork; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); recheck: /* double check policy once rq lock held */ - if (policy < 0) + if (policy < 0) { + reset_on_fork = p->sched_reset_on_fork; policy = oldpolicy = p->policy; - else if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL && policy != SCHED_BATCH && - policy != SCHED_IDLE) - return -EINVAL; + } else { + reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); + policy &= ~SCHED_RESET_ON_FORK; + + if (policy != SCHED_FIFO && policy != SCHED_RR && + policy != SCHED_NORMAL && policy != SCHED_BATCH && + policy != SCHED_IDLE) + return -EINVAL; + } + /* * Valid priorities for SCHED_FIFO and SCHED_RR are * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, @@ -6148,6 +6172,10 @@ recheck: /* can't change other user's priorities */ if (!check_same_owner(p)) return -EPERM; + + /* Normal users shall not reset the sched_reset_on_fork flag */ + if (p->sched_reset_on_fork && !reset_on_fork) + return -EPERM; } if (user) { @@ -6191,6 +6219,8 @@ recheck: if (running) p->sched_class->put_prev_task(rq, p); + p->sched_reset_on_fork = reset_on_fork; + oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); @@ -6307,14 +6337,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) if (p) { retval = security_task_getscheduler(p); if (!retval) - retval = p->policy; + retval = p->policy + | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); } read_unlock(&tasklist_lock); return retval; } /** - * sys_sched_getscheduler - get the RT priority of a thread + * sys_sched_getparam - get the RT priority of a thread * @pid: the pid in question. * @param: structure containing the RT priority. */ -- cgit v1.2.2 From b9dc29e72fd3dc2a739ce8eafd958220d0745734 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 17 Jun 2009 10:46:01 +0200 Subject: sched: Clean up SCHED_RESET_ON_FORK Make SCHED_RESET_ON_FORK sched_fork() bits a self-contained unlikely code path. Signed-off-by: Mike Galbraith Acked-by: Lennart Poettering Cc: Peter Zijlstra LKML-Reference: <1245228361.18329.6.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 32e6ede85255..50e4e3d15e83 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2613,28 +2613,30 @@ void sched_fork(struct task_struct *p, int clone_flags) set_task_cpu(p, cpu); /* - * Revert to default priority/policy on fork if requested. Make sure we - * do not leak PI boosting priority to the child. + * Make sure we do not leak PI boosting priority to the child. */ - if (current->sched_reset_on_fork && - (p->policy == SCHED_FIFO || p->policy == SCHED_RR)) - p->policy = SCHED_NORMAL; + p->prio = current->normal_prio; - if (current->sched_reset_on_fork && - (current->normal_prio < DEFAULT_PRIO)) - p->prio = DEFAULT_PRIO; - else - p->prio = current->normal_prio; + /* + * Revert to default priority/policy on fork if requested. + */ + if (unlikely(p->sched_reset_on_fork)) { + if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) + p->policy = SCHED_NORMAL; + + if (p->normal_prio < DEFAULT_PRIO) + p->prio = DEFAULT_PRIO; + + /* + * We don't need the reset flag anymore after the fork. It has + * fulfilled its duty: + */ + p->sched_reset_on_fork = 0; + } if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; - /* - * We don't need the reset flag anymore after the fork. It has - * fulfilled its duty: - */ - p->sched_reset_on_fork = 0; - #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); -- cgit v1.2.2 From 6c697bdf08a09ce461e305a22362973036e95db3 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 17 Jun 2009 10:48:02 +0200 Subject: sched: Add SCHED_RESET_ON_FORK functionality for nice < 0 tasks Signed-off-by: Mike Galbraith Acked-by: Lennart Poettering Cc: Peter Zijlstra LKML-Reference: <1245228482.27326.1.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 50e4e3d15e83..34f94240642f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2627,6 +2627,11 @@ void sched_fork(struct task_struct *p, int clone_flags) if (p->normal_prio < DEFAULT_PRIO) p->prio = DEFAULT_PRIO; + if (PRIO_TO_NICE(p->static_prio) < 0) { + p->static_prio = NICE_TO_PRIO(0); + set_load_weight(p); + } + /* * We don't need the reset flag anymore after the fork. It has * fulfilled its duty: -- cgit v1.2.2 From b9bf3121af348d9255f1c917830fe8c2df52efcb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2009 15:13:47 +0900 Subject: percpu: use DEFINE_PER_CPU_SHARED_ALIGNED() There are a few places where ___cacheline_aligned* is used with DEFINE_PER_CPU(). Use DEFINE_PER_CPU_SHARED_ALIGNED() instead. DEFINE_PER_CPU_SHARED_ALIGNED() applies alignment only on SMPs. While all other converted places used _in_smp variant or only get compiled for SMP, net/rds used unconditional ____cacheline_aligned. I don't see any reason these data structures should be aligned on UP and thus converted together. Signed-off-by: Tejun Heo Cc: Mike Frysinger Cc: Tony Luck Cc: Andy Grover --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 7c9098d186e6..34fd81d21784 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -318,12 +318,12 @@ struct task_group root_task_group; /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ -static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_cfs_rq); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); -static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); #endif /* CONFIG_RT_GROUP_SCHED */ #else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group -- cgit v1.2.2 From 54d35f29f49224d86b994acb6e5969b9ba09022d Mon Sep 17 00:00:00 2001 From: Hitoshi Mitake Date: Mon, 29 Jun 2009 14:44:57 +0900 Subject: sched: Hide runqueues from direct reference at source code level for __raw_get_cpu_var() Hide __raw_get_cpu_var() as well - thus all the direct references to runqueues will abstracted out. Signed-off-by: Hitoshi Mitake LKML-Reference: <20090629.144457.886429910353660979.mitake@dcl.info.waseda.ac.jp> Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 168b2680ae27..ebc5151f88c8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -692,6 +692,7 @@ static inline int cpu_of(struct rq *rq) #define this_rq() (&__get_cpu_var(runqueues)) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +#define raw_rq() (&__raw_get_cpu_var(runqueues)) inline void update_rq_clock(struct rq *rq) { @@ -6669,7 +6670,7 @@ EXPORT_SYMBOL(yield); */ void __sched io_schedule(void) { - struct rq *rq = &__raw_get_cpu_var(runqueues); + struct rq *rq = raw_rq(); delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); @@ -6681,7 +6682,7 @@ EXPORT_SYMBOL(io_schedule); long __sched io_schedule_timeout(long timeout) { - struct rq *rq = &__raw_get_cpu_var(runqueues); + struct rq *rq = raw_rq(); long ret; delayacct_blkio_start(); -- cgit v1.2.2 From 03b042bf1dc14a268a3d65d38b4ec2a4261e8477 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 Jun 2009 09:08:16 -0700 Subject: rcu: Add synchronize_sched_expedited() primitive This adds the synchronize_sched_expedited() primitive that implements the "big hammer" expedited RCU grace periods. This primitive is placed in kernel/sched.c rather than kernel/rcupdate.c due to its need to interact closely with the migration_thread() kthread. The idea is to wake up this kthread with req->task set to NULL, in response to which the kthread reports the quiescent state resulting from the kthread having been scheduled. Because this patch needs to fallback to the slow versions of the primitives in response to some races with CPU onlining and offlining, a new synchronize_rcu_bh() primitive is added as well. Signed-off-by: Paul E. McKenney Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Cc: davem@davemloft.net Cc: dada1@cosmosbay.com Cc: zbr@ioremap.net Cc: jeff.chua.linux@gmail.com Cc: paulus@samba.org Cc: laijs@cn.fujitsu.com Cc: jengelh@medozas.de Cc: r000n@r000n.net Cc: benh@kernel.crashing.org Cc: mathieu.desnoyers@polymtl.ca LKML-Reference: <12459460982947-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/sched.c | 129 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 127 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 7c9098d186e6..9ae80bec1c1e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7024,6 +7024,11 @@ fail: return ret; } +#define RCU_MIGRATION_IDLE 0 +#define RCU_MIGRATION_NEED_QS 1 +#define RCU_MIGRATION_GOT_QS 2 +#define RCU_MIGRATION_MUST_SYNC 3 + /* * migration_thread - this is a highprio system thread that performs * thread migration by bumping thread off CPU then 'pushing' onto @@ -7031,6 +7036,7 @@ fail: */ static int migration_thread(void *data) { + int badcpu; int cpu = (long)data; struct rq *rq; @@ -7065,8 +7071,17 @@ static int migration_thread(void *data) req = list_entry(head->next, struct migration_req, list); list_del_init(head->next); - spin_unlock(&rq->lock); - __migrate_task(req->task, cpu, req->dest_cpu); + if (req->task != NULL) { + spin_unlock(&rq->lock); + __migrate_task(req->task, cpu, req->dest_cpu); + } else if (likely(cpu == (badcpu = smp_processor_id()))) { + req->dest_cpu = RCU_MIGRATION_GOT_QS; + spin_unlock(&rq->lock); + } else { + req->dest_cpu = RCU_MIGRATION_MUST_SYNC; + spin_unlock(&rq->lock); + WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); + } local_irq_enable(); complete(&req->done); @@ -10554,3 +10569,113 @@ struct cgroup_subsys cpuacct_subsys = { .subsys_id = cpuacct_subsys_id, }; #endif /* CONFIG_CGROUP_CPUACCT */ + +#ifndef CONFIG_SMP + +int rcu_expedited_torture_stats(char *page) +{ + return 0; +} +EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); + +void synchronize_sched_expedited(void) +{ +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#else /* #ifndef CONFIG_SMP */ + +static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); +static DEFINE_MUTEX(rcu_sched_expedited_mutex); + +#define RCU_EXPEDITED_STATE_POST -2 +#define RCU_EXPEDITED_STATE_IDLE -1 + +static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; + +int rcu_expedited_torture_stats(char *page) +{ + int cnt = 0; + int cpu; + + cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); + for_each_online_cpu(cpu) { + cnt += sprintf(&page[cnt], " %d:%d", + cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); + } + cnt += sprintf(&page[cnt], "\n"); + return cnt; +} +EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); + +static long synchronize_sched_expedited_count; + +/* + * Wait for an rcu-sched grace period to elapse, but use "big hammer" + * approach to force grace period to end quickly. This consumes + * significant time on all CPUs, and is thus not recommended for + * any sort of common-case code. + * + * Note that it is illegal to call this function while holding any + * lock that is acquired by a CPU-hotplug notifier. Failing to + * observe this restriction will result in deadlock. + */ +void synchronize_sched_expedited(void) +{ + int cpu; + unsigned long flags; + bool need_full_sync = 0; + struct rq *rq; + struct migration_req *req; + long snap; + int trycount = 0; + + smp_mb(); /* ensure prior mod happens before capturing snap. */ + snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; + get_online_cpus(); + while (!mutex_trylock(&rcu_sched_expedited_mutex)) { + put_online_cpus(); + if (trycount++ < 10) + udelay(trycount * num_online_cpus()); + else { + synchronize_sched(); + return; + } + if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { + smp_mb(); /* ensure test happens before caller kfree */ + return; + } + get_online_cpus(); + } + rcu_expedited_state = RCU_EXPEDITED_STATE_POST; + for_each_online_cpu(cpu) { + rq = cpu_rq(cpu); + req = &per_cpu(rcu_migration_req, cpu); + init_completion(&req->done); + req->task = NULL; + req->dest_cpu = RCU_MIGRATION_NEED_QS; + spin_lock_irqsave(&rq->lock, flags); + list_add(&req->list, &rq->migration_queue); + spin_unlock_irqrestore(&rq->lock, flags); + wake_up_process(rq->migration_thread); + } + for_each_online_cpu(cpu) { + rcu_expedited_state = cpu; + req = &per_cpu(rcu_migration_req, cpu); + rq = cpu_rq(cpu); + wait_for_completion(&req->done); + spin_lock_irqsave(&rq->lock, flags); + if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) + need_full_sync = 1; + req->dest_cpu = RCU_MIGRATION_IDLE; + spin_unlock_irqrestore(&rq->lock, flags); + } + rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; + mutex_unlock(&rcu_sched_expedited_mutex); + put_online_cpus(); + if (need_full_sync) + synchronize_sched(); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#endif /* #else #ifndef CONFIG_SMP */ -- cgit v1.2.2 From e7aaaa6934636d7a6cadd9e2a05250fbb6a34f65 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: sched: Drop the need_resched() loop from cond_resched() The schedule() function is a loop that reschedules the current task while the TIF_NEED_RESCHED flag is set: void schedule(void) { need_resched: /* schedule code */ if (need_resched()) goto need_resched; } And cond_resched() repeat this loop: do { add_preempt_count(PREEMPT_ACTIVE); schedule(); sub_preempt_count(PREEMPT_ACTIVE); } while(need_resched()); This loop is needless because schedule() already did the check and nothing can set TIF_NEED_RESCHED between schedule() exit and the loop check in need_resched(). Then remove this needless loop. Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 03f7e3fd80b5..4c5ee843d57f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6618,11 +6618,9 @@ static void __cond_resched(void) * PREEMPT_ACTIVE, which could trigger a second * cond_resched() call. */ - do { - add_preempt_count(PREEMPT_ACTIVE); - schedule(); - sub_preempt_count(PREEMPT_ACTIVE); - } while (need_resched()); + add_preempt_count(PREEMPT_ACTIVE); + schedule(); + sub_preempt_count(PREEMPT_ACTIVE); } int __sched _cond_resched(void) -- cgit v1.2.2 From 4b2155678d7cc7b5f45d6b36049091376c3408a2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: sched: Remove obsolete comment in __cond_resched() Remove the outdated comment from __cond_resched() related to the now removed Big Kernel Semaphore. Reported-by: Arnd Bergmann Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 4c5ee843d57f..4d39e96044b9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6613,11 +6613,6 @@ static void __cond_resched(void) #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP __might_sleep(__FILE__, __LINE__); #endif - /* - * The BKS might be reacquired before we have dropped - * PREEMPT_ACTIVE, which could trigger a second - * cond_resched() call. - */ add_preempt_count(PREEMPT_ACTIVE); schedule(); sub_preempt_count(PREEMPT_ACTIVE); -- cgit v1.2.2 From e09758fae8ccde97e026c704319eaa18d488dc86 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: sched: Cover the CONFIG_DEBUG_SPINLOCK_SLEEP off-case for __might_sleep() Cover the off case for __might_sleep(), so that we avoid #ifdefs in files that make use of it. Especially, this prepares for the __might_sleep() pull up on cond_resched(). Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 4d39e96044b9..370a6c31c5e1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6610,9 +6610,8 @@ static inline int should_resched(void) static void __cond_resched(void) { -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP __might_sleep(__FILE__, __LINE__); -#endif + add_preempt_count(PREEMPT_ACTIVE); schedule(); sub_preempt_count(PREEMPT_ACTIVE); -- cgit v1.2.2 From e4aafea2d4bde8b44d6500c4ee7195bbfc51269e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: sched: Add a preempt count base offset to __might_sleep() Add a preempt count base offset to compare against the current preempt level count. It prepares to pull up the might_sleep check from cond_resched() to cond_resched_lock() and cond_resched_bh(). For these two helpers, we need to respectively ensure that once we'll unlock the given spinlock / reenable local softirqs, we will reach a sleepable state. Signed-off-by: Frederic Weisbecker [ Move and rename preempt_count_equals() ] Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 370a6c31c5e1..3ff4d004bd95 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6610,7 +6610,7 @@ static inline int should_resched(void) static void __cond_resched(void) { - __might_sleep(__FILE__, __LINE__); + __might_sleep(__FILE__, __LINE__, 0); add_preempt_count(PREEMPT_ACTIVE); schedule(); @@ -9429,13 +9429,20 @@ void __init sched_init(void) } #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP -void __might_sleep(char *file, int line) +static inline int preempt_count_equals(int preempt_offset) +{ + int nested = preempt_count() & ~PREEMPT_ACTIVE; + + return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); +} + +void __might_sleep(char *file, int line, int preempt_offset) { #ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ - if ((!in_atomic() && !irqs_disabled()) || - system_state != SYSTEM_RUNNING || oops_in_progress) + if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || + system_state != SYSTEM_RUNNING || oops_in_progress) return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; -- cgit v1.2.2 From 613afbf83298efaead05ebcac23d2285609d7160 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: sched: Pull up the might_sleep() check into cond_resched() might_sleep() is called late-ish in cond_resched(), after the need_resched()/preempt enabled/system running tests are checked. It's better to check the sleeps while atomic earlier and not depend on some environment datas that reduce the chances to detect a problem. Also define cond_resched_*() helpers as macros, so that the FILE/LINE reported in the sleeping while atomic warning displays the real origin and not sched.h Changes in v2: - Call __might_sleep() directly instead of might_sleep() which may call cond_resched() - Turn cond_resched() into a macro so that the file:line couple reported refers to the caller of cond_resched() and not __cond_resched() itself. Changes in v3: - Also propagate this __might_sleep() pull up to cond_resched_lock() and cond_resched_softirq() Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-6-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3ff4d004bd95..1f7919add8ae 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6610,8 +6610,6 @@ static inline int should_resched(void) static void __cond_resched(void) { - __might_sleep(__FILE__, __LINE__, 0); - add_preempt_count(PREEMPT_ACTIVE); schedule(); sub_preempt_count(PREEMPT_ACTIVE); @@ -6628,14 +6626,14 @@ int __sched _cond_resched(void) EXPORT_SYMBOL(_cond_resched); /* - * cond_resched_lock() - if a reschedule is pending, drop the given lock, + * __cond_resched_lock() - if a reschedule is pending, drop the given lock, * call schedule, and on return reacquire the lock. * * This works OK both with and without CONFIG_PREEMPT. We do strange low-level * operations here to prevent schedule() from being called twice (once via * spin_unlock(), once by hand). */ -int cond_resched_lock(spinlock_t *lock) +int __cond_resched_lock(spinlock_t *lock) { int resched = should_resched(); int ret = 0; @@ -6651,9 +6649,9 @@ int cond_resched_lock(spinlock_t *lock) } return ret; } -EXPORT_SYMBOL(cond_resched_lock); +EXPORT_SYMBOL(__cond_resched_lock); -int __sched cond_resched_softirq(void) +int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); @@ -6665,7 +6663,7 @@ int __sched cond_resched_softirq(void) } return 0; } -EXPORT_SYMBOL(cond_resched_softirq); +EXPORT_SYMBOL(__cond_resched_softirq); /** * yield - yield the current processor to other threads. -- cgit v1.2.2 From a004cd42181409eda70804ded240a791f4564d61 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Jul 2009 09:54:05 +0200 Subject: sched: Fix return value of migration_init() migration_init() returns the return value of the hotplug notifier. In the success case this is NOTIFY_OK which is 1. initcall_debug evaluates that as an error code because init calls are expected to return 0 on success. Signed-off-by: Thomas Gleixner --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 1f7919add8ae..953f037dc053 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7654,7 +7654,7 @@ static int __init migration_init(void) migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); - return err; + return 0; } early_initcall(migration_init); #endif -- cgit v1.2.2 From a5004278f0525dcb9aa43703ef77bf371ea837cd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 27 Jul 2009 14:04:49 +0200 Subject: sched: Fix cgroup smp fairness Commit ec4e0e2fe018992d980910db901637c814575914 ("fix inconsistency when redistribute per-cpu tg->cfs_rq shares") broke cgroup smp fairness. In order to avoid starvation of newly placed tasks, we never quite set the share of an empty cpu group-task to 0, but instead we set it as if there's a single NICE-0 task present. If however we actually set this in cfs_rq[cpu]->shares, that means the total shares for that group will be slightly inflated every time we balance, causing the observed unfairness. Fix this by setting cfs_rq[cpu]->shares to 0 but actually setting the effective weight of the related se to the inflated number. Signed-off-by: Peter Zijlstra LKML-Reference: <1248696557.6987.1615.camel@twins> Signed-off-by: Ingo Molnar --- kernel/sched.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index ce1056e9b02a..26976cd8be0f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1523,13 +1523,18 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long sd_shares, unsigned long sd_rq_weight) { - unsigned long shares; unsigned long rq_weight; + unsigned long shares; + int boost = 0; if (!tg->se[cpu]) return; rq_weight = tg->cfs_rq[cpu]->rq_weight; + if (!rq_weight) { + boost = 1; + rq_weight = NICE_0_LOAD; + } /* * \Sum shares * rq_weight @@ -1546,8 +1551,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long flags; spin_lock_irqsave(&rq->lock, flags); - tg->cfs_rq[cpu]->shares = shares; - + tg->cfs_rq[cpu]->shares = boost ? 0 : shares; __set_se_shares(tg->se[cpu], shares); spin_unlock_irqrestore(&rq->lock, flags); } @@ -1560,7 +1564,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu, */ static int tg_shares_up(struct task_group *tg, void *data) { - unsigned long weight, rq_weight = 0; + unsigned long weight, rq_weight = 0, eff_weight = 0; unsigned long shares = 0; struct sched_domain *sd = data; int i; @@ -1572,11 +1576,13 @@ static int tg_shares_up(struct task_group *tg, void *data) * run here it will not get delayed by group starvation. */ weight = tg->cfs_rq[i]->load.weight; + tg->cfs_rq[i]->rq_weight = weight; + rq_weight += weight; + if (!weight) weight = NICE_0_LOAD; - tg->cfs_rq[i]->rq_weight = weight; - rq_weight += weight; + eff_weight += weight; shares += tg->cfs_rq[i]->shares; } @@ -1586,8 +1592,14 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) shares = tg->shares; - for_each_cpu(i, sched_domain_span(sd)) - update_group_shares_cpu(tg, i, shares, rq_weight); + for_each_cpu(i, sched_domain_span(sd)) { + unsigned long sd_rq_weight = rq_weight; + + if (!tg->cfs_rq[i]->rq_weight) + sd_rq_weight = eff_weight; + + update_group_shares_cpu(tg, i, shares, sd_rq_weight); + } return 0; } -- cgit v1.2.2 From e709715915d69b6a929d77e7652c9c3fea61c317 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Jun 2009 15:41:20 +0200 Subject: sched: Optimize unused cgroup configuration When cgroup group scheduling is built in, skip some code paths if we don't have any (but the root) cgroups configured. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 26976cd8be0f..ca1f76ba7773 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1629,8 +1629,14 @@ static int tg_load_down(struct task_group *tg, void *data) static void update_shares(struct sched_domain *sd) { - u64 now = cpu_clock(raw_smp_processor_id()); - s64 elapsed = now - sd->last_update; + s64 elapsed; + u64 now; + + if (root_task_group_empty()) + return; + + now = cpu_clock(raw_smp_processor_id()); + elapsed = now - sd->last_update; if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { sd->last_update = now; @@ -1640,6 +1646,9 @@ static void update_shares(struct sched_domain *sd) static void update_shares_locked(struct rq *rq, struct sched_domain *sd) { + if (root_task_group_empty()) + return; + spin_unlock(&rq->lock); update_shares(sd); spin_lock(&rq->lock); @@ -1647,6 +1656,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) static void update_h_load(long cpu) { + if (root_task_group_empty()) + return; + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } -- cgit v1.2.2 From da19ab510343c6496fe8b8f890091296032025c9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Jul 2009 00:21:22 -0400 Subject: sched: Check for pushing rt tasks after all scheduling The current method for pushing RT tasks after scheduling only happens after a context switch. But we found cases where a task is set up on a run queue to be pushed but the push never happens because the schedule chooses the same task. This bug was found with the help of Gregory Haskins and the use of ftrace (trace_printk). It tooks several days for both of us analyzing the code and the trace output to find this. Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra LKML-Reference: <20090729042526.205923666@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/sched.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index ca1f76ba7773..a030d4514cdc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2839,14 +2839,14 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, * with the lock held can cause deadlocks; see schedule() for * details.) */ -static void finish_task_switch(struct rq *rq, struct task_struct *prev) +static int finish_task_switch(struct rq *rq, struct task_struct *prev) __releases(rq->lock) { struct mm_struct *mm = rq->prev_mm; long prev_state; -#ifdef CONFIG_SMP int post_schedule = 0; +#ifdef CONFIG_SMP if (current->sched_class->needs_post_schedule) post_schedule = current->sched_class->needs_post_schedule(rq); #endif @@ -2868,10 +2868,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) finish_arch_switch(prev); perf_counter_task_sched_in(current, cpu_of(rq)); finish_lock_switch(rq, prev); -#ifdef CONFIG_SMP - if (post_schedule) - current->sched_class->post_schedule(rq); -#endif fire_sched_in_preempt_notifiers(current); if (mm) @@ -2884,6 +2880,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) kprobe_flush_task(prev); put_task_struct(prev); } + + return post_schedule; } /** @@ -2894,8 +2892,15 @@ asmlinkage void schedule_tail(struct task_struct *prev) __releases(rq->lock) { struct rq *rq = this_rq(); + int post_schedule; + + post_schedule = finish_task_switch(rq, prev); + +#ifdef CONFIG_SMP + if (post_schedule) + current->sched_class->post_schedule(rq); +#endif - finish_task_switch(rq, prev); #ifdef __ARCH_WANT_UNLOCKED_CTXSW /* In this case, finish_task_switch does not reenable preemption */ preempt_enable(); @@ -2908,7 +2913,7 @@ asmlinkage void schedule_tail(struct task_struct *prev) * context_switch - switch to the new MM and the new * thread's register state. */ -static inline void +static inline int context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { @@ -2955,7 +2960,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * CPUs since it called schedule(), thus the 'rq' on its stack * frame will be invalid. */ - finish_task_switch(this_rq(), prev); + return finish_task_switch(this_rq(), prev); } /* @@ -5366,6 +5371,7 @@ asmlinkage void __sched schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; + int post_schedule = 0; struct rq *rq; int cpu; @@ -5416,15 +5422,25 @@ need_resched_nonpreemptible: rq->curr = next; ++*switch_count; - context_switch(rq, prev, next); /* unlocks the rq */ + post_schedule = context_switch(rq, prev, next); /* unlocks the rq */ /* * the context switch might have flipped the stack from under * us, hence refresh the local variables. */ cpu = smp_processor_id(); rq = cpu_rq(cpu); - } else + } else { +#ifdef CONFIG_SMP + if (current->sched_class->needs_post_schedule) + post_schedule = current->sched_class->needs_post_schedule(rq); +#endif spin_unlock_irq(&rq->lock); + } + +#ifdef CONFIG_SMP + if (post_schedule) + current->sched_class->post_schedule(rq); +#endif if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; -- cgit v1.2.2 From 3f029d3c6d62068d59301d90c18dbde8ee402107 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Wed, 29 Jul 2009 11:08:47 -0400 Subject: sched: Enhance the pre/post scheduling logic We currently have an explicit "needs_post" vtable method which returns a stack variable for whether we should later run post-schedule. This leads to an awkward exchange of the variable as it bubbles back up out of the context switch. Peter Zijlstra observed that this information could be stored in the run-queue itself instead of handled on the stack. Therefore, we revert to the method of having context_switch return void, and update an internal rq->post_schedule variable when we require further processing. In addition, we fix a race condition where we try to access current->sched_class without holding the rq->lock. This is technically racy, as the sched-class could change out from under us. Instead, we reference the per-rq post_schedule variable with the runqueue unlocked, but with preemption disabled to see if we need to reacquire the rq->lock. Finally, we clean the code up slightly by removing the #ifdef CONFIG_SMP conditionals from the schedule() call, and implement some inline helper functions instead. This patch passes checkpatch, and rt-migrate. Signed-off-by: Gregory Haskins Signed-off-by: Peter Zijlstra LKML-Reference: <20090729150422.17691.55590.stgit@dev.haskins.net> Signed-off-by: Ingo Molnar --- kernel/sched.c | 82 +++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 50 insertions(+), 32 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index a030d4514cdc..613fee54fc89 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -616,6 +616,7 @@ struct rq { unsigned char idle_at_tick; /* For active balancing */ + int post_schedule; int active_balance; int push_cpu; /* cpu of this runqueue: */ @@ -2839,17 +2840,11 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, * with the lock held can cause deadlocks; see schedule() for * details.) */ -static int finish_task_switch(struct rq *rq, struct task_struct *prev) +static void finish_task_switch(struct rq *rq, struct task_struct *prev) __releases(rq->lock) { struct mm_struct *mm = rq->prev_mm; long prev_state; - int post_schedule = 0; - -#ifdef CONFIG_SMP - if (current->sched_class->needs_post_schedule) - post_schedule = current->sched_class->needs_post_schedule(rq); -#endif rq->prev_mm = NULL; @@ -2880,10 +2875,44 @@ static int finish_task_switch(struct rq *rq, struct task_struct *prev) kprobe_flush_task(prev); put_task_struct(prev); } +} + +#ifdef CONFIG_SMP + +/* assumes rq->lock is held */ +static inline void pre_schedule(struct rq *rq, struct task_struct *prev) +{ + if (prev->sched_class->pre_schedule) + prev->sched_class->pre_schedule(rq, prev); +} + +/* rq->lock is NOT held, but preemption is disabled */ +static inline void post_schedule(struct rq *rq) +{ + if (rq->post_schedule) { + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + if (rq->curr->sched_class->post_schedule) + rq->curr->sched_class->post_schedule(rq); + spin_unlock_irqrestore(&rq->lock, flags); + + rq->post_schedule = 0; + } +} + +#else - return post_schedule; +static inline void pre_schedule(struct rq *rq, struct task_struct *p) +{ +} + +static inline void post_schedule(struct rq *rq) +{ } +#endif + /** * schedule_tail - first thing a freshly forked thread must call. * @prev: the thread we just switched away from. @@ -2892,14 +2921,14 @@ asmlinkage void schedule_tail(struct task_struct *prev) __releases(rq->lock) { struct rq *rq = this_rq(); - int post_schedule; - post_schedule = finish_task_switch(rq, prev); + finish_task_switch(rq, prev); -#ifdef CONFIG_SMP - if (post_schedule) - current->sched_class->post_schedule(rq); -#endif + /* + * FIXME: do we need to worry about rq being invalidated by the + * task_switch? + */ + post_schedule(rq); #ifdef __ARCH_WANT_UNLOCKED_CTXSW /* In this case, finish_task_switch does not reenable preemption */ @@ -2913,7 +2942,7 @@ asmlinkage void schedule_tail(struct task_struct *prev) * context_switch - switch to the new MM and the new * thread's register state. */ -static inline int +static inline void context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { @@ -2960,7 +2989,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * CPUs since it called schedule(), thus the 'rq' on its stack * frame will be invalid. */ - return finish_task_switch(this_rq(), prev); + finish_task_switch(this_rq(), prev); } /* @@ -5371,7 +5400,6 @@ asmlinkage void __sched schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; - int post_schedule = 0; struct rq *rq; int cpu; @@ -5403,10 +5431,7 @@ need_resched_nonpreemptible: switch_count = &prev->nvcsw; } -#ifdef CONFIG_SMP - if (prev->sched_class->pre_schedule) - prev->sched_class->pre_schedule(rq, prev); -#endif + pre_schedule(rq, prev); if (unlikely(!rq->nr_running)) idle_balance(cpu, rq); @@ -5422,25 +5447,17 @@ need_resched_nonpreemptible: rq->curr = next; ++*switch_count; - post_schedule = context_switch(rq, prev, next); /* unlocks the rq */ + context_switch(rq, prev, next); /* unlocks the rq */ /* * the context switch might have flipped the stack from under * us, hence refresh the local variables. */ cpu = smp_processor_id(); rq = cpu_rq(cpu); - } else { -#ifdef CONFIG_SMP - if (current->sched_class->needs_post_schedule) - post_schedule = current->sched_class->needs_post_schedule(rq); -#endif + } else spin_unlock_irq(&rq->lock); - } -#ifdef CONFIG_SMP - if (post_schedule) - current->sched_class->post_schedule(rq); -#endif + post_schedule(rq); if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; @@ -9403,6 +9420,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; + rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; rq->push_cpu = 0; -- cgit v1.2.2 From 00aec93d10a051ea64f83eff75d4065a19508ea6 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Thu, 30 Jul 2009 10:57:23 -0400 Subject: sched: Fully integrate cpus_active_map and root-domain code Reflect "active" cpus in the rq->rd->online field, instead of the online_map. The motivation is that things that use the root-domain code (such as cpupri) only care about cpus classified as "active" anyway. By synchronizing the root-domain state with the active map, we allow several optimizations. For instance, we can remove an extra cpumask_and from the scheduler hotpath by utilizing rq->rd->online (since it is now a cached version of cpu_active_map & rq->rd->span). Signed-off-by: Gregory Haskins Acked-by: Peter Zijlstra Acked-by: Max Krasnyansky Signed-off-by: Peter Zijlstra LKML-Reference: <20090730145723.25226.24493.stgit@dev.haskins.net> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 613fee54fc89..475138c42548 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7927,7 +7927,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) rq->rd = rd; cpumask_set_cpu(rq->cpu, rd->span); - if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) + if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); spin_unlock_irqrestore(&rq->lock, flags); -- cgit v1.2.2 From 693525e3bea25cf2ee6cf2b862ba7c148e891df2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jul 2009 13:56:38 +0200 Subject: sched: Ensure the migration task doesn't go away during use Like sched_migrate_task(), set_cpus_allowed_ptr() should hold onto the migration thread too. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 475138c42548..7f83be35d65c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7078,8 +7078,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ + struct task_struct *mt = rq->migration_thread; + + get_task_struct(mt); task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); + put_task_struct(mt); wait_for_completion(&req.done); tlb_migrate_finish(p->mm); return 0; -- cgit v1.2.2 From f607c6685774811b8112e124f10a053d77015485 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Jul 2009 19:16:29 +0200 Subject: lockdep: Introduce lockdep_assert_held() Add a lockdep helper to validate that we indeed are the owner of a lock. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 1b59e265273b..2c75f7daa439 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6609,6 +6609,8 @@ int cond_resched_lock(spinlock_t *lock) int resched = should_resched(); int ret = 0; + lockdep_assert_held(lock); + if (spin_needbreak(lock) || resched) { spin_unlock(lock); if (resched) -- cgit v1.2.2 From 49a02c514d967921a908ac64e9c0ec0f0fc17fd8 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:51:52 +0200 Subject: sched: Use structure to store local data in __build_sched_domains Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105152.GB29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 165 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 89 insertions(+), 76 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 1b59e265273b..565ff775fcda 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8091,6 +8091,22 @@ struct static_sched_domain { DECLARE_BITMAP(span, CONFIG_NR_CPUS); }; +struct s_data { +#ifdef CONFIG_NUMA + int sd_allnodes; + cpumask_var_t domainspan; + cpumask_var_t covered; + cpumask_var_t notcovered; +#endif + cpumask_var_t nodemask; + cpumask_var_t this_sibling_map; + cpumask_var_t this_core_map; + cpumask_var_t send_covered; + cpumask_var_t tmpmask; + struct sched_group **sched_group_nodes; + struct root_domain *rd; +}; + /* * SMT sched-domains: */ @@ -8385,54 +8401,49 @@ static void set_domain_attribute(struct sched_domain *sd, static int __build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) { + struct s_data d; int i, err = -ENOMEM; - struct root_domain *rd; - cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, - tmpmask; #ifdef CONFIG_NUMA - cpumask_var_t domainspan, covered, notcovered; - struct sched_group **sched_group_nodes = NULL; - int sd_allnodes = 0; - - if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) + d.sd_allnodes = 0; + if (!alloc_cpumask_var(&d.domainspan, GFP_KERNEL)) goto out; - if (!alloc_cpumask_var(&covered, GFP_KERNEL)) + if (!alloc_cpumask_var(&d.covered, GFP_KERNEL)) goto free_domainspan; - if (!alloc_cpumask_var(¬covered, GFP_KERNEL)) + if (!alloc_cpumask_var(&d.notcovered, GFP_KERNEL)) goto free_covered; #endif - if (!alloc_cpumask_var(&nodemask, GFP_KERNEL)) + if (!alloc_cpumask_var(&d.nodemask, GFP_KERNEL)) goto free_notcovered; - if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL)) + if (!alloc_cpumask_var(&d.this_sibling_map, GFP_KERNEL)) goto free_nodemask; - if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL)) + if (!alloc_cpumask_var(&d.this_core_map, GFP_KERNEL)) goto free_this_sibling_map; - if (!alloc_cpumask_var(&send_covered, GFP_KERNEL)) + if (!alloc_cpumask_var(&d.send_covered, GFP_KERNEL)) goto free_this_core_map; - if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) + if (!alloc_cpumask_var(&d.tmpmask, GFP_KERNEL)) goto free_send_covered; #ifdef CONFIG_NUMA /* * Allocate the per-node list of sched groups */ - sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), - GFP_KERNEL); - if (!sched_group_nodes) { + d.sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), + GFP_KERNEL); + if (!d.sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); goto free_tmpmask; } #endif - rd = alloc_rootdomain(); - if (!rd) { + d.rd = alloc_rootdomain(); + if (!d.rd) { printk(KERN_WARNING "Cannot alloc root domain\n"); goto free_sched_groups; } #ifdef CONFIG_NUMA - sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; + sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d.sched_group_nodes; #endif /* @@ -8441,18 +8452,20 @@ static int __build_sched_domains(const struct cpumask *cpu_map, for_each_cpu(i, cpu_map) { struct sched_domain *sd = NULL, *p; - cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); + cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), + cpu_map); #ifdef CONFIG_NUMA if (cpumask_weight(cpu_map) > - SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { + SD_NODES_PER_DOMAIN*cpumask_weight(d.nodemask)) { sd = &per_cpu(allnodes_domains, i).sd; SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); cpumask_copy(sched_domain_span(sd), cpu_map); - cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); + cpu_to_allnodes_group(i, cpu_map, &sd->groups, + d.tmpmask); p = sd; - sd_allnodes = 1; + d.sd_allnodes = 1; } else p = NULL; @@ -8471,11 +8484,11 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = &per_cpu(phys_domains, i).sd; SD_INIT(sd, CPU); set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), nodemask); + cpumask_copy(sched_domain_span(sd), d.nodemask); sd->parent = p; if (p) p->child = sd; - cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); + cpu_to_phys_group(i, cpu_map, &sd->groups, d.tmpmask); #ifdef CONFIG_SCHED_MC p = sd; @@ -8486,7 +8499,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, cpu_coregroup_mask(i)); sd->parent = p; p->child = sd; - cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); + cpu_to_core_group(i, cpu_map, &sd->groups, d.tmpmask); #endif #ifdef CONFIG_SCHED_SMT @@ -8498,54 +8511,54 @@ static int __build_sched_domains(const struct cpumask *cpu_map, topology_thread_cpumask(i), cpu_map); sd->parent = p; p->child = sd; - cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); + cpu_to_cpu_group(i, cpu_map, &sd->groups, d.tmpmask); #endif } #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ for_each_cpu(i, cpu_map) { - cpumask_and(this_sibling_map, + cpumask_and(d.this_sibling_map, topology_thread_cpumask(i), cpu_map); - if (i != cpumask_first(this_sibling_map)) + if (i != cpumask_first(d.this_sibling_map)) continue; - init_sched_build_groups(this_sibling_map, cpu_map, + init_sched_build_groups(d.this_sibling_map, cpu_map, &cpu_to_cpu_group, - send_covered, tmpmask); + d.send_covered, d.tmpmask); } #endif #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ for_each_cpu(i, cpu_map) { - cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); - if (i != cpumask_first(this_core_map)) + cpumask_and(d.this_core_map, cpu_coregroup_mask(i), cpu_map); + if (i != cpumask_first(d.this_core_map)) continue; - init_sched_build_groups(this_core_map, cpu_map, + init_sched_build_groups(d.this_core_map, cpu_map, &cpu_to_core_group, - send_covered, tmpmask); + d.send_covered, d.tmpmask); } #endif /* Set up physical groups */ for (i = 0; i < nr_node_ids; i++) { - cpumask_and(nodemask, cpumask_of_node(i), cpu_map); - if (cpumask_empty(nodemask)) + cpumask_and(d.nodemask, cpumask_of_node(i), cpu_map); + if (cpumask_empty(d.nodemask)) continue; - init_sched_build_groups(nodemask, cpu_map, + init_sched_build_groups(d.nodemask, cpu_map, &cpu_to_phys_group, - send_covered, tmpmask); + d.send_covered, d.tmpmask); } #ifdef CONFIG_NUMA /* Set up node groups */ - if (sd_allnodes) { + if (d.sd_allnodes) { init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, - send_covered, tmpmask); + d.send_covered, d.tmpmask); } for (i = 0; i < nr_node_ids; i++) { @@ -8553,15 +8566,15 @@ static int __build_sched_domains(const struct cpumask *cpu_map, struct sched_group *sg, *prev; int j; - cpumask_clear(covered); - cpumask_and(nodemask, cpumask_of_node(i), cpu_map); - if (cpumask_empty(nodemask)) { - sched_group_nodes[i] = NULL; + cpumask_clear(d.covered); + cpumask_and(d.nodemask, cpumask_of_node(i), cpu_map); + if (cpumask_empty(d.nodemask)) { + d.sched_group_nodes[i] = NULL; continue; } - sched_domain_node_span(i, domainspan); - cpumask_and(domainspan, domainspan, cpu_map); + sched_domain_node_span(i, d.domainspan); + cpumask_and(d.domainspan, d.domainspan, cpu_map); sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, i); @@ -8570,30 +8583,30 @@ static int __build_sched_domains(const struct cpumask *cpu_map, "node %d\n", i); goto error; } - sched_group_nodes[i] = sg; - for_each_cpu(j, nodemask) { + d.sched_group_nodes[i] = sg; + for_each_cpu(j, d.nodemask) { struct sched_domain *sd; sd = &per_cpu(node_domains, j).sd; sd->groups = sg; } sg->__cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), nodemask); + cpumask_copy(sched_group_cpus(sg), d.nodemask); sg->next = sg; - cpumask_or(covered, covered, nodemask); + cpumask_or(d.covered, d.covered, d.nodemask); prev = sg; for (j = 0; j < nr_node_ids; j++) { int n = (i + j) % nr_node_ids; - cpumask_complement(notcovered, covered); - cpumask_and(tmpmask, notcovered, cpu_map); - cpumask_and(tmpmask, tmpmask, domainspan); - if (cpumask_empty(tmpmask)) + cpumask_complement(d.notcovered, d.covered); + cpumask_and(d.tmpmask, d.notcovered, cpu_map); + cpumask_and(d.tmpmask, d.tmpmask, d.domainspan); + if (cpumask_empty(d.tmpmask)) break; - cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); - if (cpumask_empty(tmpmask)) + cpumask_and(d.tmpmask, d.tmpmask, cpumask_of_node(n)); + if (cpumask_empty(d.tmpmask)) continue; sg = kmalloc_node(sizeof(struct sched_group) + @@ -8605,9 +8618,9 @@ static int __build_sched_domains(const struct cpumask *cpu_map, goto error; } sg->__cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), tmpmask); + cpumask_copy(sched_group_cpus(sg), d.tmpmask); sg->next = prev->next; - cpumask_or(covered, covered, tmpmask); + cpumask_or(d.covered, d.covered, d.tmpmask); prev->next = sg; prev = sg; } @@ -8638,13 +8651,13 @@ static int __build_sched_domains(const struct cpumask *cpu_map, #ifdef CONFIG_NUMA for (i = 0; i < nr_node_ids; i++) - init_numa_sched_groups_power(sched_group_nodes[i]); + init_numa_sched_groups_power(d.sched_group_nodes[i]); - if (sd_allnodes) { + if (d.sd_allnodes) { struct sched_group *sg; cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, - tmpmask); + d.tmpmask); init_numa_sched_groups_power(sg); } #endif @@ -8659,42 +8672,42 @@ static int __build_sched_domains(const struct cpumask *cpu_map, #else sd = &per_cpu(phys_domains, i).sd; #endif - cpu_attach_domain(sd, rd, i); + cpu_attach_domain(sd, d.rd, i); } err = 0; free_tmpmask: - free_cpumask_var(tmpmask); + free_cpumask_var(d.tmpmask); free_send_covered: - free_cpumask_var(send_covered); + free_cpumask_var(d.send_covered); free_this_core_map: - free_cpumask_var(this_core_map); + free_cpumask_var(d.this_core_map); free_this_sibling_map: - free_cpumask_var(this_sibling_map); + free_cpumask_var(d.this_sibling_map); free_nodemask: - free_cpumask_var(nodemask); + free_cpumask_var(d.nodemask); free_notcovered: #ifdef CONFIG_NUMA - free_cpumask_var(notcovered); + free_cpumask_var(d.notcovered); free_covered: - free_cpumask_var(covered); + free_cpumask_var(d.covered); free_domainspan: - free_cpumask_var(domainspan); + free_cpumask_var(d.domainspan); out: #endif return err; free_sched_groups: #ifdef CONFIG_NUMA - kfree(sched_group_nodes); + kfree(d.sched_group_nodes); #endif goto free_tmpmask; #ifdef CONFIG_NUMA error: - free_sched_groups(cpu_map, tmpmask); - free_rootdomain(rd); + free_sched_groups(cpu_map, d.tmpmask); + free_rootdomain(d.rd); goto free_tmpmask; #endif } -- cgit v1.2.2 From 2109b99ee192764b407dc7f52babb74740eea6f9 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:53:00 +0200 Subject: sched: Separate out allocation/free/goto-hell from __build_sched_domains Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105300.GC29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 171 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 99 insertions(+), 72 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 565ff775fcda..c5d1fee42360 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8107,6 +8107,23 @@ struct s_data { struct root_domain *rd; }; +enum s_alloc { + sa_sched_groups = 0, + sa_rootdomain, + sa_tmpmask, + sa_send_covered, + sa_this_core_map, + sa_this_sibling_map, + sa_nodemask, + sa_sched_group_nodes, +#ifdef CONFIG_NUMA + sa_notcovered, + sa_covered, + sa_domainspan, +#endif + sa_none, +}; + /* * SMT sched-domains: */ @@ -8394,6 +8411,77 @@ static void set_domain_attribute(struct sched_domain *sd, } } +static void __free_domain_allocs(struct s_data *d, enum s_alloc what, + const struct cpumask *cpu_map) +{ + switch (what) { + case sa_sched_groups: + free_sched_groups(cpu_map, d->tmpmask); /* fall through */ + d->sched_group_nodes = NULL; + case sa_rootdomain: + free_rootdomain(d->rd); /* fall through */ + case sa_tmpmask: + free_cpumask_var(d->tmpmask); /* fall through */ + case sa_send_covered: + free_cpumask_var(d->send_covered); /* fall through */ + case sa_this_core_map: + free_cpumask_var(d->this_core_map); /* fall through */ + case sa_this_sibling_map: + free_cpumask_var(d->this_sibling_map); /* fall through */ + case sa_nodemask: + free_cpumask_var(d->nodemask); /* fall through */ + case sa_sched_group_nodes: +#ifdef CONFIG_NUMA + kfree(d->sched_group_nodes); /* fall through */ + case sa_notcovered: + free_cpumask_var(d->notcovered); /* fall through */ + case sa_covered: + free_cpumask_var(d->covered); /* fall through */ + case sa_domainspan: + free_cpumask_var(d->domainspan); /* fall through */ +#endif + case sa_none: + break; + } +} + +static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, + const struct cpumask *cpu_map) +{ +#ifdef CONFIG_NUMA + if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) + return sa_none; + if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) + return sa_domainspan; + if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) + return sa_covered; + /* Allocate the per-node list of sched groups */ + d->sched_group_nodes = kcalloc(nr_node_ids, + sizeof(struct sched_group *), GFP_KERNEL); + if (!d->sched_group_nodes) { + printk(KERN_WARNING "Can not alloc sched group node list\n"); + return sa_notcovered; + } + sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; +#endif + if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) + return sa_sched_group_nodes; + if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) + return sa_nodemask; + if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) + return sa_this_sibling_map; + if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) + return sa_this_core_map; + if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) + return sa_send_covered; + d->rd = alloc_rootdomain(); + if (!d->rd) { + printk(KERN_WARNING "Cannot alloc root domain\n"); + return sa_tmpmask; + } + return sa_rootdomain; +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -8401,50 +8489,17 @@ static void set_domain_attribute(struct sched_domain *sd, static int __build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) { + enum s_alloc alloc_state = sa_none; struct s_data d; - int i, err = -ENOMEM; + int i; #ifdef CONFIG_NUMA d.sd_allnodes = 0; - if (!alloc_cpumask_var(&d.domainspan, GFP_KERNEL)) - goto out; - if (!alloc_cpumask_var(&d.covered, GFP_KERNEL)) - goto free_domainspan; - if (!alloc_cpumask_var(&d.notcovered, GFP_KERNEL)) - goto free_covered; -#endif - - if (!alloc_cpumask_var(&d.nodemask, GFP_KERNEL)) - goto free_notcovered; - if (!alloc_cpumask_var(&d.this_sibling_map, GFP_KERNEL)) - goto free_nodemask; - if (!alloc_cpumask_var(&d.this_core_map, GFP_KERNEL)) - goto free_this_sibling_map; - if (!alloc_cpumask_var(&d.send_covered, GFP_KERNEL)) - goto free_this_core_map; - if (!alloc_cpumask_var(&d.tmpmask, GFP_KERNEL)) - goto free_send_covered; - -#ifdef CONFIG_NUMA - /* - * Allocate the per-node list of sched groups - */ - d.sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), - GFP_KERNEL); - if (!d.sched_group_nodes) { - printk(KERN_WARNING "Can not alloc sched group node list\n"); - goto free_tmpmask; - } #endif - d.rd = alloc_rootdomain(); - if (!d.rd) { - printk(KERN_WARNING "Cannot alloc root domain\n"); - goto free_sched_groups; - } - -#ifdef CONFIG_NUMA - sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d.sched_group_nodes; -#endif + alloc_state = __visit_domain_allocation_hell(&d, cpu_map); + if (alloc_state != sa_rootdomain) + goto error; + alloc_state = sa_sched_groups; /* * Set up domains for cpus specified by the cpu_map. @@ -8675,41 +8730,13 @@ static int __build_sched_domains(const struct cpumask *cpu_map, cpu_attach_domain(sd, d.rd, i); } - err = 0; - -free_tmpmask: - free_cpumask_var(d.tmpmask); -free_send_covered: - free_cpumask_var(d.send_covered); -free_this_core_map: - free_cpumask_var(d.this_core_map); -free_this_sibling_map: - free_cpumask_var(d.this_sibling_map); -free_nodemask: - free_cpumask_var(d.nodemask); -free_notcovered: -#ifdef CONFIG_NUMA - free_cpumask_var(d.notcovered); -free_covered: - free_cpumask_var(d.covered); -free_domainspan: - free_cpumask_var(d.domainspan); -out: -#endif - return err; - -free_sched_groups: -#ifdef CONFIG_NUMA - kfree(d.sched_group_nodes); -#endif - goto free_tmpmask; + d.sched_group_nodes = NULL; /* don't free this we still need it */ + __free_domain_allocs(&d, sa_tmpmask, cpu_map); + return 0; -#ifdef CONFIG_NUMA error: - free_sched_groups(cpu_map, d.tmpmask); - free_rootdomain(d.rd); - goto free_tmpmask; -#endif + __free_domain_allocs(&d, alloc_state, cpu_map); + return -ENOMEM; } static int build_sched_domains(const struct cpumask *cpu_map) -- cgit v1.2.2 From 7f4588f3aa395632fec9ba2e15a1920f0682fda0 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:54:06 +0200 Subject: sched: Separate out build of NUMA sched domain from __build_sched_domains ... to further strip down __build_sched_domains(). Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105406.GD29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 57 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 25 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index c5d1fee42360..dd95a4708370 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8482,6 +8482,37 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, return sa_rootdomain; } +static struct sched_domain *__build_numa_sched_domains(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) +{ + struct sched_domain *sd = NULL; +#ifdef CONFIG_NUMA + struct sched_domain *parent; + + d->sd_allnodes = 0; + if (cpumask_weight(cpu_map) > + SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { + sd = &per_cpu(allnodes_domains, i).sd; + SD_INIT(sd, ALLNODES); + set_domain_attribute(sd, attr); + cpumask_copy(sched_domain_span(sd), cpu_map); + cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); + d->sd_allnodes = 1; + } + parent = sd; + + sd = &per_cpu(node_domains, i).sd; + SD_INIT(sd, NODE); + set_domain_attribute(sd, attr); + sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); + sd->parent = parent; + if (parent) + parent->child = sd; + cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); +#endif + return sd; +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -8510,31 +8541,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); -#ifdef CONFIG_NUMA - if (cpumask_weight(cpu_map) > - SD_NODES_PER_DOMAIN*cpumask_weight(d.nodemask)) { - sd = &per_cpu(allnodes_domains, i).sd; - SD_INIT(sd, ALLNODES); - set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), cpu_map); - cpu_to_allnodes_group(i, cpu_map, &sd->groups, - d.tmpmask); - p = sd; - d.sd_allnodes = 1; - } else - p = NULL; - - sd = &per_cpu(node_domains, i).sd; - SD_INIT(sd, NODE); - set_domain_attribute(sd, attr); - sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); - sd->parent = p; - if (p) - p->child = sd; - cpumask_and(sched_domain_span(sd), - sched_domain_span(sd), cpu_map); -#endif - + sd = __build_numa_sched_domains(&d, cpu_map, attr, i); p = sd; sd = &per_cpu(phys_domains, i).sd; SD_INIT(sd, CPU); -- cgit v1.2.2 From 87cce6622c2ab2f0e96ecc2a37133378a7db3177 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:54:55 +0200 Subject: sched: Separate out build of CPU sched domain from __build_sched_domains ... to further strip down __build_sched_domains(). Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105455.GE29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index dd95a4708370..3d0666c5e026 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8513,6 +8513,22 @@ static struct sched_domain *__build_numa_sched_domains(struct s_data *d, return sd; } +static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int i) +{ + struct sched_domain *sd; + sd = &per_cpu(phys_domains, i).sd; + SD_INIT(sd, CPU); + set_domain_attribute(sd, attr); + cpumask_copy(sched_domain_span(sd), d->nodemask); + sd->parent = parent; + if (parent) + parent->child = sd; + cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); + return sd; +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -8542,15 +8558,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, cpu_map); sd = __build_numa_sched_domains(&d, cpu_map, attr, i); - p = sd; - sd = &per_cpu(phys_domains, i).sd; - SD_INIT(sd, CPU); - set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), d.nodemask); - sd->parent = p; - if (p) - p->child = sd; - cpu_to_phys_group(i, cpu_map, &sd->groups, d.tmpmask); + sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); #ifdef CONFIG_SCHED_MC p = sd; -- cgit v1.2.2 From 410c408108bb85f32fe132aaf448388af0b6da64 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:56:14 +0200 Subject: sched: Separate out build of MC sched domain from __build_sched_domains ... to further strip down __build_sched_domains(). Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105614.GF29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3d0666c5e026..5c829d4ba8f1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8529,6 +8529,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, return sd; } +static struct sched_domain *__build_mc_sched_domain(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int i) +{ + struct sched_domain *sd = parent; +#ifdef CONFIG_SCHED_MC + sd = &per_cpu(core_domains, i).sd; + SD_INIT(sd, MC); + set_domain_attribute(sd, attr); + cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); + sd->parent = parent; + parent->child = sd; + cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); +#endif + return sd; +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -8559,18 +8576,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = __build_numa_sched_domains(&d, cpu_map, attr, i); sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); - -#ifdef CONFIG_SCHED_MC - p = sd; - sd = &per_cpu(core_domains, i).sd; - SD_INIT(sd, MC); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, - cpu_coregroup_mask(i)); - sd->parent = p; - p->child = sd; - cpu_to_core_group(i, cpu_map, &sd->groups, d.tmpmask); -#endif + sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); #ifdef CONFIG_SCHED_SMT p = sd; -- cgit v1.2.2 From d81735355533cd4b2bce9508d86fcad24a38cf47 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:57:03 +0200 Subject: sched: Separate out build of SMT sched domain from __build_sched_domains ... to further strip down __build_sched_domains(). Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105703.GG29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 5c829d4ba8f1..2ecec06e3f0c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8546,6 +8546,23 @@ static struct sched_domain *__build_mc_sched_domain(struct s_data *d, return sd; } +static struct sched_domain *__build_smt_sched_domain(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int i) +{ + struct sched_domain *sd = parent; +#ifdef CONFIG_SCHED_SMT + sd = &per_cpu(cpu_domains, i).sd; + SD_INIT(sd, SIBLING); + set_domain_attribute(sd, attr); + cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); + sd->parent = parent; + parent->child = sd; + cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); +#endif + return sd; +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -8569,7 +8586,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, * Set up domains for cpus specified by the cpu_map. */ for_each_cpu(i, cpu_map) { - struct sched_domain *sd = NULL, *p; + struct sched_domain *sd; cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); @@ -8577,18 +8594,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = __build_numa_sched_domains(&d, cpu_map, attr, i); sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); - -#ifdef CONFIG_SCHED_SMT - p = sd; - sd = &per_cpu(cpu_domains, i).sd; - SD_INIT(sd, SIBLING); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), - topology_thread_cpumask(i), cpu_map); - sd->parent = p; - p->child = sd; - cpu_to_cpu_group(i, cpu_map, &sd->groups, d.tmpmask); -#endif + sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); } #ifdef CONFIG_SCHED_SMT -- cgit v1.2.2 From 0e8e85c941d8f1b43bcc2e3b8b7026cdae476c53 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:57:51 +0200 Subject: sched: Separate out build of SMT sched groups from __build_sched_domains ... to further strip down __build_sched_domains(). Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105751.GH29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 2ecec06e3f0c..43cfc6e54e96 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8563,6 +8563,25 @@ static struct sched_domain *__build_smt_sched_domain(struct s_data *d, return sd; } +static void build_sched_groups(struct s_data *d, enum sched_domain_level l, + const struct cpumask *cpu_map, int cpu) +{ + switch (l) { +#ifdef CONFIG_SCHED_SMT + case SD_LV_SIBLING: /* set up CPU (sibling) groups */ + cpumask_and(d->this_sibling_map, cpu_map, + topology_thread_cpumask(cpu)); + if (cpu == cpumask_first(d->this_sibling_map)) + init_sched_build_groups(d->this_sibling_map, cpu_map, + &cpu_to_cpu_group, + d->send_covered, d->tmpmask); + break; +#endif + default: + break; + } +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -8597,19 +8616,9 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); } -#ifdef CONFIG_SCHED_SMT - /* Set up CPU (sibling) groups */ for_each_cpu(i, cpu_map) { - cpumask_and(d.this_sibling_map, - topology_thread_cpumask(i), cpu_map); - if (i != cpumask_first(d.this_sibling_map)) - continue; - - init_sched_build_groups(d.this_sibling_map, cpu_map, - &cpu_to_cpu_group, - d.send_covered, d.tmpmask); + build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); } -#endif #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ -- cgit v1.2.2 From a2af04cdbb748158043e31799b28c48272081600 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:58:38 +0200 Subject: sched: Separate out build of MC sched groups from __build_sched_domains ... to further strip down __build_sched_domains(). Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105838.GI29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 43cfc6e54e96..f2c202f66297 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8576,6 +8576,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, &cpu_to_cpu_group, d->send_covered, d->tmpmask); break; +#endif +#ifdef CONFIG_SCHED_MC + case SD_LV_MC: /* set up multi-core groups */ + cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); + if (cpu == cpumask_first(d->this_core_map)) + init_sched_build_groups(d->this_core_map, cpu_map, + &cpu_to_core_group, + d->send_covered, d->tmpmask); + break; #endif default: break; @@ -8618,21 +8627,9 @@ static int __build_sched_domains(const struct cpumask *cpu_map, for_each_cpu(i, cpu_map) { build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); + build_sched_groups(&d, SD_LV_MC, cpu_map, i); } -#ifdef CONFIG_SCHED_MC - /* Set up multi-core groups */ - for_each_cpu(i, cpu_map) { - cpumask_and(d.this_core_map, cpu_coregroup_mask(i), cpu_map); - if (i != cpumask_first(d.this_core_map)) - continue; - - init_sched_build_groups(d.this_core_map, cpu_map, - &cpu_to_core_group, - d.send_covered, d.tmpmask); - } -#endif - /* Set up physical groups */ for (i = 0; i < nr_node_ids; i++) { cpumask_and(d.nodemask, cpumask_of_node(i), cpu_map); -- cgit v1.2.2 From 86548096f252bfe2065f1ea2d301e7319a16375d Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:59:28 +0200 Subject: sched: Separate out build of CPU sched groups from __build_sched_domains ... to further strip down __build_sched_domains(). Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105928.GJ29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f2c202f66297..b09a41c93ae1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8586,6 +8586,13 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, d->send_covered, d->tmpmask); break; #endif + case SD_LV_CPU: /* set up physical groups */ + cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); + if (!cpumask_empty(d->nodemask)) + init_sched_build_groups(d->nodemask, cpu_map, + &cpu_to_phys_group, + d->send_covered, d->tmpmask); + break; default: break; } @@ -8631,15 +8638,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, } /* Set up physical groups */ - for (i = 0; i < nr_node_ids; i++) { - cpumask_and(d.nodemask, cpumask_of_node(i), cpu_map); - if (cpumask_empty(d.nodemask)) - continue; - - init_sched_build_groups(d.nodemask, cpu_map, - &cpu_to_phys_group, - d.send_covered, d.tmpmask); - } + for (i = 0; i < nr_node_ids; i++) + build_sched_groups(&d, SD_LV_CPU, cpu_map, i); #ifdef CONFIG_NUMA /* Set up node groups */ -- cgit v1.2.2 From de616e36c700dc312d9021dd75f769c463f85122 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 13:00:13 +0200 Subject: sched: Separate out build of ALLNODES sched groups from __build_sched_domains For the sake of completeness. Now all calls to init_sched_build_groups() are contained in build_sched_groups(). Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818110013.GK29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index b09a41c93ae1..52c1953bc41d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8593,6 +8593,12 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, &cpu_to_phys_group, d->send_covered, d->tmpmask); break; +#ifdef CONFIG_NUMA + case SD_LV_ALLNODES: + init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, + d->send_covered, d->tmpmask); + break; +#endif default: break; } @@ -8643,11 +8649,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, #ifdef CONFIG_NUMA /* Set up node groups */ - if (d.sd_allnodes) { - init_sched_build_groups(cpu_map, cpu_map, - &cpu_to_allnodes_group, - d.send_covered, d.tmpmask); - } + if (d.sd_allnodes) + build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); for (i = 0; i < nr_node_ids; i++) { /* Set up node groups */ -- cgit v1.2.2 From 0601a88d8fa4508eaa49a6d96c6685e1dece38e3 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 13:01:11 +0200 Subject: sched: Separate out build of NUMA sched groups from __build_sched_domains ... to further strip down __build_sched_domains(). Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818110111.GL29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 130 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 67 insertions(+), 63 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 52c1953bc41d..c1ce884a0163 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8246,6 +8246,71 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) sg = sg->next; } while (sg != group_head); } + +static int build_numa_sched_groups(struct s_data *d, + const struct cpumask *cpu_map, int num) +{ + struct sched_domain *sd; + struct sched_group *sg, *prev; + int n, j; + + cpumask_clear(d->covered); + cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); + if (cpumask_empty(d->nodemask)) { + d->sched_group_nodes[num] = NULL; + goto out; + } + + sched_domain_node_span(num, d->domainspan); + cpumask_and(d->domainspan, d->domainspan, cpu_map); + + sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, num); + if (!sg) { + printk(KERN_WARNING "Can not alloc domain group for node %d\n", + num); + return -ENOMEM; + } + d->sched_group_nodes[num] = sg; + + for_each_cpu(j, d->nodemask) { + sd = &per_cpu(node_domains, j).sd; + sd->groups = sg; + } + + sg->__cpu_power = 0; + cpumask_copy(sched_group_cpus(sg), d->nodemask); + sg->next = sg; + cpumask_or(d->covered, d->covered, d->nodemask); + + prev = sg; + for (j = 0; j < nr_node_ids; j++) { + n = (num + j) % nr_node_ids; + cpumask_complement(d->notcovered, d->covered); + cpumask_and(d->tmpmask, d->notcovered, cpu_map); + cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); + if (cpumask_empty(d->tmpmask)) + break; + cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); + if (cpumask_empty(d->tmpmask)) + continue; + sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, num); + if (!sg) { + printk(KERN_WARNING + "Can not alloc domain group for node %d\n", j); + return -ENOMEM; + } + sg->__cpu_power = 0; + cpumask_copy(sched_group_cpus(sg), d->tmpmask); + sg->next = prev->next; + cpumask_or(d->covered, d->covered, d->tmpmask); + prev->next = sg; + prev = sg; + } +out: + return 0; +} #endif /* CONFIG_NUMA */ #ifdef CONFIG_NUMA @@ -8652,70 +8717,9 @@ static int __build_sched_domains(const struct cpumask *cpu_map, if (d.sd_allnodes) build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); - for (i = 0; i < nr_node_ids; i++) { - /* Set up node groups */ - struct sched_group *sg, *prev; - int j; - - cpumask_clear(d.covered); - cpumask_and(d.nodemask, cpumask_of_node(i), cpu_map); - if (cpumask_empty(d.nodemask)) { - d.sched_group_nodes[i] = NULL; - continue; - } - - sched_domain_node_span(i, d.domainspan); - cpumask_and(d.domainspan, d.domainspan, cpu_map); - - sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, i); - if (!sg) { - printk(KERN_WARNING "Can not alloc domain group for " - "node %d\n", i); + for (i = 0; i < nr_node_ids; i++) + if (build_numa_sched_groups(&d, cpu_map, i)) goto error; - } - d.sched_group_nodes[i] = sg; - for_each_cpu(j, d.nodemask) { - struct sched_domain *sd; - - sd = &per_cpu(node_domains, j).sd; - sd->groups = sg; - } - sg->__cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), d.nodemask); - sg->next = sg; - cpumask_or(d.covered, d.covered, d.nodemask); - prev = sg; - - for (j = 0; j < nr_node_ids; j++) { - int n = (i + j) % nr_node_ids; - - cpumask_complement(d.notcovered, d.covered); - cpumask_and(d.tmpmask, d.notcovered, cpu_map); - cpumask_and(d.tmpmask, d.tmpmask, d.domainspan); - if (cpumask_empty(d.tmpmask)) - break; - - cpumask_and(d.tmpmask, d.tmpmask, cpumask_of_node(n)); - if (cpumask_empty(d.tmpmask)) - continue; - - sg = kmalloc_node(sizeof(struct sched_group) + - cpumask_size(), - GFP_KERNEL, i); - if (!sg) { - printk(KERN_WARNING - "Can not alloc domain group for node %d\n", j); - goto error; - } - sg->__cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), d.tmpmask); - sg->next = prev->next; - cpumask_or(d.covered, d.covered, d.tmpmask); - prev->next = sg; - prev = sg; - } - } #endif /* Calculate CPU power for physical packages and nodes */ -- cgit v1.2.2 From 294b0c9619a0469a3b385b6fc47e79f64222a692 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 13:02:29 +0200 Subject: sched: Consolidate definition of variable sd in __build_sched_domains Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818110229.GM29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index c1ce884a0163..cf4c953d6486 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8678,6 +8678,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, { enum s_alloc alloc_state = sa_none; struct s_data d; + struct sched_domain *sd; int i; #ifdef CONFIG_NUMA d.sd_allnodes = 0; @@ -8692,8 +8693,6 @@ static int __build_sched_domains(const struct cpumask *cpu_map, * Set up domains for cpus specified by the cpu_map. */ for_each_cpu(i, cpu_map) { - struct sched_domain *sd; - cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); @@ -8725,22 +8724,19 @@ static int __build_sched_domains(const struct cpumask *cpu_map, /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; - + sd = &per_cpu(cpu_domains, i).sd; init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(core_domains, i).sd; - + sd = &per_cpu(core_domains, i).sd; init_sched_groups_power(i, sd); } #endif for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(phys_domains, i).sd; - + sd = &per_cpu(phys_domains, i).sd; init_sched_groups_power(i, sd); } @@ -8759,7 +8755,6 @@ static int __build_sched_domains(const struct cpumask *cpu_map, /* Attach the domains */ for_each_cpu(i, cpu_map) { - struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i).sd; #elif defined(CONFIG_SCHED_MC) -- cgit v1.2.2 From cde7e5ca4e329a157108769d1f752d191cbb71c6 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 18 Aug 2009 13:01:01 +0900 Subject: sched: Use for_each_class macro in move_one_task() Replace for loop with the macro for_each_class to cleanup. Signed-off-by: Hiroshi Shimamoto LKML-Reference: <4A8A277D.4090304@ct.jp.nec.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 7f83be35d65c..1b529efe8872 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3461,9 +3461,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, { const struct sched_class *class; - for (class = sched_class_highest; class; class = class->next) + for_each_class(class) { if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) return 1; + } return 0; } -- cgit v1.2.2 From a8af7246c114bfd939e539f9566b872c06f6225c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Aug 2009 13:58:54 +0200 Subject: sched: Avoid division by zero Patch a5004278f0525dcb9aa43703ef77bf371ea837cd (sched: Fix cgroup smp fairness) introduced the possibility of a divide-by-zero because load-balancing is not synchronized between sched_domains. This can cause the state of cpus to change between the first and second loop over the sched domain in tg_shares_up(). Reported-by: Yinghai Lu Signed-off-by: Peter Zijlstra Cc: Jes Sorensen Cc: Jens Axboe Cc: Linus Torvalds LKML-Reference: <1250855934.7538.30.camel@twins> Signed-off-by: Ingo Molnar --- kernel/sched.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 1b529efe8872..8f8a98eab9db 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1522,7 +1522,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares); */ static void update_group_shares_cpu(struct task_group *tg, int cpu, - unsigned long sd_shares, unsigned long sd_rq_weight) + unsigned long sd_shares, unsigned long sd_rq_weight, + unsigned long sd_eff_weight) { unsigned long rq_weight; unsigned long shares; @@ -1535,13 +1536,15 @@ update_group_shares_cpu(struct task_group *tg, int cpu, if (!rq_weight) { boost = 1; rq_weight = NICE_0_LOAD; + if (sd_rq_weight == sd_eff_weight) + sd_eff_weight += NICE_0_LOAD; + sd_rq_weight = sd_eff_weight; } /* - * \Sum shares * rq_weight - * shares = ----------------------- - * \Sum rq_weight - * + * \Sum_j shares_j * rq_weight_i + * shares_i = ----------------------------- + * \Sum_j rq_weight_j */ shares = (sd_shares * rq_weight) / sd_rq_weight; shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); @@ -1593,14 +1596,8 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) shares = tg->shares; - for_each_cpu(i, sched_domain_span(sd)) { - unsigned long sd_rq_weight = rq_weight; - - if (!tg->cfs_rq[i]->rq_weight) - sd_rq_weight = eff_weight; - - update_group_shares_cpu(tg, i, shares, sd_rq_weight); - } + for_each_cpu(i, sched_domain_span(sd)) + update_group_shares_cpu(tg, i, shares, rq_weight, eff_weight); return 0; } -- cgit v1.2.2 From d6714c22b43fbcbead7e7b706ff270e15f04a791 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 22 Aug 2009 13:56:46 -0700 Subject: rcu: Renamings to increase RCU clarity Make RCU-sched, RCU-bh, and RCU-preempt be underlying implementations, with "RCU" defined in terms of one of the three. Update the outdated rcu_qsctr_inc() names, as these functions no longer increment anything. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josht@linux.vnet.ibm.com Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org LKML-Reference: <12509746132696-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index cda8b81f8801..c9beca67a53e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5325,7 +5325,7 @@ need_resched: preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); - rcu_qsctr_inc(cpu); + rcu_sched_qs(cpu); prev = rq->curr; switch_count = &prev->nivcsw; -- cgit v1.2.2 From 34d76c41554a05425613d16efebb3069c4c545f0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 Aug 2009 13:08:56 +0200 Subject: sched: Fix division by zero - really When re-computing the shares for each task group's cpu representation we need the ratio of weight on each cpu vs the total weight of the sched domain. Since load-balancing is loosely (read not) synchronized, the weight of individual cpus can change between doing the sum and calculating the ratio. The previous patch dealt with only one of the race scenarios, this patch side steps them all by saving a snapshot of all the individual cpu weights, thereby always working on a consistent set. Signed-off-by: Peter Zijlstra Cc: torvalds@linux-foundation.org Cc: jes@sgi.com Cc: jens.axboe@oracle.com Cc: Balbir Singh Cc: Arjan van de Ven Cc: Yinghai Lu LKML-Reference: <1251371336.18584.77.camel@twins> Signed-off-by: Ingo Molnar --- kernel/sched.c | 50 +++++++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 21 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 8f8a98eab9db..523e20a62695 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1515,30 +1515,29 @@ static unsigned long cpu_avg_load_per_task(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED +struct update_shares_data { + unsigned long rq_weight[NR_CPUS]; +}; + +static DEFINE_PER_CPU(struct update_shares_data, update_shares_data); + static void __set_se_shares(struct sched_entity *se, unsigned long shares); /* * Calculate and set the cpu's group shares. */ -static void -update_group_shares_cpu(struct task_group *tg, int cpu, - unsigned long sd_shares, unsigned long sd_rq_weight, - unsigned long sd_eff_weight) +static void update_group_shares_cpu(struct task_group *tg, int cpu, + unsigned long sd_shares, + unsigned long sd_rq_weight, + struct update_shares_data *usd) { - unsigned long rq_weight; - unsigned long shares; + unsigned long shares, rq_weight; int boost = 0; - if (!tg->se[cpu]) - return; - - rq_weight = tg->cfs_rq[cpu]->rq_weight; + rq_weight = usd->rq_weight[cpu]; if (!rq_weight) { boost = 1; rq_weight = NICE_0_LOAD; - if (sd_rq_weight == sd_eff_weight) - sd_eff_weight += NICE_0_LOAD; - sd_rq_weight = sd_eff_weight; } /* @@ -1555,6 +1554,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long flags; spin_lock_irqsave(&rq->lock, flags); + tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; tg->cfs_rq[cpu]->shares = boost ? 0 : shares; __set_se_shares(tg->se[cpu], shares); spin_unlock_irqrestore(&rq->lock, flags); @@ -1568,25 +1568,31 @@ update_group_shares_cpu(struct task_group *tg, int cpu, */ static int tg_shares_up(struct task_group *tg, void *data) { - unsigned long weight, rq_weight = 0, eff_weight = 0; - unsigned long shares = 0; + unsigned long weight, rq_weight = 0, shares = 0; + struct update_shares_data *usd; struct sched_domain *sd = data; + unsigned long flags; int i; + if (!tg->se[0]) + return 0; + + local_irq_save(flags); + usd = &__get_cpu_var(update_shares_data); + for_each_cpu(i, sched_domain_span(sd)) { + weight = tg->cfs_rq[i]->load.weight; + usd->rq_weight[i] = weight; + /* * If there are currently no tasks on the cpu pretend there * is one of average load so that when a new task gets to * run here it will not get delayed by group starvation. */ - weight = tg->cfs_rq[i]->load.weight; - tg->cfs_rq[i]->rq_weight = weight; - rq_weight += weight; - if (!weight) weight = NICE_0_LOAD; - eff_weight += weight; + rq_weight += weight; shares += tg->cfs_rq[i]->shares; } @@ -1597,7 +1603,9 @@ static int tg_shares_up(struct task_group *tg, void *data) shares = tg->shares; for_each_cpu(i, sched_domain_span(sd)) - update_group_shares_cpu(tg, i, shares, rq_weight, eff_weight); + update_group_shares_cpu(tg, i, shares, rq_weight, usd); + + local_irq_restore(flags); return 0; } -- cgit v1.2.2 From 84e9dabf6e6a78928c6a1a8ba235c9fb0908d0f8 Mon Sep 17 00:00:00 2001 From: Anirban Sinha Date: Fri, 28 Aug 2009 22:40:43 -0700 Subject: sched: Rename init_cfs_rq => init_tg_cfs_rq ... so that it does not share a common name with a function within the same scope. Signed-off-by: Anirban Sinha LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 523e20a62695..6244d24cafc1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -309,8 +309,8 @@ void set_tg_uid(struct user_struct *user) /* * Root task group. - * Every UID task group (including init_task_group aka UID-0) will - * be a child to this group. + * Every UID task group (including init_task_group aka UID-0) will + * be a child to this group. */ struct task_group root_task_group; @@ -318,7 +318,7 @@ struct task_group root_task_group; /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ -static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp; #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED @@ -9400,11 +9400,11 @@ void __init sched_init(void) * system cpu resource, based on the weight assigned to root * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished * by letting tasks of init_task_group sit in a separate cfs_rq - * (init_cfs_rq) and having one entity represent this group of + * (init_tg_cfs_rq) and having one entity represent this group of * tasks in rq->cfs (i.e init_task_group->se[] != NULL). */ init_tg_cfs_entry(&init_task_group, - &per_cpu(init_cfs_rq, i), + &per_cpu(init_tg_cfs_rq, i), &per_cpu(init_sched_entity, i), i, 1, root_task_group.se[i]); -- cgit v1.2.2 From 8f0dfc34e9b323a028c2ec41abb7e9de477b7a94 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 20 Jul 2009 11:26:58 -0700 Subject: sched: Provide iowait counters For counting how long an application has been waiting for (disk) IO, there currently is only the HZ sample driven information available, while for all other counters in this class, a high resolution version is available via CONFIG_SCHEDSTATS. In order to make an improved bootchart tool possible, we also need a higher resolution version of the iowait time. This patch below adds this scheduler statistic to the kernel. Signed-off-by: Arjan van de Ven Signed-off-by: Peter Zijlstra LKML-Reference: <4A64B813.1080506@linux.intel.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 6244d24cafc1..38d05a89e0f2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6754,7 +6754,9 @@ void __sched io_schedule(void) delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + current->in_iowait = 1; schedule(); + current->in_iowait = 0; atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); } @@ -6767,7 +6769,9 @@ long __sched io_schedule_timeout(long timeout) delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + current->in_iowait = 1; ret = schedule_timeout(timeout); + current->in_iowait = 0; atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); return ret; -- cgit v1.2.2 From f93e65c186ab3c05ce2068733ca10e34fd00125e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Sep 2009 10:34:32 +0200 Subject: sched: Restore __cpu_power to a straight sum of power cpu_power is supposed to be a representation of the process capacity of the cpu, not a value to randomly tweak in order to affect placement. Remove the placement hacks. Signed-off-by: Peter Zijlstra Tested-by: Andreas Herrmann Acked-by: Andreas Herrmann Acked-by: Gautham R Shenoy Cc: Balbir Singh LKML-Reference: <20090901083825.810860576@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index da1edc8277d0..584a122b553c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8464,15 +8464,13 @@ static void free_sched_groups(const struct cpumask *cpu_map, * there are asymmetries in the topology. If there are asymmetries, group * having more cpu_power will pickup more load compared to the group having * less cpu_power. - * - * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents - * the maximum number of tasks a group can handle in the presence of other idle - * or lightly loaded groups in the same sched domain. */ static void init_sched_groups_power(int cpu, struct sched_domain *sd) { struct sched_domain *child; struct sched_group *group; + long power; + int weight; WARN_ON(!sd || !sd->groups); @@ -8483,22 +8481,20 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) sd->groups->__cpu_power = 0; - /* - * For perf policy, if the groups in child domain share resources - * (for example cores sharing some portions of the cache hierarchy - * or SMT), then set this domain groups cpu_power such that each group - * can handle only one task, when there are other idle groups in the - * same sched domain. - */ - if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && - (child->flags & - (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { - sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); + if (!child) { + power = SCHED_LOAD_SCALE; + weight = cpumask_weight(sched_domain_span(sd)); + /* + * SMT siblings share the power of a single core. + */ + if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) + power /= weight; + sg_inc_cpu_power(sd->groups, power); return; } /* - * add cpu_power of each child group to this groups cpu_power + * Add cpu_power of each child group to this groups cpu_power. */ group = child->groups; do { -- cgit v1.2.2 From b5d978e0c7e79a7ff842e895c85a86b38c71f1cd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Sep 2009 10:34:33 +0200 Subject: sched: Add SD_PREFER_SIBLING Do the placement thing using SD flags. Signed-off-by: Peter Zijlstra Tested-by: Andreas Herrmann Acked-by: Andreas Herrmann Acked-by: Gautham R Shenoy Cc: Balbir Singh LKML-Reference: <20090901083825.897028974@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 584a122b553c..9d64cec9ae1d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3811,9 +3811,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, const struct cpumask *cpus, int *balance, struct sd_lb_stats *sds) { + struct sched_domain *child = sd->child; struct sched_group *group = sd->groups; struct sg_lb_stats sgs; - int load_idx; + int load_idx, prefer_sibling = 0; + + if (child && child->flags & SD_PREFER_SIBLING) + prefer_sibling = 1; init_sd_power_savings_stats(sd, sds, idle); load_idx = get_sd_load_idx(sd, idle); @@ -3833,6 +3837,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, sds->total_load += sgs.group_load; sds->total_pwr += group->__cpu_power; + /* + * In case the child domain prefers tasks go to siblings + * first, lower the group capacity to one so that we'll try + * and move all the excess tasks away. + */ + if (prefer_sibling) + sgs.group_capacity = 1; + if (local_group) { sds->this_load = sgs.avg_load; sds->this = group; -- cgit v1.2.2 From cc9fba7d7672fa3ed58d9d9ecb6c45b1351c29a6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Sep 2009 10:34:34 +0200 Subject: sched: Update the cpu_power sum during load-balance In order to prepare for a more dynamic cpu_power, update the group sum while walking the sched domains during load-balance. Signed-off-by: Peter Zijlstra Tested-by: Andreas Herrmann Acked-by: Andreas Herrmann Acked-by: Gautham R Shenoy Cc: Balbir Singh LKML-Reference: <20090901083825.985050292@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 9d64cec9ae1d..ecb4a47d4214 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3699,6 +3699,28 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, } #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ +static void update_sched_power(struct sched_domain *sd) +{ + struct sched_domain *child = sd->child; + struct sched_group *group, *sdg = sd->groups; + unsigned long power = sdg->__cpu_power; + + if (!child) { + /* compute cpu power for this cpu */ + return; + } + + sdg->__cpu_power = 0; + + group = child->groups; + do { + sdg->__cpu_power += group->__cpu_power; + group = group->next; + } while (group != child->groups); + + if (power != sdg->__cpu_power) + sdg->reciprocal_cpu_power = reciprocal_value(sdg->__cpu_power); +} /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. @@ -3712,7 +3734,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, * @balance: Should we balance. * @sgs: variable to hold the statistics for this group. */ -static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, +static inline void update_sg_lb_stats(struct sched_domain *sd, + struct sched_group *group, int this_cpu, enum cpu_idle_type idle, int load_idx, int *sd_idle, int local_group, const struct cpumask *cpus, int *balance, struct sg_lb_stats *sgs) @@ -3723,8 +3746,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, unsigned long sum_avg_load_per_task; unsigned long avg_load_per_task; - if (local_group) + if (local_group) { balance_cpu = group_first_cpu(group); + if (balance_cpu == this_cpu) + update_sched_power(sd); + } /* Tally up the load of all CPUs in the group */ sum_avg_load_per_task = avg_load_per_task = 0; @@ -3828,7 +3854,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, + update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, local_group, cpus, balance, &sgs); if (local_group && balance && !(*balance)) @@ -3863,7 +3889,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, update_sd_power_savings_stats(group, sds, local_group, &sgs); group = group->next; } while (group != sd->groups); - } /** -- cgit v1.2.2 From a52bfd73589eaf88d9c95ad2c1de0b38a6b27972 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Sep 2009 10:34:35 +0200 Subject: sched: Add smt_gain The idea is that multi-threading a core yields more work capacity than a single thread, provide a way to express a static gain for threads. Signed-off-by: Peter Zijlstra Tested-by: Andreas Herrmann Acked-by: Andreas Herrmann Acked-by: Gautham R Shenoy Cc: Balbir Singh LKML-Reference: <20090901083826.073345955@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index ecb4a47d4214..55112261027b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8523,9 +8523,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) weight = cpumask_weight(sched_domain_span(sd)); /* * SMT siblings share the power of a single core. + * Usually multiple threads get a better yield out of + * that one core than a single thread would have, + * reflect that in sd->smt_gain. */ - if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) + if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { + power *= sd->smt_gain; power /= weight; + power >>= SCHED_LOAD_SHIFT; + } sg_inc_cpu_power(sd->groups, power); return; } -- cgit v1.2.2 From ab29230e673c646292c90c8b9d378b9562145af0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Sep 2009 10:34:36 +0200 Subject: sched: Implement dynamic cpu_power Recompute the cpu_power for each cpu during load-balance. Signed-off-by: Peter Zijlstra Tested-by: Andreas Herrmann Acked-by: Andreas Herrmann Acked-by: Gautham R Shenoy Cc: Balbir Singh LKML-Reference: <20090901083826.162033479@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 55112261027b..036600fd70bb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3699,14 +3699,46 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, } #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -static void update_sched_power(struct sched_domain *sd) +unsigned long __weak arch_smt_gain(struct sched_domain *sd, int cpu) +{ + unsigned long weight = cpumask_weight(sched_domain_span(sd)); + unsigned long smt_gain = sd->smt_gain; + + smt_gain /= weight; + + return smt_gain; +} + +static void update_cpu_power(struct sched_domain *sd, int cpu) +{ + unsigned long weight = cpumask_weight(sched_domain_span(sd)); + unsigned long power = SCHED_LOAD_SCALE; + struct sched_group *sdg = sd->groups; + unsigned long old = sdg->__cpu_power; + + /* here we could scale based on cpufreq */ + + if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { + power *= arch_smt_gain(sd, cpu); + power >>= SCHED_LOAD_SHIFT; + } + + /* here we could scale based on RT time */ + + if (power != old) { + sdg->__cpu_power = power; + sdg->reciprocal_cpu_power = reciprocal_value(power); + } +} + +static void update_group_power(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; unsigned long power = sdg->__cpu_power; if (!child) { - /* compute cpu power for this cpu */ + update_cpu_power(sd, cpu); return; } @@ -3749,7 +3781,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, if (local_group) { balance_cpu = group_first_cpu(group); if (balance_cpu == this_cpu) - update_sched_power(sd); + update_group_power(sd, this_cpu); } /* Tally up the load of all CPUs in the group */ -- cgit v1.2.2 From e9e9250bc78e7f6342517214c0178a529807964b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Sep 2009 10:34:37 +0200 Subject: sched: Scale down cpu_power due to RT tasks Keep an average on the amount of time spend on RT tasks and use that fraction to scale down the cpu_power for regular tasks. Signed-off-by: Peter Zijlstra Tested-by: Andreas Herrmann Acked-by: Andreas Herrmann Acked-by: Gautham R Shenoy Cc: Balbir Singh LKML-Reference: <20090901083826.287778431@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 61 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 036600fd70bb..ab532b5de40e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -627,6 +627,9 @@ struct rq { struct task_struct *migration_thread; struct list_head migration_queue; + + u64 rt_avg; + u64 age_stamp; #endif /* calc_load related fields */ @@ -862,6 +865,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000; */ unsigned int sysctl_sched_shares_thresh = 4; +/* + * period over which we average the RT time consumption, measured + * in ms. + * + * default: 1s + */ +const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; + /* * period over which we measure -rt task cpu usage in us. * default: 1s @@ -1280,12 +1291,37 @@ void wake_up_idle_cpu(int cpu) } #endif /* CONFIG_NO_HZ */ +static u64 sched_avg_period(void) +{ + return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; +} + +static void sched_avg_update(struct rq *rq) +{ + s64 period = sched_avg_period(); + + while ((s64)(rq->clock - rq->age_stamp) > period) { + rq->age_stamp += period; + rq->rt_avg /= 2; + } +} + +static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) +{ + rq->rt_avg += rt_delta; + sched_avg_update(rq); +} + #else /* !CONFIG_SMP */ static void resched_task(struct task_struct *p) { assert_spin_locked(&task_rq(p)->lock); set_tsk_need_resched(p); } + +static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) +{ +} #endif /* CONFIG_SMP */ #if BITS_PER_LONG == 32 @@ -3699,7 +3735,7 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, } #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -unsigned long __weak arch_smt_gain(struct sched_domain *sd, int cpu) +unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) { unsigned long weight = cpumask_weight(sched_domain_span(sd)); unsigned long smt_gain = sd->smt_gain; @@ -3709,6 +3745,24 @@ unsigned long __weak arch_smt_gain(struct sched_domain *sd, int cpu) return smt_gain; } +unsigned long scale_rt_power(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + u64 total, available; + + sched_avg_update(rq); + + total = sched_avg_period() + (rq->clock - rq->age_stamp); + available = total - rq->rt_avg; + + if (unlikely((s64)total < SCHED_LOAD_SCALE)) + total = SCHED_LOAD_SCALE; + + total >>= SCHED_LOAD_SHIFT; + + return div_u64(available, total); +} + static void update_cpu_power(struct sched_domain *sd, int cpu) { unsigned long weight = cpumask_weight(sched_domain_span(sd)); @@ -3719,11 +3773,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) /* here we could scale based on cpufreq */ if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { - power *= arch_smt_gain(sd, cpu); + power *= arch_scale_smt_power(sd, cpu); power >>= SCHED_LOAD_SHIFT; } - /* here we could scale based on RT time */ + power *= scale_rt_power(cpu); + power >>= SCHED_LOAD_SHIFT; + + if (!power) + power = 1; if (power != old) { sdg->__cpu_power = power; -- cgit v1.2.2 From bdb94aa5dbd8b55e75f5a50b61312fe589e2c2d1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Sep 2009 10:34:38 +0200 Subject: sched: Try to deal with low capacity When the capacity drops low, we want to migrate load away. Allow the load-balancer to remove all tasks when we hit rock bottom. Signed-off-by: Peter Zijlstra Tested-by: Andreas Herrmann Acked-by: Andreas Herrmann Acked-by: Gautham R Shenoy Cc: Balbir Singh LKML-Reference: <20090901083826.342231003@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index ab532b5de40e..5f5b359b01b8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3908,8 +3908,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) sgs->group_imb = 1; - sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; - + sgs->group_capacity = + DIV_ROUND_CLOSEST(group->__cpu_power, SCHED_LOAD_SCALE); } /** @@ -3959,7 +3959,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, * and move all the excess tasks away. */ if (prefer_sibling) - sgs.group_capacity = 1; + sgs.group_capacity = min(sgs.group_capacity, 1UL); if (local_group) { sds->this_load = sgs.avg_load; @@ -4191,6 +4191,26 @@ ret: return NULL; } +static struct sched_group *group_of(int cpu) +{ + struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); + + if (!sd) + return NULL; + + return sd->groups; +} + +static unsigned long power_of(int cpu) +{ + struct sched_group *group = group_of(cpu); + + if (!group) + return SCHED_LOAD_SCALE; + + return group->__cpu_power; +} + /* * find_busiest_queue - find the busiest runqueue among the cpus in group. */ @@ -4203,15 +4223,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, int i; for_each_cpu(i, sched_group_cpus(group)) { + unsigned long power = power_of(i); + unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); unsigned long wl; if (!cpumask_test_cpu(i, cpus)) continue; rq = cpu_rq(i); - wl = weighted_cpuload(i); + wl = weighted_cpuload(i) * SCHED_LOAD_SCALE; + wl /= power; - if (rq->nr_running == 1 && wl > imbalance) + if (capacity && rq->nr_running == 1 && wl > imbalance) continue; if (wl > max_load) { -- cgit v1.2.2 From d899a789c28ded9c72b57ddb61868d6b8fc23e80 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 2 Sep 2009 16:59:10 +0530 Subject: sched: Try to deal with low capacity, fix update_sd_power_savings_stats() sgs.group_capacity can now be 0, if for some reason group->__cpu_power happens to be less than SCHED_LOAD_SCALE/2. In that case, we need the following fix to make it work for update_sd_power_savings_stats(). That's because both sum_nr_running and group_capacity are unsigned longs. Cc: Gautham R Shenoy Cc: Peter Zijlstra Cc: Andreas Herrmann Cc: Balbir Singh Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 5f5b359b01b8..e1ebf9b00f5d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3668,7 +3668,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group, * capacity but still has some space to pick up some load * from other group and save more power */ - if (sgs->sum_nr_running > sgs->group_capacity - 1) + if (sgs->sum_nr_running + 1 > sgs->group_capacity) return; if (sgs->sum_nr_running > sds->leader_nr_running || -- cgit v1.2.2 From 18a3885fc1ffa92c2212ff0afdf033403d5b0fa0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Sep 2009 10:34:39 +0200 Subject: sched: Remove reciprocal for cpu_power Its a source of fail, also, now that cpu_power is dynamical, its a waste of time. before: -0 [000] 132.877936: find_busiest_group: avg_load: 0 group_load: 8241 power: 1 after: bash-1689 [001] 137.862151: find_busiest_group: avg_load: 10636288 group_load: 10387 power: 1 [ v2: build fix from From: Andreas Herrmann ] Signed-off-by: Peter Zijlstra Tested-by: Andreas Herrmann Acked-by: Andreas Herrmann Acked-by: Gautham R Shenoy Cc: Balbir Singh LKML-Reference: <20090901083826.425896304@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 101 +++++++++++++++++++-------------------------------------- 1 file changed, 34 insertions(+), 67 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index e1ebf9b00f5d..b53785346850 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -64,7 +64,6 @@ #include #include #include -#include #include #include #include @@ -120,30 +119,8 @@ */ #define RUNTIME_INF ((u64)~0ULL) -#ifdef CONFIG_SMP - static void double_rq_lock(struct rq *rq1, struct rq *rq2); -/* - * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) - * Since cpu_power is a 'constant', we can use a reciprocal divide. - */ -static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) -{ - return reciprocal_divide(load, sg->reciprocal_cpu_power); -} - -/* - * Each time a sched group cpu_power is changed, - * we must compute its reciprocal value - */ -static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) -{ - sg->__cpu_power += val; - sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); -} -#endif - static inline int rt_policy(int policy) { if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) @@ -2335,8 +2312,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) } /* Adjust by relative CPU power of the group */ - avg_load = sg_div_cpu_power(group, - avg_load * SCHED_LOAD_SCALE); + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; if (local_group) { this_load = avg_load; @@ -3768,7 +3744,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) unsigned long weight = cpumask_weight(sched_domain_span(sd)); unsigned long power = SCHED_LOAD_SCALE; struct sched_group *sdg = sd->groups; - unsigned long old = sdg->__cpu_power; /* here we could scale based on cpufreq */ @@ -3783,33 +3758,26 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) if (!power) power = 1; - if (power != old) { - sdg->__cpu_power = power; - sdg->reciprocal_cpu_power = reciprocal_value(power); - } + sdg->cpu_power = power; } static void update_group_power(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long power = sdg->__cpu_power; if (!child) { update_cpu_power(sd, cpu); return; } - sdg->__cpu_power = 0; + sdg->cpu_power = 0; group = child->groups; do { - sdg->__cpu_power += group->__cpu_power; + sdg->cpu_power += group->cpu_power; group = group->next; } while (group != child->groups); - - if (power != sdg->__cpu_power) - sdg->reciprocal_cpu_power = reciprocal_value(sdg->__cpu_power); } /** @@ -3889,8 +3857,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, } /* Adjust by relative CPU power of the group */ - sgs->avg_load = sg_div_cpu_power(group, - sgs->group_load * SCHED_LOAD_SCALE); + sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; /* @@ -3902,14 +3869,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, * normalized nr_running number somewhere that negates * the hierarchy? */ - avg_load_per_task = sg_div_cpu_power(group, - sum_avg_load_per_task * SCHED_LOAD_SCALE); + avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) / + group->cpu_power; if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) sgs->group_imb = 1; sgs->group_capacity = - DIV_ROUND_CLOSEST(group->__cpu_power, SCHED_LOAD_SCALE); + DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); } /** @@ -3951,7 +3918,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, return; sds->total_load += sgs.group_load; - sds->total_pwr += group->__cpu_power; + sds->total_pwr += group->cpu_power; /* * In case the child domain prefers tasks go to siblings @@ -4016,28 +3983,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, * moving them. */ - pwr_now += sds->busiest->__cpu_power * + pwr_now += sds->busiest->cpu_power * min(sds->busiest_load_per_task, sds->max_load); - pwr_now += sds->this->__cpu_power * + pwr_now += sds->this->cpu_power * min(sds->this_load_per_task, sds->this_load); pwr_now /= SCHED_LOAD_SCALE; /* Amount of load we'd subtract */ - tmp = sg_div_cpu_power(sds->busiest, - sds->busiest_load_per_task * SCHED_LOAD_SCALE); + tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / + sds->busiest->cpu_power; if (sds->max_load > tmp) - pwr_move += sds->busiest->__cpu_power * + pwr_move += sds->busiest->cpu_power * min(sds->busiest_load_per_task, sds->max_load - tmp); /* Amount of load we'd add */ - if (sds->max_load * sds->busiest->__cpu_power < + if (sds->max_load * sds->busiest->cpu_power < sds->busiest_load_per_task * SCHED_LOAD_SCALE) - tmp = sg_div_cpu_power(sds->this, - sds->max_load * sds->busiest->__cpu_power); + tmp = (sds->max_load * sds->busiest->cpu_power) / + sds->this->cpu_power; else - tmp = sg_div_cpu_power(sds->this, - sds->busiest_load_per_task * SCHED_LOAD_SCALE); - pwr_move += sds->this->__cpu_power * + tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / + sds->this->cpu_power; + pwr_move += sds->this->cpu_power * min(sds->this_load_per_task, sds->this_load + tmp); pwr_move /= SCHED_LOAD_SCALE; @@ -4072,8 +4039,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, sds->max_load - sds->busiest_load_per_task); /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * sds->busiest->__cpu_power, - (sds->avg_load - sds->this_load) * sds->this->__cpu_power) + *imbalance = min(max_pull * sds->busiest->cpu_power, + (sds->avg_load - sds->this_load) * sds->this->cpu_power) / SCHED_LOAD_SCALE; /* @@ -4208,7 +4175,7 @@ static unsigned long power_of(int cpu) if (!group) return SCHED_LOAD_SCALE; - return group->__cpu_power; + return group->cpu_power; } /* @@ -7922,7 +7889,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!group->__cpu_power) { + if (!group->cpu_power) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: domain->cpu_power not " "set\n"); @@ -7946,9 +7913,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); - if (group->__cpu_power != SCHED_LOAD_SCALE) { - printk(KERN_CONT " (__cpu_power = %d)", - group->__cpu_power); + if (group->cpu_power != SCHED_LOAD_SCALE) { + printk(KERN_CONT " (cpu_power = %d)", + group->cpu_power); } group = group->next; @@ -8233,7 +8200,7 @@ init_sched_build_groups(const struct cpumask *span, continue; cpumask_clear(sched_group_cpus(sg)); - sg->__cpu_power = 0; + sg->cpu_power = 0; for_each_cpu(j, span) { if (group_fn(j, cpu_map, NULL, tmpmask) != group) @@ -8491,7 +8458,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) continue; } - sg_inc_cpu_power(sg, sd->groups->__cpu_power); + sg->cpu_power += sd->groups->cpu_power; } sg = sg->next; } while (sg != group_head); @@ -8528,7 +8495,7 @@ static int build_numa_sched_groups(struct s_data *d, sd->groups = sg; } - sg->__cpu_power = 0; + sg->cpu_power = 0; cpumask_copy(sched_group_cpus(sg), d->nodemask); sg->next = sg; cpumask_or(d->covered, d->covered, d->nodemask); @@ -8551,7 +8518,7 @@ static int build_numa_sched_groups(struct s_data *d, "Can not alloc domain group for node %d\n", j); return -ENOMEM; } - sg->__cpu_power = 0; + sg->cpu_power = 0; cpumask_copy(sched_group_cpus(sg), d->tmpmask); sg->next = prev->next; cpumask_or(d->covered, d->covered, d->tmpmask); @@ -8629,7 +8596,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) child = sd->child; - sd->groups->__cpu_power = 0; + sd->groups->cpu_power = 0; if (!child) { power = SCHED_LOAD_SCALE; @@ -8645,7 +8612,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) power /= weight; power >>= SCHED_LOAD_SHIFT; } - sg_inc_cpu_power(sd->groups, power); + sd->groups->cpu_power += power; return; } @@ -8654,7 +8621,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) */ group = child->groups; do { - sg_inc_cpu_power(sd->groups, group->__cpu_power); + sd->groups->cpu_power += group->cpu_power; group = group->next; } while (group != child->groups); } -- cgit v1.2.2 From d7ea17a76916e456fcc78e45142c66f7fb875e3d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 4 Sep 2009 11:49:25 +0200 Subject: sched: Fix dynamic power-balancing crash This crash: [ 1774.088275] divide error: 0000 [#1] SMP [ 1774.100355] CPU 13 [ 1774.102498] Modules linked in: [ 1774.105631] Pid: 30881, comm: hackbench Not tainted 2.6.31-rc8-tip-01308-g484d664-dirty #1629 X8DTN [ 1774.114807] RIP: 0010:[] [] sched_balance_self+0x19b/0x2d4 Triggers because update_group_power() modifies the sd tree and does temporary calculations there - not considering that other CPUs could observe intermediate values, such as the zero initial value. Calculate it in a temporary variable instead. (we need no memory barrier as these are all statistical values anyway) Acked-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <20090904092742.GA11014@elte.hu> Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index b53785346850..796baf731976 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3765,19 +3765,22 @@ static void update_group_power(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; + unsigned long power; if (!child) { update_cpu_power(sd, cpu); return; } - sdg->cpu_power = 0; + power = 0; group = child->groups; do { - sdg->cpu_power += group->cpu_power; + power += group->cpu_power; group = group->next; } while (group != child->groups); + + sdg->cpu_power = power; } /** -- cgit v1.2.2 From b78bb868c54bebbf8d8786a3f8320700d6d2b864 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Sep 2009 14:23:18 +0200 Subject: sched: Fix double_rq_lock() compile warning Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index e27a53685ed9..17e4391ec2de 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -119,8 +119,6 @@ */ #define RUNTIME_INF ((u64)~0ULL) -static void double_rq_lock(struct rq *rq1, struct rq *rq2); - static inline int rt_policy(int policy) { if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) @@ -1695,6 +1693,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) #ifdef CONFIG_PREEMPT +static void double_rq_lock(struct rq *rq1, struct rq *rq2); + /* * fair double_lock_balance: Safely acquires both rq->locks in a fair * way at the expense of forcing extra atomic operations in all -- cgit v1.2.2 From f5f08f39ee4c5fd0a757d25d9e04d696676b3df7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Sep 2009 13:35:28 +0200 Subject: sched: Move code around In preparation to other code movement, move weighted_cpuload(), source_load() and target_load() before the class includes. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 81 ++++++++++++++++++++++++++++------------------------------ 1 file changed, 39 insertions(+), 42 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 17e4391ec2de..b56d1505d058 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1507,8 +1507,45 @@ static int tg_nop(struct task_group *tg, void *data) #endif #ifdef CONFIG_SMP -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); +/* Used instead of source_load when we know the type == 0 */ +static unsigned long weighted_cpuload(const int cpu) +{ + return cpu_rq(cpu)->load.weight; +} + +/* + * Return a low guess at the load of a migration-source cpu weighted + * according to the scheduling class and "nice" value. + * + * We want to under-estimate the load of migration sources, to + * balance conservatively. + */ +static unsigned long source_load(int cpu, int type) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); + + if (type == 0 || !sched_feat(LB_BIAS)) + return total; + + return min(rq->cpu_load[type-1], total); +} + +/* + * Return a high guess at the load of a migration-target cpu weighted + * according to the scheduling class and "nice" value. + */ +static unsigned long target_load(int cpu, int type) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); + + if (type == 0 || !sched_feat(LB_BIAS)) + return total; + + return max(rq->cpu_load[type-1], total); +} + static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); static unsigned long cpu_avg_load_per_task(int cpu) @@ -1959,13 +1996,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, } #ifdef CONFIG_SMP - -/* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) -{ - return cpu_rq(cpu)->load.weight; -} - /* * Is this task likely cache-hot: */ @@ -2240,39 +2270,6 @@ void kick_process(struct task_struct *p) } EXPORT_SYMBOL_GPL(kick_process); -/* - * Return a low guess at the load of a migration-source cpu weighted - * according to the scheduling class and "nice" value. - * - * We want to under-estimate the load of migration sources, to - * balance conservatively. - */ -static unsigned long source_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return min(rq->cpu_load[type-1], total); -} - -/* - * Return a high guess at the load of a migration-target cpu weighted - * according to the scheduling class and "nice" value. - */ -static unsigned long target_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return max(rq->cpu_load[type-1], total); -} - /* * find_idlest_group finds and returns the least busy CPU group within the * domain. -- cgit v1.2.2 From aaee1203ca52b9db799433c33c9bffc33cdf8909 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Sep 2009 13:36:25 +0200 Subject: sched: Move sched_balance_self() into sched_fair.c Move the sched_balance_self() code into sched_fair.c This facilitates the merger of sched_balance_self() and sched_fair::select_task_rq(). Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 146 --------------------------------------------------------- 1 file changed, 146 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index b56d1505d058..60400a22401f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2269,152 +2269,6 @@ void kick_process(struct task_struct *p) preempt_enable(); } EXPORT_SYMBOL_GPL(kick_process); - -/* - * find_idlest_group finds and returns the least busy CPU group within the - * domain. - */ -static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) -{ - struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; - unsigned long min_load = ULONG_MAX, this_load = 0; - int load_idx = sd->forkexec_idx; - int imbalance = 100 + (sd->imbalance_pct-100)/2; - - do { - unsigned long load, avg_load; - int local_group; - int i; - - /* Skip over this group if it has no CPUs allowed */ - if (!cpumask_intersects(sched_group_cpus(group), - &p->cpus_allowed)) - continue; - - local_group = cpumask_test_cpu(this_cpu, - sched_group_cpus(group)); - - /* Tally up the load of all CPUs in the group */ - avg_load = 0; - - for_each_cpu(i, sched_group_cpus(group)) { - /* Bias balancing toward cpus of our domain */ - if (local_group) - load = source_load(i, load_idx); - else - load = target_load(i, load_idx); - - avg_load += load; - } - - /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; - - if (local_group) { - this_load = avg_load; - this = group; - } else if (avg_load < min_load) { - min_load = avg_load; - idlest = group; - } - } while (group = group->next, group != sd->groups); - - if (!idlest || 100*this_load < imbalance*min_load) - return NULL; - return idlest; -} - -/* - * find_idlest_cpu - find the idlest cpu among the cpus in group. - */ -static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) -{ - unsigned long load, min_load = ULONG_MAX; - int idlest = -1; - int i; - - /* Traverse only the allowed CPUs */ - for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { - load = weighted_cpuload(i); - - if (load < min_load || (load == min_load && i == this_cpu)) { - min_load = load; - idlest = i; - } - } - - return idlest; -} - -/* - * sched_balance_self: balance the current task (running on cpu) in domains - * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and - * SD_BALANCE_EXEC. - * - * Balance, ie. select the least loaded group. - * - * Returns the target CPU number, or the same CPU if no balancing is needed. - * - * preempt must be disabled. - */ -static int sched_balance_self(int cpu, int flag) -{ - struct task_struct *t = current; - struct sched_domain *tmp, *sd = NULL; - - for_each_domain(cpu, tmp) { - /* - * If power savings logic is enabled for a domain, stop there. - */ - if (tmp->flags & SD_POWERSAVINGS_BALANCE) - break; - if (tmp->flags & flag) - sd = tmp; - } - - if (sd) - update_shares(sd); - - while (sd) { - struct sched_group *group; - int new_cpu, weight; - - if (!(sd->flags & flag)) { - sd = sd->child; - continue; - } - - group = find_idlest_group(sd, t, cpu); - if (!group) { - sd = sd->child; - continue; - } - - new_cpu = find_idlest_cpu(group, t, cpu); - if (new_cpu == -1 || new_cpu == cpu) { - /* Now try balancing at a lower domain level of cpu */ - sd = sd->child; - continue; - } - - /* Now try balancing at a lower domain level of new_cpu */ - cpu = new_cpu; - weight = cpumask_weight(sched_domain_span(sd)); - sd = NULL; - for_each_domain(cpu, tmp) { - if (weight <= cpumask_weight(sched_domain_span(tmp))) - break; - if (tmp->flags & flag) - sd = tmp; - } - /* while loop will break here if sd == NULL */ - } - - return cpu; -} - #endif /* CONFIG_SMP */ /** -- cgit v1.2.2 From 5f3edc1b1ead6d9bd45a85c551f44eff8fe76b9f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Sep 2009 13:42:00 +0200 Subject: sched: Hook sched_balance_self() into sched_class::select_task_rq() Rather ugly patch to fully place the sched_balance_self() code inside the fair class. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 60400a22401f..32b7a81230c2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2350,7 +2350,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (unlikely(task_running(rq, p))) goto out_activate; - cpu = p->sched_class->select_task_rq(p, sync); + cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, sync); if (cpu != orig_cpu) { set_task_cpu(p, cpu); task_rq_unlock(rq, &flags); @@ -2525,11 +2525,6 @@ void sched_fork(struct task_struct *p, int clone_flags) __sched_fork(p); -#ifdef CONFIG_SMP - cpu = sched_balance_self(cpu, SD_BALANCE_FORK); -#endif - set_task_cpu(p, cpu); - /* * Make sure we do not leak PI boosting priority to the child. */ @@ -2560,6 +2555,11 @@ void sched_fork(struct task_struct *p, int clone_flags) if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; +#ifdef CONFIG_SMP + cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); +#endif + set_task_cpu(p, cpu); + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -3114,7 +3114,7 @@ out: void sched_exec(void) { int new_cpu, this_cpu = get_cpu(); - new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); + new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); put_cpu(); if (new_cpu != this_cpu) sched_migrate_task(current, new_cpu); -- cgit v1.2.2 From e9c8431185d6c406887190519f6dbdd112641686 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Sep 2009 14:43:03 +0200 Subject: sched: Add TASK_WAKING We're going to want to drop rq->lock in try_to_wake_up() for a longer period of time, however we also want to deal with concurrent waking of the same task, which is currently handled by holding rq->lock. So introduce a new TASK state, namely TASK_WAKING, which indicates someone is already waking the task (other wakers will fail p->state & state). We also keep preemption disabled over the whole ttwu(). Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 32b7a81230c2..fc6fda881d2e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2310,7 +2310,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) { int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; - long old_state; struct rq *rq; if (!sched_feat(SYNC_WAKEUPS)) @@ -2332,11 +2331,12 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) } #endif + this_cpu = get_cpu(); + smp_wmb(); rq = task_rq_lock(p, &flags); update_rq_clock(rq); - old_state = p->state; - if (!(old_state & state)) + if (!(p->state & state)) goto out; if (p->se.on_rq) @@ -2344,27 +2344,25 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) cpu = task_cpu(p); orig_cpu = cpu; - this_cpu = smp_processor_id(); #ifdef CONFIG_SMP if (unlikely(task_running(rq, p))) goto out_activate; + /* + * In order to handle concurrent wakeups and release the rq->lock + * we put the task in TASK_WAKING state. + */ + p->state = TASK_WAKING; + task_rq_unlock(rq, &flags); + cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, sync); - if (cpu != orig_cpu) { + if (cpu != orig_cpu) set_task_cpu(p, cpu); - task_rq_unlock(rq, &flags); - /* might preempt at this point */ - rq = task_rq_lock(p, &flags); - old_state = p->state; - if (!(old_state & state)) - goto out; - if (p->se.on_rq) - goto out_running; - this_cpu = smp_processor_id(); - cpu = task_cpu(p); - } + rq = task_rq_lock(p, &flags); + WARN_ON(p->state != TASK_WAKING); + cpu = task_cpu(p); #ifdef CONFIG_SCHEDSTATS schedstat_inc(rq, ttwu_count); @@ -2422,6 +2420,7 @@ out_running: #endif out: task_rq_unlock(rq, &flags); + put_cpu(); return success; } -- cgit v1.2.2 From c88d5910890ad35af283344417891344604f0438 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Sep 2009 13:50:02 +0200 Subject: sched: Merge select_task_rq_fair() and sched_balance_self() The problem with wake_idle() is that is doesn't respect things like cpu_power, which means it doesn't deal well with SMT nor the recent RT interaction. To cure this, it needs to do what sched_balance_self() does, which leads to the possibility of merging select_task_rq_fair() and sched_balance_self(). Modify sched_balance_self() to: - update_shares() when walking up the domain tree, (it only called it for the top domain, but it should have done this anyway), which allows us to remove this ugly bit from try_to_wake_up(). - do wake_affine() on the smallest domain that contains both this (the waking) and the prev (the wakee) cpu for WAKE invocations. Then use the top-down balance steps it had to replace wake_idle(). This leads to the dissapearance of SD_WAKE_BALANCE and SD_WAKE_IDLE_FAR, with SD_WAKE_IDLE replaced with SD_BALANCE_WAKE. SD_WAKE_AFFINE needs SD_BALANCE_WAKE to be effective. Touch all topology bits to replace the old with new SD flags -- platforms might need re-tuning, enabling SD_BALANCE_WAKE conditionally on a NUMA distance seems like a good additional feature, magny-core and small nehalem systems would want this enabled, systems with slow interconnects would not. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 41 +++-------------------------------------- 1 file changed, 3 insertions(+), 38 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index fc6fda881d2e..6c819f338b11 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -512,14 +512,6 @@ struct root_domain { #ifdef CONFIG_SMP struct cpupri cpupri; #endif -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - /* - * Preferred wake up cpu nominated by sched_mc balance that will be - * used when most cpus are idle in the system indicating overall very - * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2) - */ - unsigned int sched_mc_preferred_wakeup_cpu; -#endif }; /* @@ -2315,22 +2307,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (!sched_feat(SYNC_WAKEUPS)) sync = 0; -#ifdef CONFIG_SMP - if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) { - struct sched_domain *sd; - - this_cpu = raw_smp_processor_id(); - cpu = task_cpu(p); - - for_each_domain(this_cpu, sd) { - if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - update_shares(sd); - break; - } - } - } -#endif - this_cpu = get_cpu(); smp_wmb(); @@ -3533,11 +3509,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, *imbalance = sds->min_load_per_task; sds->busiest = sds->group_min; - if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { - cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = - group_first_cpu(sds->group_leader); - } - return 1; } @@ -7850,9 +7821,7 @@ static int sd_degenerate(struct sched_domain *sd) } /* Following flags don't use groups */ - if (sd->flags & (SD_WAKE_IDLE | - SD_WAKE_AFFINE | - SD_WAKE_BALANCE)) + if (sd->flags & (SD_WAKE_AFFINE)) return 0; return 1; @@ -7869,10 +7838,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) return 0; - /* Does parent contain flags not in child? */ - /* WAKE_BALANCE is a subset of WAKE_AFFINE */ - if (cflags & SD_WAKE_AFFINE) - pflags &= ~SD_WAKE_BALANCE; /* Flags needing groups don't count if only 1 group in parent */ if (parent->groups == parent->groups->next) { pflags &= ~(SD_LOAD_BALANCE | @@ -8558,10 +8523,10 @@ static void set_domain_attribute(struct sched_domain *sd, request = attr->relax_domain_level; if (request < sd->level) { /* turn off idle balance on this domain */ - sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); + sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); } else { /* turn on idle balance on this domain */ - sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); + sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); } } -- cgit v1.2.2 From ae154be1f34a674e6cbb43ccf6e442f56acd7a70 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Sep 2009 14:40:57 +0200 Subject: sched: Weaken SD_POWERSAVINGS_BALANCE One of the problems of power-saving balancing is that under certain scenarios it is too slow and allows tons of real work to pile up. Avoid this by ignoring the powersave stuff when there's real work to be done. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 6c819f338b11..f0ccb8b926c8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1538,6 +1538,26 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } +static struct sched_group *group_of(int cpu) +{ + struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); + + if (!sd) + return NULL; + + return sd->groups; +} + +static unsigned long power_of(int cpu) +{ + struct sched_group *group = group_of(cpu); + + if (!group) + return SCHED_LOAD_SCALE; + + return group->cpu_power; +} + static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); static unsigned long cpu_avg_load_per_task(int cpu) @@ -3982,26 +4002,6 @@ ret: return NULL; } -static struct sched_group *group_of(int cpu) -{ - struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); - - if (!sd) - return NULL; - - return sd->groups; -} - -static unsigned long power_of(int cpu) -{ - struct sched_group *group = group_of(cpu); - - if (!group) - return SCHED_LOAD_SCALE; - - return group->cpu_power; -} - /* * find_busiest_queue - find the busiest runqueue among the cpus in group. */ -- cgit v1.2.2 From d6a59aa3a2b1ca8411884c833a313b33b5f76e20 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Sep 2009 13:28:02 +0200 Subject: sched: Provide arch_scale_freq_power Provide an ach specific hook for cpufreq based scaling of cpu_power. Signed-off-by: Peter Zijlstra [ego@in.ibm.com: spotting bugs] LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f0ccb8b926c8..c210321adcb9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3552,7 +3552,18 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, } #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) + +unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) +{ + return SCHED_LOAD_SCALE; +} + +unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) +{ + return default_scale_freq_power(sd, cpu); +} + +unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) { unsigned long weight = cpumask_weight(sched_domain_span(sd)); unsigned long smt_gain = sd->smt_gain; @@ -3562,6 +3573,11 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) return smt_gain; } +unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) +{ + return default_scale_smt_power(sd, cpu); +} + unsigned long scale_rt_power(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -3586,7 +3602,8 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) unsigned long power = SCHED_LOAD_SCALE; struct sched_group *sdg = sd->groups; - /* here we could scale based on cpufreq */ + power *= arch_scale_freq_power(sd, cpu); + power >>= SCHED_LOAD_SHIFT; if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { power *= arch_scale_smt_power(sd, cpu); -- cgit v1.2.2 From 8e6598af3f35629c37249a610cf13e73f70db279 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Sep 2009 13:20:03 +0200 Subject: sched: Feature to disable APERF/MPERF cpu_power I suspect a feed-back loop between cpuidle and the aperf/mperf cpu_power bits, where when we have idle C-states lower the ratio, which leads to lower cpu_power and then less load, which generates more idle time, etc.. Put in a knob to disable it. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index c210321adcb9..e8e603bf8761 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3602,11 +3602,19 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) unsigned long power = SCHED_LOAD_SCALE; struct sched_group *sdg = sd->groups; - power *= arch_scale_freq_power(sd, cpu); + if (sched_feat(ARCH_POWER)) + power *= arch_scale_freq_power(sd, cpu); + else + power *= default_scale_freq_power(sd, cpu); + power >>= SCHED_LOAD_SHIFT; if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { - power *= arch_scale_smt_power(sd, cpu); + if (sched_feat(ARCH_POWER)) + power *= arch_scale_smt_power(sd, cpu); + else + power *= default_scale_smt_power(sd, cpu); + power >>= SCHED_LOAD_SHIFT; } -- cgit v1.2.2 From 7d47872146398dbede13223299fe1cb368ebc781 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Sep 2009 19:55:44 +0200 Subject: sched: Rename sync arguments In order to extend the functions to have more than 1 flag (sync), rename the argument to flags, and explicitly define a WF_ space for individual flags. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index e8e603bf8761..4da335cec8ee 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -636,9 +636,10 @@ struct rq { static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) +static inline +void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { - rq->curr->sched_class->check_preempt_curr(rq, p, sync); + rq->curr->sched_class->check_preempt_curr(rq, p, flags); } static inline int cpu_of(struct rq *rq) @@ -2318,14 +2319,15 @@ void task_oncpu_function_call(struct task_struct *p, * * returns failure only if the task is already active. */ -static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) +static int try_to_wake_up(struct task_struct *p, unsigned int state, + int wake_flags) { int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; struct rq *rq; if (!sched_feat(SYNC_WAKEUPS)) - sync = 0; + wake_flags &= ~WF_SYNC; this_cpu = get_cpu(); @@ -2352,7 +2354,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) p->state = TASK_WAKING; task_rq_unlock(rq, &flags); - cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, sync); + cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); if (cpu != orig_cpu) set_task_cpu(p, cpu); @@ -2378,7 +2380,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) out_activate: #endif /* CONFIG_SMP */ schedstat_inc(p, se.nr_wakeups); - if (sync) + if (wake_flags & WF_SYNC) schedstat_inc(p, se.nr_wakeups_sync); if (orig_cpu != cpu) schedstat_inc(p, se.nr_wakeups_migrate); @@ -2407,7 +2409,7 @@ out_activate: out_running: trace_sched_wakeup(rq, p, success); - check_preempt_curr(rq, p, sync); + check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; #ifdef CONFIG_SMP @@ -5562,10 +5564,10 @@ asmlinkage void __sched preempt_schedule_irq(void) #endif /* CONFIG_PREEMPT */ -int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, +int default_wake_function(wait_queue_t *curr, unsigned mode, int flags, void *key) { - return try_to_wake_up(curr->private, mode, sync); + return try_to_wake_up(curr->private, mode, flags); } EXPORT_SYMBOL(default_wake_function); @@ -5579,14 +5581,14 @@ EXPORT_SYMBOL(default_wake_function); * zero in this (rare) case, and we handle it by continuing to scan the queue. */ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int sync, void *key) + int nr_exclusive, int flags, void *key) { wait_queue_t *curr, *next; list_for_each_entry_safe(curr, next, &q->task_list, task_list) { unsigned flags = curr->flags; - if (curr->func(curr, mode, sync, key) && + if (curr->func(curr, mode, flags, key) && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; } @@ -5647,16 +5649,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) { unsigned long flags; - int sync = 1; + int wake_flags = WF_SYNC; if (unlikely(!q)) return; if (unlikely(!nr_exclusive)) - sync = 0; + wake_flags = 0; spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, sync, key); + __wake_up_common(q, mode, nr_exclusive, wake_flags, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL_GPL(__wake_up_sync_key); -- cgit v1.2.2 From a7558e01056f5191ff2ecff53b075dcb9e484188 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Sep 2009 20:02:34 +0200 Subject: sched: Add WF_FORK Avoid the cache buddies from biasing the time distribution away from fork()ers. Normally the next buddy will be the preferred scheduling target, but this makes fork()s prefer to run the new child, whereas we prefer to run the parent, since that will generate more work. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 4da335cec8ee..0d4c4fea3317 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2602,7 +2602,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) inc_nr_running(rq); } trace_sched_wakeup_new(rq, p, 1); - check_preempt_curr(rq, p, 0); + check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); -- cgit v1.2.2 From 63859d4fe4c97b737e7adbfe60acb1c2b2e668cb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Sep 2009 19:14:42 +0200 Subject: sched: Fix sync wakeups again The sync argument rename to introduce WF_* broke stuff by missing a local alias for an argument in __wake_up_common, fix it by using the more descriptive wake_flags name. This restores WF_SYNC propagation, which fixes wake_affine() behaviour, which fixes pipe-test. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 0d4c4fea3317..af04ede6dd2f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5564,10 +5564,10 @@ asmlinkage void __sched preempt_schedule_irq(void) #endif /* CONFIG_PREEMPT */ -int default_wake_function(wait_queue_t *curr, unsigned mode, int flags, +int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, void *key) { - return try_to_wake_up(curr->private, mode, flags); + return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); @@ -5581,14 +5581,14 @@ EXPORT_SYMBOL(default_wake_function); * zero in this (rare) case, and we handle it by continuing to scan the queue. */ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int flags, void *key) + int nr_exclusive, int wake_flags, void *key) { wait_queue_t *curr, *next; list_for_each_entry_safe(curr, next, &q->task_list, task_list) { unsigned flags = curr->flags; - if (curr->func(curr, mode, flags, key) && + if (curr->func(curr, mode, wake_flags, key) && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; } -- cgit v1.2.2 From 3b6408942206f940dd538e980e9904e48f4b64f8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Sep 2009 13:44:33 +0200 Subject: sched: Optimize cgroup vs wakeup a bit We don't need to call update_shares() for each domain we iterate, just got the largets one. However, we should call it before wake_affine() as well, so that that can use up-to-date values too. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index af04ede6dd2f..5049d959bb26 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -376,13 +376,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #else -#ifdef CONFIG_SMP -static int root_task_group_empty(void) -{ - return 1; -} -#endif - static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } static inline struct task_group *task_group(struct task_struct *p) { -- cgit v1.2.2 From eb24073bc1fe3e569a855cf38d529fb650c35524 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 16 Sep 2009 21:09:13 +0200 Subject: sched: Fix TASK_WAKING & loadaverage breakage Fix this: top - 21:54:00 up 2:59, 1 user, load average: 432512.33, 426421.74, 417432.74 Which happens because we now set TASK_WAKING before activate_task(). Cc: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 5049d959bb26..969dfaef2465 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2343,7 +2343,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, /* * In order to handle concurrent wakeups and release the rq->lock * we put the task in TASK_WAKING state. + * + * First fix up the nr_uninterruptible count: */ + if (task_contributes_to_load(p)) + rq->nr_uninterruptible--; p->state = TASK_WAKING; task_rq_unlock(rq, &flags); -- cgit v1.2.2 From ad4b78bbcbab66998b05d422ac6106b645796e54 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Sep 2009 12:31:31 +0200 Subject: sched: Add new wakeup preemption mode: WAKEUP_RUNNING Create a new wakeup preemption mode, preempt towards tasks that run shorter on avg. It sets next buddy to be sure we actually run the task we preempted for. Test results: root@twins:~# while :; do :; done & [1] 6537 root@twins:~# while :; do :; done & [2] 6538 root@twins:~# while :; do :; done & [3] 6539 root@twins:~# while :; do :; done & [4] 6540 root@twins:/home/peter# ./latt -c4 sleep 4 Entries: 48 (clients=4) Averages: ------------------------------ Max 4750 usec Avg 497 usec Stdev 737 usec root@twins:/home/peter# echo WAKEUP_RUNNING > /debug/sched_features root@twins:/home/peter# ./latt -c4 sleep 4 Entries: 48 (clients=4) Averages: ------------------------------ Max 14 usec Avg 5 usec Stdev 3 usec Disabled by default - needs more testing. Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Signed-off-by: Ingo Molnar LKML-Reference: --- kernel/sched.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 969dfaef2465..3bb4ea2ee6f0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2458,6 +2458,7 @@ static void __sched_fork(struct task_struct *p) p->se.avg_overlap = 0; p->se.start_runtime = 0; p->se.avg_wakeup = sysctl_sched_wakeup_granularity; + p->se.avg_running = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; @@ -5310,14 +5311,13 @@ static inline void schedule_debug(struct task_struct *prev) #endif } -static void put_prev_task(struct rq *rq, struct task_struct *prev) +static void put_prev_task(struct rq *rq, struct task_struct *p) { - if (prev->state == TASK_RUNNING) { - u64 runtime = prev->se.sum_exec_runtime; + u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; - runtime -= prev->se.prev_sum_exec_runtime; - runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); + update_avg(&p->se.avg_running, runtime); + if (p->state == TASK_RUNNING) { /* * In order to avoid avg_overlap growing stale when we are * indeed overlapping and hence not getting put to sleep, grow @@ -5327,9 +5327,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev) * correlates to the amount of cache footprint a task can * build up. */ - update_avg(&prev->se.avg_overlap, runtime); + runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); + update_avg(&p->se.avg_overlap, runtime); + } else { + update_avg(&p->se.avg_running, 0); } - prev->sched_class->put_prev_task(rq, prev); + p->sched_class->put_prev_task(rq, p); } /* -- cgit v1.2.2