From e864c499d9e57805ae1f9e7ea404dd223759cd53 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:49 -0500 Subject: sched: track the next-highest priority on each runqueue We will use this later in the series to reduce the amount of rq-lock contention during a pull operation Signed-off-by: Gregory Haskins --- kernel/sched.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 756d981d91a4..7729f9a45a8b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -463,7 +463,10 @@ struct rt_rq { struct rt_prio_array active; unsigned long rt_nr_running; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - int highest_prio; /* highest queued rt task prio */ + struct { + int curr; /* highest queued rt task prio */ + int next; /* next highest */ + } highest_prio; #endif #ifdef CONFIG_SMP unsigned long rt_nr_migratory; @@ -8169,7 +8172,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) __set_bit(MAX_RT_PRIO, array->bitmap); #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - rt_rq->highest_prio = MAX_RT_PRIO; + rt_rq->highest_prio.curr = MAX_RT_PRIO; + rt_rq->highest_prio.next = MAX_RT_PRIO; #endif #ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; -- cgit v1.2.2 From 7e96fa5875d4a9be18d74d3ca7b90518d05bc426 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:50 -0500 Subject: sched: pull only one task during NEWIDLE balancing to limit critical section git-id c4acb2c0669c5c5c9b28e9d02a34b5c67edf7092 attempted to limit newidle critical section length by stopping after at least one task was moved. Further investigation has shown that there are other paths nested further inside the algorithm which still remain that allow long latencies to occur with newidle balancing. This patch applies the same technique inside balance_tasks() to limit the duration of this optional balancing operation. Signed-off-by: Gregory Haskins CC: Nick Piggin --- kernel/sched.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 7729f9a45a8b..94d9a6c5ff94 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2984,6 +2984,16 @@ next: pulled++; rem_load_move -= p->se.load.weight; +#ifdef CONFIG_PREEMPT + /* + * NEWIDLE balancing is a source of latency, so preemptible kernels + * will stop after the first task is pulled to minimize the critical + * section. + */ + if (idle == CPU_NEWLY_IDLE) + goto out; +#endif + /* * We only want to steal up to the prescribed amount of weighted load. */ @@ -3030,9 +3040,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, sd, idle, all_pinned, &this_best_prio); class = class->next; +#ifdef CONFIG_PREEMPT + /* + * NEWIDLE balancing is a source of latency, so preemptible + * kernels will stop after the first task is pulled to minimize + * the critical section. + */ if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) break; - +#endif } while (class && max_load_move > total_load_moved); return total_load_moved > 0; -- cgit v1.2.2 From 8f45e2b516201d1bf681e6026fa5276385def565 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:51 -0500 Subject: sched: make double-lock-balance fair double_lock balance() currently favors logically lower cpus since they often do not have to release their own lock to acquire a second lock. The result is that logically higher cpus can get starved when there is a lot of pressure on the RQs. This can result in higher latencies on higher cpu-ids. This patch makes the algorithm more fair by forcing all paths to have to release both locks before acquiring them again. Since callsites to double_lock_balance already consider it a potential preemption/reschedule point, they have the proper logic to recheck for atomicity violations. Signed-off-by: Gregory Haskins --- kernel/sched.c | 51 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 7 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 94d9a6c5ff94..8fca364f3593 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1608,21 +1608,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) #endif +#ifdef CONFIG_PREEMPT + /* - * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + * fair double_lock_balance: Safely acquires both rq->locks in a fair + * way at the expense of forcing extra atomic operations in all + * invocations. This assures that the double_lock is acquired using the + * same underlying policy as the spinlock_t on this architecture, which + * reduces latency compared to the unfair variant below. However, it + * also adds more overhead and therefore may reduce throughput. */ -static int double_lock_balance(struct rq *this_rq, struct rq *busiest) +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + spin_unlock(&this_rq->lock); + double_rq_lock(this_rq, busiest); + + return 1; +} + +#else +/* + * Unfair double_lock_balance: Optimizes throughput at the expense of + * latency by eliminating extra atomic operations when the locks are + * already in proper order on entry. This favors lower cpu-ids and will + * grant the double lock to lower cpus over higher ids under contention, + * regardless of entry order into the function. + */ +static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) __releases(this_rq->lock) __acquires(busiest->lock) __acquires(this_rq->lock) { int ret = 0; - if (unlikely(!irqs_disabled())) { - /* printk() doesn't work good under rq->lock */ - spin_unlock(&this_rq->lock); - BUG_ON(1); - } if (unlikely(!spin_trylock(&busiest->lock))) { if (busiest < this_rq) { spin_unlock(&this_rq->lock); @@ -1635,6 +1656,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) return ret; } +#endif /* CONFIG_PREEMPT */ + +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static int double_lock_balance(struct rq *this_rq, struct rq *busiest) +{ + if (unlikely(!irqs_disabled())) { + /* printk() doesn't work good under rq->lock */ + spin_unlock(&this_rq->lock); + BUG_ON(1); + } + + return _double_lock_balance(this_rq, busiest); +} + static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) __releases(busiest->lock) { -- cgit v1.2.2 From 967fc04671feea4dbf780c9e55a0bc8fcf68a14e Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:52 -0500 Subject: sched: add sched_class->needs_post_schedule() member We currently run class->post_schedule() outside of the rq->lock, which means that we need to test for the need to post_schedule outside of the lock to avoid a forced reacquistion. This is currently not a problem as we only look at rq->rt.overloaded. However, we want to enhance this going forward to look at more state to reduce the need to post_schedule to a bare minimum set. Therefore, we introduce a new member-func called needs_post_schedule() which tests for the post_schedule condtion without actually performing the work. Therefore it is safe to call this function before the rq->lock is released, because we are guaranteed not to drop the lock at an intermediate point (such as what post_schedule() may do). We will use this later in the series [ rostedt: removed paranoid BUG_ON ] Signed-off-by: Gregory Haskins --- kernel/sched.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 8fca364f3593..3acbad8991a2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2621,6 +2621,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) { struct mm_struct *mm = rq->prev_mm; long prev_state; +#ifdef CONFIG_SMP + int post_schedule = 0; + + if (current->sched_class->needs_post_schedule) + post_schedule = current->sched_class->needs_post_schedule(rq); +#endif rq->prev_mm = NULL; @@ -2639,7 +2645,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) finish_arch_switch(prev); finish_lock_switch(rq, prev); #ifdef CONFIG_SMP - if (current->sched_class->post_schedule) + if (post_schedule) current->sched_class->post_schedule(rq); #endif -- cgit v1.2.2 From 917b627d4d981dc614519d7b34ea31a976b14e12 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:53 -0500 Subject: sched: create "pushable_tasks" list to limit pushing to one attempt The RT scheduler employs a "push/pull" design to actively balance tasks within the system (on a per disjoint cpuset basis). When a task is awoken, it is immediately determined if there are any lower priority cpus which should be preempted. This is opposed to the way normal SCHED_OTHER tasks behave, which will wait for a periodic rebalancing operation to occur before spreading out load. When a particular RQ has more than 1 active RT task, it is said to be in an "overloaded" state. Once this occurs, the system enters the active balancing mode, where it will try to push the task away, or persuade a different cpu to pull it over. The system will stay in this state until the system falls back below the <= 1 queued RT task per RQ. However, the current implementation suffers from a limitation in the push logic. Once overloaded, all tasks (other than current) on the RQ are analyzed on every push operation, even if it was previously unpushable (due to affinity, etc). Whats more, the operation stops at the first task that is unpushable and will not look at items lower in the queue. This causes two problems: 1) We can have the same tasks analyzed over and over again during each push, which extends out the fast path in the scheduler for no gain. Consider a RQ that has dozens of tasks that are bound to a core. Each one of those tasks will be encountered and skipped for each push operation while they are queued. 2) There may be lower-priority tasks under the unpushable task that could have been successfully pushed, but will never be considered until either the unpushable task is cleared, or a pull operation succeeds. The net result is a potential latency source for mid priority tasks. This patch aims to rectify these two conditions by introducing a new priority sorted list: "pushable_tasks". A task is added to the list each time a task is activated or preempted. It is removed from the list any time it is deactivated, made current, or fails to push. This works because a task only needs to be attempted to push once. After an initial failure to push, the other cpus will eventually try to pull the task when the conditions are proper. This also solves the problem that we don't completely analyze all tasks due to encountering an unpushable tasks. Now every task will have a push attempted (when appropriate). This reduces latency both by shorting the critical section of the rq->lock for certain workloads, and by making sure the algorithm considers all eligible tasks in the system. [ rostedt: added a couple more BUG_ONs ] Signed-off-by: Gregory Haskins Acked-by: Steven Rostedt --- kernel/sched.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3acbad8991a2..24ab80c28765 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -471,6 +471,7 @@ struct rt_rq { #ifdef CONFIG_SMP unsigned long rt_nr_migratory; int overloaded; + struct plist_head pushable_tasks; #endif int rt_throttled; u64 rt_time; @@ -2481,6 +2482,8 @@ void sched_fork(struct task_struct *p, int clone_flags) /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif + plist_node_init(&p->pushable_tasks, MAX_PRIO); + put_cpu(); } @@ -8237,6 +8240,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) #ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; + plist_head_init(&rq->rt.pushable_tasks, &rq->lock); #endif rt_rq->rt_time = 0; -- cgit v1.2.2 From 398a153b16b09a68739928d4502455db9725ac86 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Wed, 14 Jan 2009 09:10:04 -0500 Subject: sched: fix build error in kernel/sched_rt.c when RT_GROUP_SCHED && !SMP Ingo found a build error in the scheduler when RT_GROUP_SCHED was enabled, but SMP was not. This patch rearranges the code such that it is a little more streamlined and compiles under all permutations of SMP, UP and RT_GROUP_SCHED. It was boot tested on my 4-way x86_64 and it still passes preempt-test. Signed-off-by: Gregory Haskins --- kernel/sched.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index dd1a1466c1e6..2b703f1fac3a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -466,7 +466,9 @@ struct rt_rq { #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED struct { int curr; /* highest queued rt task prio */ +#ifdef CONFIG_SMP int next; /* next highest */ +#endif } highest_prio; #endif #ifdef CONFIG_SMP @@ -8267,8 +8269,10 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED rt_rq->highest_prio.curr = MAX_RT_PRIO; +#ifdef CONFIG_SMP rt_rq->highest_prio.next = MAX_RT_PRIO; #endif +#endif #ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; -- cgit v1.2.2 From 41719b03091911028116155deddc5eedf8c45e37 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Jan 2009 15:36:26 +0100 Subject: mutex: preemption fixes The problem is that dropping the spinlock right before schedule is a voluntary preemption point and can cause a schedule, right after which we schedule again. Fix this inefficiency by keeping preemption disabled until we schedule, do this by explicity disabling preemption and providing a schedule() variant that assumes preemption is already disabled. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 8be2c13b50d0..b001c133c359 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4538,15 +4538,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev) /* * schedule() is the main scheduler function. */ -asmlinkage void __sched schedule(void) +asmlinkage void __sched __schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; int cpu; -need_resched: - preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rcu_qsctr_inc(cpu); @@ -4603,7 +4601,13 @@ need_resched_nonpreemptible: if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; +} +asmlinkage void __sched schedule(void) +{ +need_resched: + preempt_disable(); + __schedule(); preempt_enable_no_resched(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; -- cgit v1.2.2 From 0d66bf6d3514b35eb6897629059443132992dbd7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 12 Jan 2009 14:01:47 +0100 Subject: mutex: implement adaptive spinning Change mutex contention behaviour such that it will sometimes busy wait on acquisition - moving its behaviour closer to that of spinlocks. This concept got ported to mainline from the -rt tree, where it was originally implemented for rtmutexes by Steven Rostedt, based on work by Gregory Haskins. Testing with Ingo's test-mutex application (http://lkml.org/lkml/2006/1/8/50) gave a 345% boost for VFS scalability on my testbox: # ./test-mutex-shm V 16 10 | grep "^avg ops" avg ops/sec: 296604 # ./test-mutex-shm V 16 10 | grep "^avg ops" avg ops/sec: 85870 The key criteria for the busy wait is that the lock owner has to be running on a (different) cpu. The idea is that as long as the owner is running, there is a fair chance it'll release the lock soon, and thus we'll be better off spinning instead of blocking/scheduling. Since regular mutexes (as opposed to rtmutexes) do not atomically track the owner, we add the owner in a non-atomic fashion and deal with the races in the slowpath. Furthermore, to ease the testing of the performance impact of this new code, there is means to disable this behaviour runtime (without having to reboot the system), when scheduler debugging is enabled (CONFIG_SCHED_DEBUG=y), by issuing the following command: # echo NO_OWNER_SPIN > /debug/sched_features This command re-enables spinning again (this is also the default): # echo OWNER_SPIN > /debug/sched_features Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index b001c133c359..589e7308c615 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4614,6 +4614,67 @@ need_resched: } EXPORT_SYMBOL(schedule); +#ifdef CONFIG_SMP +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) +{ + unsigned int cpu; + struct rq *rq; + + if (!sched_feat(OWNER_SPIN)) + return 0; + +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * Need to access the cpu field knowing that + * DEBUG_PAGEALLOC could have unmapped it if + * the mutex owner just released it and exited. + */ + if (probe_kernel_address(&owner->cpu, cpu)) + goto out; +#else + cpu = owner->cpu; +#endif + + /* + * Even if the access succeeded (likely case), + * the cpu field may no longer be valid. + */ + if (cpu >= nr_cpumask_bits) + goto out; + + /* + * We need to validate that we can do a + * get_cpu() and that we have the percpu area. + */ + if (!cpu_online(cpu)) + goto out; + + rq = cpu_rq(cpu); + + for (;;) { + /* + * Owner changed, break to re-assess state. + */ + if (lock->owner != owner) + break; + + /* + * Is that owner really running on that cpu? + */ + if (task_thread_info(rq->curr) != owner || need_resched()) + return 0; + + cpu_relax(); + } +out: + return 1; +} +#endif + #ifdef CONFIG_PREEMPT /* * this is the entry point to schedule() from in-kernel preemption -- cgit v1.2.2 From 831451ac4e44d3a20b581ce726ef1d1144373f7d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Jan 2009 12:39:18 +0100 Subject: sched: introduce avg_wakeup Introduce a new avg_wakeup statistic. avg_wakeup is a measure of how frequently a task wakes up other tasks, it represents the average time between wakeups, with a limit of avg_runtime for when it doesn't wake up anybody. Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 8be2c13b50d0..86f5a063f0b9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1705,6 +1705,9 @@ static void update_avg(u64 *avg, u64 sample) static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) { + if (wakeup) + p->se.start_runtime = p->se.sum_exec_runtime; + sched_info_queued(p); p->sched_class->enqueue_task(rq, p, wakeup); p->se.on_rq = 1; @@ -1712,10 +1715,15 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) { - if (sleep && p->se.last_wakeup) { - update_avg(&p->se.avg_overlap, - p->se.sum_exec_runtime - p->se.last_wakeup); - p->se.last_wakeup = 0; + if (sleep) { + if (p->se.last_wakeup) { + update_avg(&p->se.avg_overlap, + p->se.sum_exec_runtime - p->se.last_wakeup); + p->se.last_wakeup = 0; + } else { + update_avg(&p->se.avg_wakeup, + sysctl_sched_wakeup_granularity); + } } sched_info_dequeued(p); @@ -2345,6 +2353,22 @@ out_activate: activate_task(rq, p, 1); success = 1; + /* + * Only attribute actual wakeups done by this task. + */ + if (!in_interrupt()) { + struct sched_entity *se = ¤t->se; + u64 sample = se->sum_exec_runtime; + + if (se->last_wakeup) + sample -= se->last_wakeup; + else + sample -= se->start_runtime; + update_avg(&se->avg_wakeup, sample); + + se->last_wakeup = se->sum_exec_runtime; + } + out_running: trace_sched_wakeup(rq, p, success); check_preempt_curr(rq, p, sync); @@ -2355,8 +2379,6 @@ out_running: p->sched_class->task_wake_up(rq, p); #endif out: - current->se.last_wakeup = current->se.sum_exec_runtime; - task_rq_unlock(rq, &flags); return success; @@ -2386,6 +2408,8 @@ static void __sched_fork(struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.last_wakeup = 0; p->se.avg_overlap = 0; + p->se.start_runtime = 0; + p->se.avg_wakeup = sysctl_sched_wakeup_granularity; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; -- cgit v1.2.2 From 7e49fcce1bdadd723ae6a0b3b324c4daced61563 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 Jan 2009 19:01:40 -0500 Subject: trace, lockdep: manual preempt count adding for local_bh_disable Impact: fix to preempt trace triggering lockdep check_flag failure In local_bh_disable, the use of add_preempt_count causes the preempt tracer to start recording the time preemption is off. But because it already modified the preempt_count to show softirqs disabled, and before it called the lockdep code to handle this, it causes a state that lockdep can not handle. The preempt tracer will reset the ring buffer on start of a trace, and the ring buffer reset code does a spin_lock_irqsave. This calls into lockdep and lockdep will fail when it detects the invalid state of having softirqs disabled but the internal current->softirqs_enabled is still set. The fix is to manually add the SOFTIRQ_OFFSET to preempt count and call the preempt tracer code outside the lockdep critical area. Thanks to Peter Zijlstra for suggesting this solution. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 52bbf1c842a8..c154825ae753 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4399,10 +4399,7 @@ void scheduler_tick(void) #endif } -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ - defined(CONFIG_PREEMPT_TRACER)) - -static inline unsigned long get_parent_ip(unsigned long addr) +unsigned long get_parent_ip(unsigned long addr) { if (in_lock_functions(addr)) { addr = CALLER_ADDR2; @@ -4412,6 +4409,9 @@ static inline unsigned long get_parent_ip(unsigned long addr) return addr; } +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) + void __kprobes add_preempt_count(int val) { #ifdef CONFIG_DEBUG_PREEMPT -- cgit v1.2.2 From a0a522ce3d6d8c907e45d4f2730ee8573484cc88 Mon Sep 17 00:00:00 2001 From: Henrik Austad Date: Fri, 13 Feb 2009 20:35:45 +0100 Subject: sched: idle_at_tick is only used when CONFIG_SMP is set Impact: struct rq size optimization The idle_at_tick in struct rq is only used in SMP settings and it does not make sense to have this in the rq in an UP setup. Signed-off-by: Henrik Austad Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 5faf5d482fcd..648154cf1117 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -555,7 +555,6 @@ struct rq { unsigned long nr_running; #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; - unsigned char idle_at_tick; #ifdef CONFIG_NO_HZ unsigned long last_tick_seen; unsigned char in_nohz_recently; @@ -596,6 +595,7 @@ struct rq { struct root_domain *rd; struct sched_domain *sd; + unsigned char idle_at_tick; /* For active balancing */ int active_balance; int push_cpu; -- cgit v1.2.2 From 2b8f836fb196acede88b6cc772e9057e0a9c0223 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9rico=20Wang?= Date: Mon, 16 Feb 2009 18:54:21 +0800 Subject: sched: use TASK_NICE for task_struct #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) So it's better to use TASK_NICE here. Signed-off-by: WANG Cong Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 648154cf1117..5475d56a20f1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5236,7 +5236,7 @@ SYSCALL_DEFINE1(nice, int, increment) if (increment > 40) increment = 40; - nice = PRIO_TO_NICE(current->static_prio) + increment; + nice = TASK_NICE(current) + increment; if (nice < -20) nice = -20; if (nice > 19) -- cgit v1.2.2 From b36128c830a8f5bd7d4981f5b0b69950f5928ee6 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: alloc_percpu: change percpu_ptr to per_cpu_ptr Impact: cleanup There are two allocated per-cpu accessor macros with almost identical spelling. The original and far more popular is per_cpu_ptr (44 files), so change over the other 4 files. tj: kill percpu_ptr() and update UP too Signed-off-by: Rusty Russell Cc: mingo@redhat.com Cc: lenb@kernel.org Cc: cpufreq@vger.kernel.org Signed-off-by: Tejun Heo --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index fc17fd91ab57..9d30ac956328 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9472,7 +9472,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); u64 data; #ifndef CONFIG_64BIT @@ -9491,7 +9491,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); #ifndef CONFIG_64BIT /* @@ -9587,7 +9587,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) ca = task_ca(tsk); for (; ca; ca = ca->parent) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); *cpuusage += cputime; } } -- cgit v1.2.2 From 6e2756376c706e4da3454a272947983f92e80a7e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Feb 2009 13:59:48 +0100 Subject: generic-ipi: remove CSD_FLAG_WAIT Oleg noticed that we don't strictly need CSD_FLAG_WAIT, rework the code so that we can use CSD_FLAG_LOCK for both purposes. Signed-off-by: Peter Zijlstra Cc: Oleg Nesterov Cc: Linus Torvalds Cc: Nick Piggin Cc: Jens Axboe Cc: "Paul E. McKenney" Cc: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 410eec404133..d4c2749a2998 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1093,7 +1093,7 @@ static void hrtick_start(struct rq *rq, u64 delay) if (rq == this_rq()) { hrtimer_restart(timer); } else if (!rq->hrtick_csd_pending) { - __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); + __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); rq->hrtick_csd_pending = 1; } } -- cgit v1.2.2 From c40c6f85a7594ad842233885386a0ca4cd40eafe Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 26 Feb 2009 15:40:15 +0800 Subject: cpuacct: add a branch prediction cpuacct_charge() is in fast-path, and checking of !cpuacct_susys.active always returns false after cpuacct has been initialized at system boot. Signed-off-by: Li Zefan Cc: Peter Zijlstra Cc: Paul Menage Cc: Balbir Singh Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 5475d56a20f1..8e63ffb6ed05 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9684,7 +9684,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) struct cpuacct *ca; int cpu; - if (!cpuacct_subsys.active) + if (unlikely(!cpuacct_subsys.active)) return; cpu = task_cpu(tsk); -- cgit v1.2.2 From b67802ea8061393f7bd2d4db934646e76096027c Mon Sep 17 00:00:00 2001 From: Wang Chen Date: Mon, 2 Mar 2009 13:55:26 +0800 Subject: sched: kill unused parameter of pick_next_task() Impact: micro-optimization Parameter "prev" is not used really. Signed-off-by: Wang Chen Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index dfae1bf6d5b2..9fe8e17574af 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4603,7 +4603,7 @@ static inline void schedule_debug(struct task_struct *prev) * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev) +pick_next_task(struct rq *rq) { const struct sched_class *class; struct task_struct *p; @@ -4678,7 +4678,7 @@ need_resched_nonpreemptible: idle_balance(cpu, rq); prev->sched_class->put_prev_task(rq, prev); - next = pick_next_task(rq, prev); + next = pick_next_task(rq); if (likely(prev != next)) { sched_info_switch(prev, next); @@ -6514,7 +6514,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu) if (!rq->nr_running) break; update_rq_clock(rq); - next = pick_next_task(rq, rq->curr); + next = pick_next_task(rq); if (!next) break; next->sched_class->put_prev_task(rq, next); -- cgit v1.2.2 From 8a0be9ef8225638d26b455788f988c8f84ce9e75 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 5 Mar 2009 01:27:02 +0100 Subject: sched: don't rebalance if attached on NULL domain Impact: fix function graph trace hang / drop pointless softirq on UP While debugging a function graph trace hang on an old PII, I saw that it consumed most of its time on the timer interrupt. And the domain rebalancing softirq was the most concerned. The timer interrupt calls trigger_load_balance() which will decide if it is worth to schedule a rebalancing softirq. In case of builtin UP kernel, no problem arises because there is no domain question. In case of builtin SMP kernel running on an SMP box, still no problem, the softirq will be raised each time we reach the next_balance time. In case of builtin SMP kernel running on a UP box (most distros provide default SMP kernels, whatever the box you have), then the CPU is attached to the NULL sched domain. So a kind of unexpected behaviour happen: trigger_load_balance() -> raises the rebalancing softirq later on softirq: run_rebalance_domains() -> rebalance_domains() where the for_each_domain(cpu, sd) is not taken because of the NULL domain we are attached at. Which means rq->next_balance is never updated. So on the next timer tick, we will enter trigger_load_balance() which will always reschedule() the rebalacing softirq: if (time_after_eq(jiffies, rq->next_balance)) raise_softirq(SCHED_SOFTIRQ); So for each tick, we process this pointless softirq. This patch fixes it by checking if we are attached to the null domain before raising the softirq, another possible fix would be to set the maximal possible JIFFIES value to rq->next_balance if we are attached to the NULL domain. v2: build fix on UP Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Peter Zijlstra LKML-Reference: <49af242d.1c07d00a.32d5.ffffc019@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index dfae1bf6d5b2..e509dbd7d77f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4148,6 +4148,11 @@ static void run_rebalance_domains(struct softirq_action *h) #endif } +static inline int on_null_domain(int cpu) +{ + return !rcu_dereference(cpu_rq(cpu)->sd); +} + /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. * @@ -4205,7 +4210,9 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) cpumask_test_cpu(cpu, nohz.cpu_mask)) return; #endif - if (time_after_eq(jiffies, rq->next_balance)) + /* Don't need to rebalance while attached to NULL domain */ + if (time_after_eq(jiffies, rq->next_balance) && + likely(!on_null_domain(cpu))) raise_softirq(SCHED_SOFTIRQ); } -- cgit v1.2.2 From 5ed0cec0ac5f1b3759bdbe4d9df32ee4ff8afb5a Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 6 Mar 2009 19:40:20 +0800 Subject: sched: TIF_NEED_RESCHED -> need_reshed() cleanup Impact: cleanup Use test_tsk_need_resched(), set_tsk_need_resched(), need_resched() instead of using TIF_NEED_RESCHED. Signed-off-by: Lai Jiangshan Cc: Peter Zijlstra LKML-Reference: <49B10BA4.9070209@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 8b92f40c147d..e0fa739a441b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1189,10 +1189,10 @@ static void resched_task(struct task_struct *p) assert_spin_locked(&task_rq(p)->lock); - if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) + if (test_tsk_need_resched(p)) return; - set_tsk_thread_flag(p, TIF_NEED_RESCHED); + set_tsk_need_resched(p); cpu = task_cpu(p); if (cpu == smp_processor_id()) @@ -1248,7 +1248,7 @@ void wake_up_idle_cpu(int cpu) * lockless. The worst case is that the other CPU runs the * idle task through an additional NOOP schedule() */ - set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); + set_tsk_need_resched(rq->idle); /* NEED_RESCHED must be visible before we test polling */ smp_mb(); @@ -4740,7 +4740,7 @@ asmlinkage void __sched preempt_schedule(void) * between schedule and now. */ barrier(); - } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); + } while (need_resched()); } EXPORT_SYMBOL(preempt_schedule); @@ -4769,7 +4769,7 @@ asmlinkage void __sched preempt_schedule_irq(void) * between schedule and now. */ barrier(); - } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); + } while (need_resched()); } #endif /* CONFIG_PREEMPT */ -- cgit v1.2.2 From 57310a98a354e84279d7c8af2f48805a62372e53 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 Mar 2009 13:56:21 +0100 Subject: sched: optimize ttwu vs group scheduling Impact: micro-optimization We can avoid the sched domain walk on try_to_wake_up() when we know there are no groups. Signed-off-by: Peter Zijlstra LKML-Reference: <1236603381.8389.455.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index e0fa739a441b..af5cd1b2d03e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -331,6 +331,13 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; */ static DEFINE_SPINLOCK(task_group_lock); +#ifdef CONFIG_SMP +static int root_task_group_empty(void) +{ + return list_empty(&root_task_group.children); +} +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_USER_SCHED # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) @@ -391,6 +398,13 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #else +#ifdef CONFIG_SMP +static int root_task_group_empty(void) +{ + return 1; +} +#endif + static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } static inline struct task_group *task_group(struct task_struct *p) { @@ -2318,7 +2332,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) sync = 0; #ifdef CONFIG_SMP - if (sched_feat(LB_WAKEUP_UPDATE)) { + if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) { struct sched_domain *sd; this_cpu = raw_smp_processor_id(); -- cgit v1.2.2 From df1c99d416500da8d26a4d78777467c53ee7689e Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 10 Mar 2009 19:08:11 +0100 Subject: sched: add avg_overlap decay Impact: more precise avg_overlap metric - better load-balancing avg_overlap is used to measure the runtime overlap of the waker and wakee. However, when a process changes behaviour, eg a pipe becomes un-congested and we don't need to go to sleep after a wakeup for a while, the avg_overlap value grows stale. When running we use the avg runtime between preemption as a measure for avg_overlap since the amount of runtime can be correlated to cache footprint. The longer we run, the less likely we'll be wanting to be migrated to another CPU. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1236709131.25234.576.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index af5cd1b2d03e..2f28351892c9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4620,6 +4620,28 @@ static inline void schedule_debug(struct task_struct *prev) #endif } +static void put_prev_task(struct rq *rq, struct task_struct *prev) +{ + if (prev->state == TASK_RUNNING) { + u64 runtime = prev->se.sum_exec_runtime; + + runtime -= prev->se.prev_sum_exec_runtime; + runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); + + /* + * In order to avoid avg_overlap growing stale when we are + * indeed overlapping and hence not getting put to sleep, grow + * the avg_overlap on preemption. + * + * We use the average preemption runtime because that + * correlates to the amount of cache footprint a task can + * build up. + */ + update_avg(&prev->se.avg_overlap, runtime); + } + prev->sched_class->put_prev_task(rq, prev); +} + /* * Pick up the highest-prio task: */ @@ -4698,7 +4720,7 @@ need_resched_nonpreemptible: if (unlikely(!rq->nr_running)) idle_balance(cpu, rq); - prev->sched_class->put_prev_task(rq, prev); + put_prev_task(rq, prev); next = pick_next_task(rq); if (likely(prev != next)) { -- cgit v1.2.2 From c69fc56de1df5769f2ec69c915c7ad5afe63804c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 14:49:46 +1030 Subject: cpumask: use topology_core_cpumask/topology_thread_cpumask instead of cpu_core_map/cpu_sibling_map Impact: cleanup This is presumably what those definitions are for, and while all archs define cpu_core_map/cpu_sibling map, that's changing (eg. x86 wants to change it to a pointer). Signed-off-by: Rusty Russell --- kernel/sched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 0a76d0b6f215..5dabd80c3c15 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7249,7 +7249,7 @@ cpu_to_core_group(int cpu, const struct cpumask *cpu_map, { int group; - cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); group = cpumask_first(mask); if (sg) *sg = &per_cpu(sched_group_core, group).sg; @@ -7278,7 +7278,7 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) - cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); group = cpumask_first(mask); #else group = cpu; @@ -7621,7 +7621,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), - &per_cpu(cpu_sibling_map, i), cpu_map); + topology_thread_cpumask(i), cpu_map); sd->parent = p; p->child = sd; cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); @@ -7632,7 +7632,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, /* Set up CPU (sibling) groups */ for_each_cpu(i, cpu_map) { cpumask_and(this_sibling_map, - &per_cpu(cpu_sibling_map, i), cpu_map); + topology_thread_cpumask(i), cpu_map); if (i != cpumask_first(this_sibling_map)) continue; -- cgit v1.2.2 From 80dd99b368cf6501be88ab517bbbb5bf352b75b8 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Mon, 16 Mar 2009 19:58:09 +0000 Subject: sched: fix typos in documentation Fixed typos in function documentation. Signed-off-by: Luis Henriques LKML-Reference: <20090316195809.GA6073@hades.domain.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 2f28351892c9..489e7d926408 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2082,7 +2082,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * it must be off the runqueue _entirely_, and not * preempted! * - * So if it wa still runnable (but just not actively + * So if it was still runnable (but just not actively * running right now), it's preempted, and we should * yield - it could be a while. */ @@ -2574,7 +2574,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) #ifdef CONFIG_PREEMPT_NOTIFIERS /** - * preempt_notifier_register - tell me when current is being being preempted & rescheduled + * preempt_notifier_register - tell me when current is being preempted & rescheduled * @notifier: notifier struct to register */ void preempt_notifier_register(struct preempt_notifier *notifier) -- cgit v1.2.2 From 708dc5125309cd33c5daaad3026cc4ae6ef39c8b Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Mon, 16 Mar 2009 19:59:02 +0000 Subject: sched: small optimisation of can_migrate_task() There were 3 invocations of task_hot() in can_migrate_task(). Replace these 3 invocations by only one invocation, cached in a local variable. Signed-off-by: Luis Henriques LKML-Reference: <20090316195902.GA6197@hades.domain.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 489e7d926408..d2dfe4c1a225 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3002,6 +3002,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned) { + int tsk_cache_hot = 0; /* * We do not migrate tasks that are: * 1) running (obviously), or @@ -3025,10 +3026,11 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) too many balance attempts have failed. */ - if (!task_hot(p, rq->clock, sd) || - sd->nr_balance_failed > sd->cache_nice_tries) { + tsk_cache_hot = task_hot(p, rq->clock, sd); + if (!tsk_cache_hot || + sd->nr_balance_failed > sd->cache_nice_tries) { #ifdef CONFIG_SCHEDSTATS - if (task_hot(p, rq->clock, sd)) { + if (tsk_cache_hot) { schedstat_inc(sd, lb_hot_gained[idle]); schedstat_inc(p, se.nr_forced_migrations); } @@ -3036,7 +3038,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, return 1; } - if (task_hot(p, rq->clock, sd)) { + if (tsk_cache_hot) { schedstat_inc(p, se.nr_failed_migrations_hot); return 0; } -- cgit v1.2.2 From df7c8e845e8e2030e8ae947e0ace56d184d0e9a0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 19 Mar 2009 15:22:20 +1030 Subject: cpumask: remove cpumask allocation from idle_balance Impact: fix circular locking Steven reports a circular locking from alloc_cpumask_var doing a wakeup. We get rid of this using the tried-and-true technique of using a per-cpu cpumask_var_t rather than doing an alloc every time. Simpler and more robust than a rare, implicit allocation within an atomic codepath. Reported-by: Steven Rostedt Signed-off-by: Rusty Russell LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 5dabd80c3c15..48862d418bed 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3448,19 +3448,23 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, */ #define MAX_PINNED_INTERVAL 512 +/* Working cpumask for load_balance and load_balance_newidle. */ +static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); + /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, - int *balance, struct cpumask *cpus) + int *balance) { int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; unsigned long flags; + struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); cpumask_setall(cpus); @@ -3615,8 +3619,7 @@ out: * this_rq is locked. */ static int -load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, - struct cpumask *cpus) +load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) { struct sched_group *group; struct rq *busiest = NULL; @@ -3624,6 +3627,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, int ld_moved = 0; int sd_idle = 0; int all_pinned = 0; + struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); cpumask_setall(cpus); @@ -3764,10 +3768,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq) struct sched_domain *sd; int pulled_task = 0; unsigned long next_balance = jiffies + HZ; - cpumask_var_t tmpmask; - - if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC)) - return; for_each_domain(this_cpu, sd) { unsigned long interval; @@ -3778,7 +3778,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) if (sd->flags & SD_BALANCE_NEWIDLE) /* If we've pulled tasks over stop searching: */ pulled_task = load_balance_newidle(this_cpu, this_rq, - sd, tmpmask); + sd); interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) @@ -3793,7 +3793,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq) */ this_rq->next_balance = next_balance; } - free_cpumask_var(tmpmask); } /* @@ -3943,11 +3942,6 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; int need_serialize; - cpumask_var_t tmp; - - /* Fails alloc? Rebalancing probably not a priority right now. */ - if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) - return; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -3972,7 +3966,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance, tmp)) { + if (load_balance(cpu, rq, sd, idle, &balance)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -4006,8 +4000,6 @@ out: */ if (likely(update_next_balance)) rq->next_balance = next_balance; - - free_cpumask_var(tmp); } /* @@ -8303,6 +8295,9 @@ void __init sched_init(void) #endif #ifdef CONFIG_USER_SCHED alloc_size *= 2; +#endif +#ifdef CONFIG_CPUMASK_OFFSTACK + alloc_size *= num_possible_cpus() * cpumask_size(); #endif /* * As sched_init() is called before page_alloc is setup, @@ -8341,6 +8336,12 @@ void __init sched_init(void) ptr += nr_cpu_ids * sizeof(void **); #endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_CPUMASK_OFFSTACK + for_each_possible_cpu(i) { + per_cpu(load_balance_tmpmask, i) = (void *)ptr; + ptr += cpumask_size(); + } +#endif /* CONFIG_CPUMASK_OFFSTACK */ } #ifdef CONFIG_SMP -- cgit v1.2.2 From 8c083f081d0014057901c68a0a3e0f8ca7ac8d23 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 19 Mar 2009 15:22:20 +1030 Subject: cpumask: remove cpumask allocation from idle_balance, fix Impact: fix boot crash Fix typo in the size calculation. Reported-by: Ingo Molnar Signed-off-by: Rusty Russell LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 48862d418bed..11dd52780adb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8297,7 +8297,7 @@ void __init sched_init(void) alloc_size *= 2; #endif #ifdef CONFIG_CPUMASK_OFFSTACK - alloc_size *= num_possible_cpus() * cpumask_size(); + alloc_size += num_possible_cpus() * cpumask_size(); #endif /* * As sched_init() is called before page_alloc is setup, -- cgit v1.2.2 From 67aa0f767af488a7f1e41cccb4f7a4893f24a1ab Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Tue, 24 Mar 2009 22:10:02 +0000 Subject: sched: remove unused fields from struct rq Impact: cleanup, new schedstat ABI Since they are used on in statistics and are always set to zero, the following fields from struct rq have been removed: yld_exp_empty, yld_act_empty and yld_both_empty. Both Sched Debug and SCHEDSTAT_VERSION versions has also been incremented since ABIs have been changed. The schedtop tool has been updated to properly handle new version of schedstat: http://rt.wiki.kernel.org/index.php/Schedtop_utility Signed-off-by: Luis Henriques Acked-by: Gregory Haskins Acked-by: Peter Zijlstra LKML-Reference: <20090324221002.GA10061@hades.domain.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index d2dfe4c1a225..7b389c74f8ff 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -638,9 +638,6 @@ struct rq { /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ /* sys_sched_yield() stats */ - unsigned int yld_exp_empty; - unsigned int yld_act_empty; - unsigned int yld_both_empty; unsigned int yld_count; /* schedule() stats */ -- cgit v1.2.2 From 67bb6c036d1fc3d332c8527a36a546e3e72e822c Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:43:35 +0530 Subject: sched: Simple helper functions for find_busiest_group() Impact: cleanup Currently the load idx calculation code is in find_busiest_group(). Move that to a static inline helper function. Similary, to find the first cpu of a sched_group we use cpumask_first(sched_group_cpus(group)) Use a helper to that. It improves readability in some cases. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao Cc: "Vaidyanathan Srinivasan" LKML-Reference: <20090325091335.13992.55424.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 55 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 12 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 7b389c74f8ff..6aec1e7a72a3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3189,6 +3189,43 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } +/********** Helpers for find_busiest_group ************************/ + +/** + * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. + * @group: The group whose first cpu is to be returned. + */ +static inline unsigned int group_first_cpu(struct sched_group *group) +{ + return cpumask_first(sched_group_cpus(group)); +} + +/** + * get_sd_load_idx - Obtain the load index for a given sched domain. + * @sd: The sched_domain whose load_idx is to be obtained. + * @idle: The Idle status of the CPU for whose sd load_icx is obtained. + */ +static inline int get_sd_load_idx(struct sched_domain *sd, + enum cpu_idle_type idle) +{ + int load_idx; + + switch (idle) { + case CPU_NOT_IDLE: + load_idx = sd->busy_idx; + break; + + case CPU_NEWLY_IDLE: + load_idx = sd->newidle_idx; + break; + default: + load_idx = sd->idle_idx; + break; + } + + return load_idx; +} +/******* find_busiest_group() helpers end here *********************/ /* * find_busiest_group finds and returns the busiest CPU group within the @@ -3217,12 +3254,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, busiest_load_per_task = busiest_nr_running = 0; this_load_per_task = this_nr_running = 0; - if (idle == CPU_NOT_IDLE) - load_idx = sd->busy_idx; - else if (idle == CPU_NEWLY_IDLE) - load_idx = sd->newidle_idx; - else - load_idx = sd->idle_idx; + load_idx = get_sd_load_idx(sd, idle); do { unsigned long load, group_capacity, max_cpu_load, min_cpu_load; @@ -3238,7 +3270,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, sched_group_cpus(group)); if (local_group) - balance_cpu = cpumask_first(sched_group_cpus(group)); + balance_cpu = group_first_cpu(group); /* Tally up the load of all CPUs in the group */ sum_weighted_load = sum_nr_running = avg_load = 0; @@ -3359,8 +3391,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, */ if ((sum_nr_running < min_nr_running) || (sum_nr_running == min_nr_running && - cpumask_first(sched_group_cpus(group)) > - cpumask_first(sched_group_cpus(group_min)))) { + group_first_cpu(group) > group_first_cpu(group_min))) { group_min = group; min_nr_running = sum_nr_running; min_load_per_task = sum_weighted_load / @@ -3375,8 +3406,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sum_nr_running <= group_capacity - 1) { if (sum_nr_running > leader_nr_running || (sum_nr_running == leader_nr_running && - cpumask_first(sched_group_cpus(group)) < - cpumask_first(sched_group_cpus(group_leader)))) { + group_first_cpu(group) < + group_first_cpu(group_leader))) { group_leader = group; leader_nr_running = sum_nr_running; } @@ -3504,7 +3535,7 @@ out_balanced: *imbalance = min_load_per_task; if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = - cpumask_first(sched_group_cpus(group_leader)); + group_first_cpu(group_leader); } return group_min; } -- cgit v1.2.2 From 6dfdb0629019f307ab18864b1fd3e5dbb02f383c Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:43:40 +0530 Subject: sched: Fix indentations in find_busiest_group() using gotos Impact: cleanup Some indentations in find_busiest_group() can minimized by using early exits with the help of gotos. This improves readability in a couple of cases. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao Cc: "Vaidyanathan Srinivasan" LKML-Reference: <20090325091340.13992.45062.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 6aec1e7a72a3..f87adbe999e0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3403,14 +3403,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * capacity but still has some space to pick up some load * from other group and save more power */ - if (sum_nr_running <= group_capacity - 1) { - if (sum_nr_running > leader_nr_running || - (sum_nr_running == leader_nr_running && - group_first_cpu(group) < - group_first_cpu(group_leader))) { - group_leader = group; - leader_nr_running = sum_nr_running; - } + if (sum_nr_running > group_capacity - 1) + goto group_next; + + if (sum_nr_running > leader_nr_running || + (sum_nr_running == leader_nr_running && + group_first_cpu(group) < group_first_cpu(group_leader))) { + group_leader = group; + leader_nr_running = sum_nr_running; } group_next: #endif @@ -3531,14 +3531,16 @@ out_balanced: if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) goto ret; - if (this == group_leader && group_leader != group_min) { - *imbalance = min_load_per_task; - if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { - cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = - group_first_cpu(group_leader); - } - return group_min; + if (this != group_leader || group_leader == group_min) + goto ret; + + *imbalance = min_load_per_task; + if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { + cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = + group_first_cpu(group_leader); } + return group_min; + #endif ret: *imbalance = 0; -- cgit v1.2.2 From 381be78fdc829a22f6327a0ed09f54b6270a976d Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:43:46 +0530 Subject: sched: Define structure to store the sched_group statistics for fbg() Impact: cleanup Currently a whole bunch of variables are used to store the various statistics pertaining to the groups we iterate over in find_busiest_group(). Group them together in a single data structure and add appropriate comments. This will be useful later on when we create helper functions to calculate the sched_group statistics. Credit: Vaidyanathan Srinivasan Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao LKML-Reference: <20090325091345.13992.20099.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 79 ++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 46 insertions(+), 33 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f87adbe999e0..109db122de50 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3191,6 +3191,18 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, } /********** Helpers for find_busiest_group ************************/ +/** + * sg_lb_stats - stats of a sched_group required for load_balancing + */ +struct sg_lb_stats { + unsigned long avg_load; /*Avg load across the CPUs of the group */ + unsigned long group_load; /* Total load over the CPUs of the group */ + unsigned long sum_nr_running; /* Nr tasks running in the group */ + unsigned long sum_weighted_load; /* Weighted load of group's tasks */ + unsigned long group_capacity; + int group_imb; /* Is there an imbalance in the group ? */ +}; + /** * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. * @group: The group whose first cpu is to be returned. @@ -3257,23 +3269,22 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, load_idx = get_sd_load_idx(sd, idle); do { - unsigned long load, group_capacity, max_cpu_load, min_cpu_load; + struct sg_lb_stats sgs; + unsigned long load, max_cpu_load, min_cpu_load; int local_group; int i; - int __group_imb = 0; unsigned int balance_cpu = -1, first_idle_cpu = 0; - unsigned long sum_nr_running, sum_weighted_load; unsigned long sum_avg_load_per_task; unsigned long avg_load_per_task; local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); + memset(&sgs, 0, sizeof(sgs)); if (local_group) balance_cpu = group_first_cpu(group); /* Tally up the load of all CPUs in the group */ - sum_weighted_load = sum_nr_running = avg_load = 0; sum_avg_load_per_task = avg_load_per_task = 0; max_cpu_load = 0; @@ -3301,9 +3312,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, min_cpu_load = load; } - avg_load += load; - sum_nr_running += rq->nr_running; - sum_weighted_load += weighted_cpuload(i); + sgs.group_load += load; + sgs.sum_nr_running += rq->nr_running; + sgs.sum_weighted_load += weighted_cpuload(i); sum_avg_load_per_task += cpu_avg_load_per_task(i); } @@ -3320,12 +3331,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, goto ret; } - total_load += avg_load; + total_load += sgs.group_load; total_pwr += group->__cpu_power; /* Adjust by relative CPU power of the group */ - avg_load = sg_div_cpu_power(group, - avg_load * SCHED_LOAD_SCALE); + sgs.avg_load = sg_div_cpu_power(group, + sgs.group_load * SCHED_LOAD_SCALE); /* @@ -3341,22 +3352,23 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, sum_avg_load_per_task * SCHED_LOAD_SCALE); if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) - __group_imb = 1; + sgs.group_imb = 1; - group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; + sgs.group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; if (local_group) { - this_load = avg_load; + this_load = sgs.avg_load; this = group; - this_nr_running = sum_nr_running; - this_load_per_task = sum_weighted_load; - } else if (avg_load > max_load && - (sum_nr_running > group_capacity || __group_imb)) { - max_load = avg_load; + this_nr_running = sgs.sum_nr_running; + this_load_per_task = sgs.sum_weighted_load; + } else if (sgs.avg_load > max_load && + (sgs.sum_nr_running > sgs.group_capacity || + sgs.group_imb)) { + max_load = sgs.avg_load; busiest = group; - busiest_nr_running = sum_nr_running; - busiest_load_per_task = sum_weighted_load; - group_imb = __group_imb; + busiest_nr_running = sgs.sum_nr_running; + busiest_load_per_task = sgs.sum_weighted_load; + group_imb = sgs.group_imb; } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) @@ -3372,7 +3384,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * If the local group is idle or completely loaded * no need to do power savings balance at this domain */ - if (local_group && (this_nr_running >= group_capacity || + if (local_group && (this_nr_running >= sgs.group_capacity || !this_nr_running)) power_savings_balance = 0; @@ -3380,8 +3392,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * If a group is already running at full capacity or idle, * don't include that group in power savings calculations */ - if (!power_savings_balance || sum_nr_running >= group_capacity - || !sum_nr_running) + if (!power_savings_balance || + sgs.sum_nr_running >= sgs.group_capacity || + !sgs.sum_nr_running) goto group_next; /* @@ -3389,13 +3402,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * This is the group from where we need to pick up the load * for saving power */ - if ((sum_nr_running < min_nr_running) || - (sum_nr_running == min_nr_running && + if ((sgs.sum_nr_running < min_nr_running) || + (sgs.sum_nr_running == min_nr_running && group_first_cpu(group) > group_first_cpu(group_min))) { group_min = group; - min_nr_running = sum_nr_running; - min_load_per_task = sum_weighted_load / - sum_nr_running; + min_nr_running = sgs.sum_nr_running; + min_load_per_task = sgs.sum_weighted_load / + sgs.sum_nr_running; } /* @@ -3403,14 +3416,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * capacity but still has some space to pick up some load * from other group and save more power */ - if (sum_nr_running > group_capacity - 1) + if (sgs.sum_nr_running > sgs.group_capacity - 1) goto group_next; - if (sum_nr_running > leader_nr_running || - (sum_nr_running == leader_nr_running && + if (sgs.sum_nr_running > leader_nr_running || + (sgs.sum_nr_running == leader_nr_running && group_first_cpu(group) < group_first_cpu(group_leader))) { group_leader = group; - leader_nr_running = sum_nr_running; + leader_nr_running = sgs.sum_nr_running; } group_next: #endif -- cgit v1.2.2 From 1f8c553d0f11d85f7993fe21015695d266771c00 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:43:51 +0530 Subject: sched: Create a helper function to calculate sched_group stats for fbg() Impact: cleanup Create a helper function named update_sg_lb_stats() which can be invoked to calculate the individual group's statistics in find_busiest_group(). This reduces the lenght of find_busiest_group() considerably. Credit: Vaidyanathan Srinivasan Signed-off-by: Gautham R Shenoy Aked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao LKML-Reference: <20090325091351.13992.43461.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 175 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 100 insertions(+), 75 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 109db122de50..1893d5562f5f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3237,6 +3237,103 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } + + +/** + * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @group: sched_group whose statistics are to be updated. + * @this_cpu: Cpu for which load balance is currently performed. + * @idle: Idle status of this_cpu + * @load_idx: Load index of sched_domain of this_cpu for load calc. + * @sd_idle: Idle status of the sched_domain containing group. + * @local_group: Does group contain this_cpu. + * @cpus: Set of cpus considered for load balancing. + * @balance: Should we balance. + * @sgs: variable to hold the statistics for this group. + */ +static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, + enum cpu_idle_type idle, int load_idx, int *sd_idle, + int local_group, const struct cpumask *cpus, + int *balance, struct sg_lb_stats *sgs) +{ + unsigned long load, max_cpu_load, min_cpu_load; + int i; + unsigned int balance_cpu = -1, first_idle_cpu = 0; + unsigned long sum_avg_load_per_task; + unsigned long avg_load_per_task; + + if (local_group) + balance_cpu = group_first_cpu(group); + + /* Tally up the load of all CPUs in the group */ + sum_avg_load_per_task = avg_load_per_task = 0; + max_cpu_load = 0; + min_cpu_load = ~0UL; + + for_each_cpu_and(i, sched_group_cpus(group), cpus) { + struct rq *rq = cpu_rq(i); + + if (*sd_idle && rq->nr_running) + *sd_idle = 0; + + /* Bias balancing toward cpus of our domain */ + if (local_group) { + if (idle_cpu(i) && !first_idle_cpu) { + first_idle_cpu = 1; + balance_cpu = i; + } + + load = target_load(i, load_idx); + } else { + load = source_load(i, load_idx); + if (load > max_cpu_load) + max_cpu_load = load; + if (min_cpu_load > load) + min_cpu_load = load; + } + + sgs->group_load += load; + sgs->sum_nr_running += rq->nr_running; + sgs->sum_weighted_load += weighted_cpuload(i); + + sum_avg_load_per_task += cpu_avg_load_per_task(i); + } + + /* + * First idle cpu or the first cpu(busiest) in this sched group + * is eligible for doing load balancing at this and above + * domains. In the newly idle case, we will allow all the cpu's + * to do the newly idle load balance. + */ + if (idle != CPU_NEWLY_IDLE && local_group && + balance_cpu != this_cpu && balance) { + *balance = 0; + return; + } + + /* Adjust by relative CPU power of the group */ + sgs->avg_load = sg_div_cpu_power(group, + sgs->group_load * SCHED_LOAD_SCALE); + + + /* + * Consider the group unbalanced when the imbalance is larger + * than the average weight of two tasks. + * + * APZ: with cgroup the avg task weight can vary wildly and + * might not be a suitable number - should we keep a + * normalized nr_running number somewhere that negates + * the hierarchy? + */ + avg_load_per_task = sg_div_cpu_power(group, + sum_avg_load_per_task * SCHED_LOAD_SCALE); + + if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) + sgs->group_imb = 1; + + sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; + +} /******* find_busiest_group() helpers end here *********************/ /* @@ -3270,92 +3367,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, do { struct sg_lb_stats sgs; - unsigned long load, max_cpu_load, min_cpu_load; int local_group; - int i; - unsigned int balance_cpu = -1, first_idle_cpu = 0; - unsigned long sum_avg_load_per_task; - unsigned long avg_load_per_task; local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); memset(&sgs, 0, sizeof(sgs)); + update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, + local_group, cpus, balance, &sgs); - if (local_group) - balance_cpu = group_first_cpu(group); - - /* Tally up the load of all CPUs in the group */ - sum_avg_load_per_task = avg_load_per_task = 0; - - max_cpu_load = 0; - min_cpu_load = ~0UL; - - for_each_cpu_and(i, sched_group_cpus(group), cpus) { - struct rq *rq = cpu_rq(i); - - if (*sd_idle && rq->nr_running) - *sd_idle = 0; - - /* Bias balancing toward cpus of our domain */ - if (local_group) { - if (idle_cpu(i) && !first_idle_cpu) { - first_idle_cpu = 1; - balance_cpu = i; - } - - load = target_load(i, load_idx); - } else { - load = source_load(i, load_idx); - if (load > max_cpu_load) - max_cpu_load = load; - if (min_cpu_load > load) - min_cpu_load = load; - } - - sgs.group_load += load; - sgs.sum_nr_running += rq->nr_running; - sgs.sum_weighted_load += weighted_cpuload(i); - - sum_avg_load_per_task += cpu_avg_load_per_task(i); - } - - /* - * First idle cpu or the first cpu(busiest) in this sched group - * is eligible for doing load balancing at this and above - * domains. In the newly idle case, we will allow all the cpu's - * to do the newly idle load balance. - */ - if (idle != CPU_NEWLY_IDLE && local_group && - balance_cpu != this_cpu && balance) { - *balance = 0; + if (balance && !(*balance)) goto ret; - } total_load += sgs.group_load; total_pwr += group->__cpu_power; - /* Adjust by relative CPU power of the group */ - sgs.avg_load = sg_div_cpu_power(group, - sgs.group_load * SCHED_LOAD_SCALE); - - - /* - * Consider the group unbalanced when the imbalance is larger - * than the average weight of two tasks. - * - * APZ: with cgroup the avg task weight can vary wildly and - * might not be a suitable number - should we keep a - * normalized nr_running number somewhere that negates - * the hierarchy? - */ - avg_load_per_task = sg_div_cpu_power(group, - sum_avg_load_per_task * SCHED_LOAD_SCALE); - - if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) - sgs.group_imb = 1; - - sgs.group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; - if (local_group) { this_load = sgs.avg_load; this = group; -- cgit v1.2.2 From 222d656dea57e4e084fbd1e9383e6fed2ca9fa61 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:43:56 +0530 Subject: sched: Define structure to store the sched_domain statistics for fbg() Impact: cleanup Currently we use a lot of local variables in find_busiest_group() to capture the various statistics related to the sched_domain. Group them together into a single data structure. This will help us to offload the job of updating the sched_domain statistics to a helper function. Credit: Vaidyanathan Srinivasan Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao LKML-Reference: <20090325091356.13992.25970.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 207 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 121 insertions(+), 86 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 1893d5562f5f..8198dbe8e4aa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3190,6 +3190,37 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } /********** Helpers for find_busiest_group ************************/ +/** + * sd_lb_stats - Structure to store the statistics of a sched_domain + * during load balancing. + */ +struct sd_lb_stats { + struct sched_group *busiest; /* Busiest group in this sd */ + struct sched_group *this; /* Local group in this sd */ + unsigned long total_load; /* Total load of all groups in sd */ + unsigned long total_pwr; /* Total power of all groups in sd */ + unsigned long avg_load; /* Average load across all groups in sd */ + + /** Statistics of this group */ + unsigned long this_load; + unsigned long this_load_per_task; + unsigned long this_nr_running; + + /* Statistics of the busiest group */ + unsigned long max_load; + unsigned long busiest_load_per_task; + unsigned long busiest_nr_running; + + int group_imb; /* Is there imbalance in this sd */ +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + int power_savings_balance; /* Is powersave balance needed for this sd */ + struct sched_group *group_min; /* Least loaded group in sd */ + struct sched_group *group_leader; /* Group which relieves group_min */ + unsigned long min_load_per_task; /* load_per_task in group_min */ + unsigned long leader_nr_running; /* Nr running of group_leader */ + unsigned long min_nr_running; /* Nr running of group_min */ +#endif +}; /** * sg_lb_stats - stats of a sched_group required for load_balancing @@ -3346,23 +3377,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, int *sd_idle, const struct cpumask *cpus, int *balance) { - struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; - unsigned long max_load, avg_load, total_load, this_load, total_pwr; + struct sd_lb_stats sds; + struct sched_group *group = sd->groups; unsigned long max_pull; - unsigned long busiest_load_per_task, busiest_nr_running; - unsigned long this_load_per_task, this_nr_running; - int load_idx, group_imb = 0; + int load_idx; + + memset(&sds, 0, sizeof(sds)); #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - int power_savings_balance = 1; - unsigned long leader_nr_running = 0, min_load_per_task = 0; - unsigned long min_nr_running = ULONG_MAX; - struct sched_group *group_min = NULL, *group_leader = NULL; + sds.power_savings_balance = 1; + sds.min_nr_running = ULONG_MAX; #endif - - max_load = this_load = total_load = total_pwr = 0; - busiest_load_per_task = busiest_nr_running = 0; - this_load_per_task = this_nr_running = 0; - load_idx = get_sd_load_idx(sd, idle); do { @@ -3378,22 +3402,22 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (balance && !(*balance)) goto ret; - total_load += sgs.group_load; - total_pwr += group->__cpu_power; + sds.total_load += sgs.group_load; + sds.total_pwr += group->__cpu_power; if (local_group) { - this_load = sgs.avg_load; - this = group; - this_nr_running = sgs.sum_nr_running; - this_load_per_task = sgs.sum_weighted_load; - } else if (sgs.avg_load > max_load && + sds.this_load = sgs.avg_load; + sds.this = group; + sds.this_nr_running = sgs.sum_nr_running; + sds.this_load_per_task = sgs.sum_weighted_load; + } else if (sgs.avg_load > sds.max_load && (sgs.sum_nr_running > sgs.group_capacity || sgs.group_imb)) { - max_load = sgs.avg_load; - busiest = group; - busiest_nr_running = sgs.sum_nr_running; - busiest_load_per_task = sgs.sum_weighted_load; - group_imb = sgs.group_imb; + sds.max_load = sgs.avg_load; + sds.busiest = group; + sds.busiest_nr_running = sgs.sum_nr_running; + sds.busiest_load_per_task = sgs.sum_weighted_load; + sds.group_imb = sgs.group_imb; } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) @@ -3409,15 +3433,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * If the local group is idle or completely loaded * no need to do power savings balance at this domain */ - if (local_group && (this_nr_running >= sgs.group_capacity || - !this_nr_running)) - power_savings_balance = 0; + if (local_group && + (sds.this_nr_running >= sgs.group_capacity || + !sds.this_nr_running)) + sds.power_savings_balance = 0; /* * If a group is already running at full capacity or idle, * don't include that group in power savings calculations */ - if (!power_savings_balance || + if (!sds.power_savings_balance || sgs.sum_nr_running >= sgs.group_capacity || !sgs.sum_nr_running) goto group_next; @@ -3427,12 +3452,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * This is the group from where we need to pick up the load * for saving power */ - if ((sgs.sum_nr_running < min_nr_running) || - (sgs.sum_nr_running == min_nr_running && - group_first_cpu(group) > group_first_cpu(group_min))) { - group_min = group; - min_nr_running = sgs.sum_nr_running; - min_load_per_task = sgs.sum_weighted_load / + if ((sgs.sum_nr_running < sds.min_nr_running) || + (sgs.sum_nr_running == sds.min_nr_running && + group_first_cpu(group) > + group_first_cpu(sds.group_min))) { + sds.group_min = group; + sds.min_nr_running = sgs.sum_nr_running; + sds.min_load_per_task = sgs.sum_weighted_load / sgs.sum_nr_running; } @@ -3444,29 +3470,32 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sgs.sum_nr_running > sgs.group_capacity - 1) goto group_next; - if (sgs.sum_nr_running > leader_nr_running || - (sgs.sum_nr_running == leader_nr_running && - group_first_cpu(group) < group_first_cpu(group_leader))) { - group_leader = group; - leader_nr_running = sgs.sum_nr_running; + if (sgs.sum_nr_running > sds.leader_nr_running || + (sgs.sum_nr_running == sds.leader_nr_running && + group_first_cpu(group) < + group_first_cpu(sds.group_leader))) { + sds.group_leader = group; + sds.leader_nr_running = sgs.sum_nr_running; } group_next: #endif group = group->next; } while (group != sd->groups); - if (!busiest || this_load >= max_load || busiest_nr_running == 0) + if (!sds.busiest || sds.this_load >= sds.max_load + || sds.busiest_nr_running == 0) goto out_balanced; - avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; + sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; - if (this_load >= avg_load || - 100*max_load <= sd->imbalance_pct*this_load) + if (sds.this_load >= sds.avg_load || + 100*sds.max_load <= sd->imbalance_pct * sds.this_load) goto out_balanced; - busiest_load_per_task /= busiest_nr_running; - if (group_imb) - busiest_load_per_task = min(busiest_load_per_task, avg_load); + sds.busiest_load_per_task /= sds.busiest_nr_running; + if (sds.group_imb) + sds.busiest_load_per_task = + min(sds.busiest_load_per_task, sds.avg_load); /* * We're trying to get all the cpus to the average_load, so we don't @@ -3479,7 +3508,7 @@ group_next: * by pulling tasks to us. Be careful of negative numbers as they'll * appear as very large values with unsigned longs. */ - if (max_load <= busiest_load_per_task) + if (sds.max_load <= sds.busiest_load_per_task) goto out_balanced; /* @@ -3487,17 +3516,18 @@ group_next: * max load less than avg load(as we skip the groups at or below * its cpu_power, while calculating max_load..) */ - if (max_load < avg_load) { + if (sds.max_load < sds.avg_load) { *imbalance = 0; goto small_imbalance; } /* Don't want to pull so many tasks that a group would go idle */ - max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); + max_pull = min(sds.max_load - sds.avg_load, + sds.max_load - sds.busiest_load_per_task); /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * busiest->__cpu_power, - (avg_load - this_load) * this->__cpu_power) + *imbalance = min(max_pull * sds.busiest->__cpu_power, + (sds.avg_load - sds.this_load) * sds.this->__cpu_power) / SCHED_LOAD_SCALE; /* @@ -3506,24 +3536,27 @@ group_next: * a think about bumping its value to force at least one task to be * moved */ - if (*imbalance < busiest_load_per_task) { + if (*imbalance < sds.busiest_load_per_task) { unsigned long tmp, pwr_now, pwr_move; unsigned int imbn; small_imbalance: pwr_move = pwr_now = 0; imbn = 2; - if (this_nr_running) { - this_load_per_task /= this_nr_running; - if (busiest_load_per_task > this_load_per_task) + if (sds.this_nr_running) { + sds.this_load_per_task /= sds.this_nr_running; + if (sds.busiest_load_per_task > + sds.this_load_per_task) imbn = 1; } else - this_load_per_task = cpu_avg_load_per_task(this_cpu); - - if (max_load - this_load + busiest_load_per_task >= - busiest_load_per_task * imbn) { - *imbalance = busiest_load_per_task; - return busiest; + sds.this_load_per_task = + cpu_avg_load_per_task(this_cpu); + + if (sds.max_load - sds.this_load + + sds.busiest_load_per_task >= + sds.busiest_load_per_task * imbn) { + *imbalance = sds.busiest_load_per_task; + return sds.busiest; } /* @@ -3532,52 +3565,54 @@ small_imbalance: * moving them. */ - pwr_now += busiest->__cpu_power * - min(busiest_load_per_task, max_load); - pwr_now += this->__cpu_power * - min(this_load_per_task, this_load); + pwr_now += sds.busiest->__cpu_power * + min(sds.busiest_load_per_task, sds.max_load); + pwr_now += sds.this->__cpu_power * + min(sds.this_load_per_task, sds.this_load); pwr_now /= SCHED_LOAD_SCALE; /* Amount of load we'd subtract */ - tmp = sg_div_cpu_power(busiest, - busiest_load_per_task * SCHED_LOAD_SCALE); - if (max_load > tmp) - pwr_move += busiest->__cpu_power * - min(busiest_load_per_task, max_load - tmp); + tmp = sg_div_cpu_power(sds.busiest, + sds.busiest_load_per_task * SCHED_LOAD_SCALE); + if (sds.max_load > tmp) + pwr_move += sds.busiest->__cpu_power * + min(sds.busiest_load_per_task, + sds.max_load - tmp); /* Amount of load we'd add */ - if (max_load * busiest->__cpu_power < - busiest_load_per_task * SCHED_LOAD_SCALE) - tmp = sg_div_cpu_power(this, - max_load * busiest->__cpu_power); + if (sds.max_load * sds.busiest->__cpu_power < + sds.busiest_load_per_task * SCHED_LOAD_SCALE) + tmp = sg_div_cpu_power(sds.this, + sds.max_load * sds.busiest->__cpu_power); else - tmp = sg_div_cpu_power(this, - busiest_load_per_task * SCHED_LOAD_SCALE); - pwr_move += this->__cpu_power * - min(this_load_per_task, this_load + tmp); + tmp = sg_div_cpu_power(sds.this, + sds.busiest_load_per_task * SCHED_LOAD_SCALE); + pwr_move += sds.this->__cpu_power * + min(sds.this_load_per_task, + sds.this_load + tmp); pwr_move /= SCHED_LOAD_SCALE; /* Move if we gain throughput */ if (pwr_move > pwr_now) - *imbalance = busiest_load_per_task; + *imbalance = sds.busiest_load_per_task; } - return busiest; + return sds.busiest; out_balanced: #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) goto ret; - if (this != group_leader || group_leader == group_min) + if (sds.this != sds.group_leader || sds.group_leader == sds.group_min) goto ret; - *imbalance = min_load_per_task; + *imbalance = sds.min_load_per_task; if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = - group_first_cpu(group_leader); + group_first_cpu(sds.group_leader); } - return group_min; + return sds.group_min; #endif ret: -- cgit v1.2.2 From 37abe198b1246ddd206319c43502a687db62d347 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:44:01 +0530 Subject: sched: Create a helper function to calculate sched_domain stats for fbg() Impact: cleanup Create a helper function named update_sd_lb_stats() to update the various sched_domain related statistics in find_busiest_group(). With this we would have moved all the statistics computation out of find_busiest_group(). Credit: Vaidyanathan Srinivasan Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao LKML-Reference: <20090325091401.13992.88737.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 117 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 73 insertions(+), 44 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 8198dbe8e4aa..ec715f97202e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3365,32 +3365,33 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; } -/******* find_busiest_group() helpers end here *********************/ -/* - * find_busiest_group finds and returns the busiest CPU group within the - * domain. It calculates and returns the amount of weighted load which - * should be moved to restore balance via the imbalance parameter. +/** + * update_sd_lb_stats - Update sched_group's statistics for load balancing. + * @sd: sched_domain whose statistics are to be updated. + * @this_cpu: Cpu for which load balance is currently performed. + * @idle: Idle status of this_cpu + * @sd_idle: Idle status of the sched_domain containing group. + * @cpus: Set of cpus considered for load balancing. + * @balance: Should we balance. + * @sds: variable to hold the statistics for this sched_domain. */ -static struct sched_group * -find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, const struct cpumask *cpus, int *balance) +static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, + enum cpu_idle_type idle, int *sd_idle, + const struct cpumask *cpus, int *balance, + struct sd_lb_stats *sds) { - struct sd_lb_stats sds; struct sched_group *group = sd->groups; - unsigned long max_pull; + struct sg_lb_stats sgs; int load_idx; - memset(&sds, 0, sizeof(sds)); #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - sds.power_savings_balance = 1; - sds.min_nr_running = ULONG_MAX; + sds->power_savings_balance = 1; + sds->min_nr_running = ULONG_MAX; #endif load_idx = get_sd_load_idx(sd, idle); do { - struct sg_lb_stats sgs; int local_group; local_group = cpumask_test_cpu(this_cpu, @@ -3399,25 +3400,25 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, local_group, cpus, balance, &sgs); - if (balance && !(*balance)) - goto ret; + if (local_group && balance && !(*balance)) + return; - sds.total_load += sgs.group_load; - sds.total_pwr += group->__cpu_power; + sds->total_load += sgs.group_load; + sds->total_pwr += group->__cpu_power; if (local_group) { - sds.this_load = sgs.avg_load; - sds.this = group; - sds.this_nr_running = sgs.sum_nr_running; - sds.this_load_per_task = sgs.sum_weighted_load; - } else if (sgs.avg_load > sds.max_load && + sds->this_load = sgs.avg_load; + sds->this = group; + sds->this_nr_running = sgs.sum_nr_running; + sds->this_load_per_task = sgs.sum_weighted_load; + } else if (sgs.avg_load > sds->max_load && (sgs.sum_nr_running > sgs.group_capacity || sgs.group_imb)) { - sds.max_load = sgs.avg_load; - sds.busiest = group; - sds.busiest_nr_running = sgs.sum_nr_running; - sds.busiest_load_per_task = sgs.sum_weighted_load; - sds.group_imb = sgs.group_imb; + sds->max_load = sgs.avg_load; + sds->busiest = group; + sds->busiest_nr_running = sgs.sum_nr_running; + sds->busiest_load_per_task = sgs.sum_weighted_load; + sds->group_imb = sgs.group_imb; } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) @@ -3434,15 +3435,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * no need to do power savings balance at this domain */ if (local_group && - (sds.this_nr_running >= sgs.group_capacity || - !sds.this_nr_running)) - sds.power_savings_balance = 0; + (sds->this_nr_running >= sgs.group_capacity || + !sds->this_nr_running)) + sds->power_savings_balance = 0; /* * If a group is already running at full capacity or idle, * don't include that group in power savings calculations */ - if (!sds.power_savings_balance || + if (!sds->power_savings_balance || sgs.sum_nr_running >= sgs.group_capacity || !sgs.sum_nr_running) goto group_next; @@ -3452,13 +3453,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * This is the group from where we need to pick up the load * for saving power */ - if ((sgs.sum_nr_running < sds.min_nr_running) || - (sgs.sum_nr_running == sds.min_nr_running && + if ((sgs.sum_nr_running < sds->min_nr_running) || + (sgs.sum_nr_running == sds->min_nr_running && group_first_cpu(group) > - group_first_cpu(sds.group_min))) { - sds.group_min = group; - sds.min_nr_running = sgs.sum_nr_running; - sds.min_load_per_task = sgs.sum_weighted_load / + group_first_cpu(sds->group_min))) { + sds->group_min = group; + sds->min_nr_running = sgs.sum_nr_running; + sds->min_load_per_task = sgs.sum_weighted_load / sgs.sum_nr_running; } @@ -3470,18 +3471,46 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sgs.sum_nr_running > sgs.group_capacity - 1) goto group_next; - if (sgs.sum_nr_running > sds.leader_nr_running || - (sgs.sum_nr_running == sds.leader_nr_running && + if (sgs.sum_nr_running > sds->leader_nr_running || + (sgs.sum_nr_running == sds->leader_nr_running && group_first_cpu(group) < - group_first_cpu(sds.group_leader))) { - sds.group_leader = group; - sds.leader_nr_running = sgs.sum_nr_running; + group_first_cpu(sds->group_leader))) { + sds->group_leader = group; + sds->leader_nr_running = sgs.sum_nr_running; } group_next: #endif group = group->next; } while (group != sd->groups); +} +/******* find_busiest_group() helpers end here *********************/ + +/* + * find_busiest_group finds and returns the busiest CPU group within the + * domain. It calculates and returns the amount of weighted load which + * should be moved to restore balance via the imbalance parameter. + */ +static struct sched_group * +find_busiest_group(struct sched_domain *sd, int this_cpu, + unsigned long *imbalance, enum cpu_idle_type idle, + int *sd_idle, const struct cpumask *cpus, int *balance) +{ + struct sd_lb_stats sds; + unsigned long max_pull; + + memset(&sds, 0, sizeof(sds)); + + /* + * Compute the various statistics relavent for load balancing at + * this level. + */ + update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, + balance, &sds); + + if (balance && !(*balance)) + goto ret; + if (!sds.busiest || sds.this_load >= sds.max_load || sds.busiest_nr_running == 0) goto out_balanced; -- cgit v1.2.2 From 2e6f44aeda426054fc58464df1ad571aecca0c92 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:44:06 +0530 Subject: sched: Create helper to calculate small_imbalance in fbg() Impact: cleanup We have two places in find_busiest_group() where we need to calculate the minor imbalance before returning the busiest group. Encapsulate this functionality into a seperate helper function. Credit: Vaidyanathan Srinivasan Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao LKML-Reference: <20090325091406.13992.54316.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 131 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 70 insertions(+), 61 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index ec715f97202e..540147e5e82b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3484,6 +3484,71 @@ group_next: } while (group != sd->groups); } + +/** + * fix_small_imbalance - Calculate the minor imbalance that exists + * amongst the groups of a sched_domain, during + * load balancing. + * @sds: Statistics of the sched_domain whose imbalance is to be calculated. + * @this_cpu: The cpu at whose sched_domain we're performing load-balance. + * @imbalance: Variable to store the imbalance. + */ +static inline void fix_small_imbalance(struct sd_lb_stats *sds, + int this_cpu, unsigned long *imbalance) +{ + unsigned long tmp, pwr_now = 0, pwr_move = 0; + unsigned int imbn = 2; + + if (sds->this_nr_running) { + sds->this_load_per_task /= sds->this_nr_running; + if (sds->busiest_load_per_task > + sds->this_load_per_task) + imbn = 1; + } else + sds->this_load_per_task = + cpu_avg_load_per_task(this_cpu); + + if (sds->max_load - sds->this_load + sds->busiest_load_per_task >= + sds->busiest_load_per_task * imbn) { + *imbalance = sds->busiest_load_per_task; + return; + } + + /* + * OK, we don't have enough imbalance to justify moving tasks, + * however we may be able to increase total CPU power used by + * moving them. + */ + + pwr_now += sds->busiest->__cpu_power * + min(sds->busiest_load_per_task, sds->max_load); + pwr_now += sds->this->__cpu_power * + min(sds->this_load_per_task, sds->this_load); + pwr_now /= SCHED_LOAD_SCALE; + + /* Amount of load we'd subtract */ + tmp = sg_div_cpu_power(sds->busiest, + sds->busiest_load_per_task * SCHED_LOAD_SCALE); + if (sds->max_load > tmp) + pwr_move += sds->busiest->__cpu_power * + min(sds->busiest_load_per_task, sds->max_load - tmp); + + /* Amount of load we'd add */ + if (sds->max_load * sds->busiest->__cpu_power < + sds->busiest_load_per_task * SCHED_LOAD_SCALE) + tmp = sg_div_cpu_power(sds->this, + sds->max_load * sds->busiest->__cpu_power); + else + tmp = sg_div_cpu_power(sds->this, + sds->busiest_load_per_task * SCHED_LOAD_SCALE); + pwr_move += sds->this->__cpu_power * + min(sds->this_load_per_task, sds->this_load + tmp); + pwr_move /= SCHED_LOAD_SCALE; + + /* Move if we gain throughput */ + if (pwr_move > pwr_now) + *imbalance = sds->busiest_load_per_task; +} /******* find_busiest_group() helpers end here *********************/ /* @@ -3547,7 +3612,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, */ if (sds.max_load < sds.avg_load) { *imbalance = 0; - goto small_imbalance; + fix_small_imbalance(&sds, this_cpu, imbalance); + goto ret_busiest; } /* Don't want to pull so many tasks that a group would go idle */ @@ -3565,67 +3631,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * a think about bumping its value to force at least one task to be * moved */ - if (*imbalance < sds.busiest_load_per_task) { - unsigned long tmp, pwr_now, pwr_move; - unsigned int imbn; - -small_imbalance: - pwr_move = pwr_now = 0; - imbn = 2; - if (sds.this_nr_running) { - sds.this_load_per_task /= sds.this_nr_running; - if (sds.busiest_load_per_task > - sds.this_load_per_task) - imbn = 1; - } else - sds.this_load_per_task = - cpu_avg_load_per_task(this_cpu); - - if (sds.max_load - sds.this_load + - sds.busiest_load_per_task >= - sds.busiest_load_per_task * imbn) { - *imbalance = sds.busiest_load_per_task; - return sds.busiest; - } - - /* - * OK, we don't have enough imbalance to justify moving tasks, - * however we may be able to increase total CPU power used by - * moving them. - */ - - pwr_now += sds.busiest->__cpu_power * - min(sds.busiest_load_per_task, sds.max_load); - pwr_now += sds.this->__cpu_power * - min(sds.this_load_per_task, sds.this_load); - pwr_now /= SCHED_LOAD_SCALE; - - /* Amount of load we'd subtract */ - tmp = sg_div_cpu_power(sds.busiest, - sds.busiest_load_per_task * SCHED_LOAD_SCALE); - if (sds.max_load > tmp) - pwr_move += sds.busiest->__cpu_power * - min(sds.busiest_load_per_task, - sds.max_load - tmp); - - /* Amount of load we'd add */ - if (sds.max_load * sds.busiest->__cpu_power < - sds.busiest_load_per_task * SCHED_LOAD_SCALE) - tmp = sg_div_cpu_power(sds.this, - sds.max_load * sds.busiest->__cpu_power); - else - tmp = sg_div_cpu_power(sds.this, - sds.busiest_load_per_task * SCHED_LOAD_SCALE); - pwr_move += sds.this->__cpu_power * - min(sds.this_load_per_task, - sds.this_load + tmp); - pwr_move /= SCHED_LOAD_SCALE; - - /* Move if we gain throughput */ - if (pwr_move > pwr_now) - *imbalance = sds.busiest_load_per_task; - } + if (*imbalance < sds.busiest_load_per_task) + fix_small_imbalance(&sds, this_cpu, imbalance); +ret_busiest: return sds.busiest; out_balanced: -- cgit v1.2.2 From dbc523a3b86f9e1765b5e70e6886913b99cc5cec Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:44:12 +0530 Subject: sched: Create a helper function to calculate imbalance Move all the imbalance calculation out of find_busiest_group() through this helper function. With this change, the structure of find_busiest_group() will be as follows: - update_sched_domain_statistics. - check if imbalance exits. - update imbalance and return busiest. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao Cc: "Vaidyanathan Srinivasan" LKML-Reference: <20090325091411.13992.43293.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 78 +++++++++++++++++++++++++++++++++------------------------- 1 file changed, 45 insertions(+), 33 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 540147e5e82b..934f615ccceb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3487,8 +3487,8 @@ group_next: /** * fix_small_imbalance - Calculate the minor imbalance that exists - * amongst the groups of a sched_domain, during - * load balancing. + * amongst the groups of a sched_domain, during + * load balancing. * @sds: Statistics of the sched_domain whose imbalance is to be calculated. * @this_cpu: The cpu at whose sched_domain we're performing load-balance. * @imbalance: Variable to store the imbalance. @@ -3549,6 +3549,47 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, if (pwr_move > pwr_now) *imbalance = sds->busiest_load_per_task; } + +/** + * calculate_imbalance - Calculate the amount of imbalance present within the + * groups of a given sched_domain during load balance. + * @sds: statistics of the sched_domain whose imbalance is to be calculated. + * @this_cpu: Cpu for which currently load balance is being performed. + * @imbalance: The variable to store the imbalance. + */ +static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, + unsigned long *imbalance) +{ + unsigned long max_pull; + /* + * In the presence of smp nice balancing, certain scenarios can have + * max load less than avg load(as we skip the groups at or below + * its cpu_power, while calculating max_load..) + */ + if (sds->max_load < sds->avg_load) { + *imbalance = 0; + return fix_small_imbalance(sds, this_cpu, imbalance); + } + + /* Don't want to pull so many tasks that a group would go idle */ + max_pull = min(sds->max_load - sds->avg_load, + sds->max_load - sds->busiest_load_per_task); + + /* How much load to actually move to equalise the imbalance */ + *imbalance = min(max_pull * sds->busiest->__cpu_power, + (sds->avg_load - sds->this_load) * sds->this->__cpu_power) + / SCHED_LOAD_SCALE; + + /* + * if *imbalance is less than the average load per runnable task + * there is no gaurantee that any tasks will be moved so we'll have + * a think about bumping its value to force at least one task to be + * moved + */ + if (*imbalance < sds->busiest_load_per_task) + return fix_small_imbalance(sds, this_cpu, imbalance); + +} /******* find_busiest_group() helpers end here *********************/ /* @@ -3562,7 +3603,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, int *sd_idle, const struct cpumask *cpus, int *balance) { struct sd_lb_stats sds; - unsigned long max_pull; memset(&sds, 0, sizeof(sds)); @@ -3605,36 +3645,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sds.max_load <= sds.busiest_load_per_task) goto out_balanced; - /* - * In the presence of smp nice balancing, certain scenarios can have - * max load less than avg load(as we skip the groups at or below - * its cpu_power, while calculating max_load..) - */ - if (sds.max_load < sds.avg_load) { - *imbalance = 0; - fix_small_imbalance(&sds, this_cpu, imbalance); - goto ret_busiest; - } - - /* Don't want to pull so many tasks that a group would go idle */ - max_pull = min(sds.max_load - sds.avg_load, - sds.max_load - sds.busiest_load_per_task); - - /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * sds.busiest->__cpu_power, - (sds.avg_load - sds.this_load) * sds.this->__cpu_power) - / SCHED_LOAD_SCALE; - - /* - * if *imbalance is less than the average load per runnable task - * there is no gaurantee that any tasks will be moved so we'll have - * a think about bumping its value to force at least one task to be - * moved - */ - if (*imbalance < sds.busiest_load_per_task) - fix_small_imbalance(&sds, this_cpu, imbalance); - -ret_busiest: + /* Looks like there is an imbalance. Compute it */ + calculate_imbalance(&sds, this_cpu, imbalance); return sds.busiest; out_balanced: -- cgit v1.2.2 From a021dc03376707c55a3483e32c16b8986d4414cc Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:44:17 +0530 Subject: sched: Optimize the !power_savings_balance during fbg() Impact: cleanup, micro-optimization We don't need to perform power_savings balance if either the cpu is NOT_IDLE or if the sched_domain doesn't contain the SD_POWERSAVINGS_BALANCE flag set. Currently, we check for these conditions multiple number of times, even though these variables don't change over the scope of find_busiest_group(). Check once, and store the value in the already exiting "power_savings_balance" variable. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao Cc: "Vaidyanathan Srinivasan" LKML-Reference: <20090325091417.13992.2657.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 934f615ccceb..71e8dcaf2c79 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3386,8 +3386,17 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, int load_idx; #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - sds->power_savings_balance = 1; - sds->min_nr_running = ULONG_MAX; + /* + * Busy processors will not participate in power savings + * balance. + */ + if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + sds->power_savings_balance = 0; + else { + sds->power_savings_balance = 1; + sds->min_nr_running = ULONG_MAX; + sds->leader_nr_running = 0; + } #endif load_idx = get_sd_load_idx(sd, idle); @@ -3422,12 +3431,8 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - /* - * Busy processors will not participate in power savings - * balance. - */ - if (idle == CPU_NOT_IDLE || - !(sd->flags & SD_POWERSAVINGS_BALANCE)) + + if (!sds->power_savings_balance) goto group_next; /* @@ -3651,7 +3656,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, out_balanced: #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + if (!sds.power_savings_balance) goto ret; if (sds.this != sds.group_leader || sds.group_leader == sds.group_min) -- cgit v1.2.2 From c071df18525a95b37dd5821a6dc4af83bd18675e Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:44:22 +0530 Subject: sched: Refactor the power savings balance code Impact: cleanup Create seperate helper functions to initialize the power-savings-balance related variables, to update them and to check if we have a scope for performing power-savings balance. Add no-op inline functions for the !(CONFIG_SCHED_MC || CONFIG_SCHED_SMT) case. This will eliminate all the #ifdef jungle in find_busiest_group() and the other helper functions. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao Cc: "Vaidyanathan Srinivasan" LKML-Reference: <20090325091422.13992.73616.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 236 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 153 insertions(+), 83 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 71e8dcaf2c79..5f21658b0f67 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3270,6 +3270,151 @@ static inline int get_sd_load_idx(struct sched_domain *sd, } +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +/** + * init_sd_power_savings_stats - Initialize power savings statistics for + * the given sched_domain, during load balancing. + * + * @sd: Sched domain whose power-savings statistics are to be initialized. + * @sds: Variable containing the statistics for sd. + * @idle: Idle status of the CPU at which we're performing load-balancing. + */ +static inline void init_sd_power_savings_stats(struct sched_domain *sd, + struct sd_lb_stats *sds, enum cpu_idle_type idle) +{ + /* + * Busy processors will not participate in power savings + * balance. + */ + if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + sds->power_savings_balance = 0; + else { + sds->power_savings_balance = 1; + sds->min_nr_running = ULONG_MAX; + sds->leader_nr_running = 0; + } +} + +/** + * update_sd_power_savings_stats - Update the power saving stats for a + * sched_domain while performing load balancing. + * + * @group: sched_group belonging to the sched_domain under consideration. + * @sds: Variable containing the statistics of the sched_domain + * @local_group: Does group contain the CPU for which we're performing + * load balancing ? + * @sgs: Variable containing the statistics of the group. + */ +static inline void update_sd_power_savings_stats(struct sched_group *group, + struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) +{ + + if (!sds->power_savings_balance) + return; + + /* + * If the local group is idle or completely loaded + * no need to do power savings balance at this domain + */ + if (local_group && (sds->this_nr_running >= sgs->group_capacity || + !sds->this_nr_running)) + sds->power_savings_balance = 0; + + /* + * If a group is already running at full capacity or idle, + * don't include that group in power savings calculations + */ + if (!sds->power_savings_balance || + sgs->sum_nr_running >= sgs->group_capacity || + !sgs->sum_nr_running) + return; + + /* + * Calculate the group which has the least non-idle load. + * This is the group from where we need to pick up the load + * for saving power + */ + if ((sgs->sum_nr_running < sds->min_nr_running) || + (sgs->sum_nr_running == sds->min_nr_running && + group_first_cpu(group) > group_first_cpu(sds->group_min))) { + sds->group_min = group; + sds->min_nr_running = sgs->sum_nr_running; + sds->min_load_per_task = sgs->sum_weighted_load / + sgs->sum_nr_running; + } + + /* + * Calculate the group which is almost near its + * capacity but still has some space to pick up some load + * from other group and save more power + */ + if (sgs->sum_nr_running > sgs->group_capacity - 1) + return; + + if (sgs->sum_nr_running > sds->leader_nr_running || + (sgs->sum_nr_running == sds->leader_nr_running && + group_first_cpu(group) < group_first_cpu(sds->group_leader))) { + sds->group_leader = group; + sds->leader_nr_running = sgs->sum_nr_running; + } +} + +/** + * check_power_save_busiest_group - Check if we have potential to perform + * some power-savings balance. If yes, set the busiest group to be + * the least loaded group in the sched_domain, so that it's CPUs can + * be put to idle. + * + * @sds: Variable containing the statistics of the sched_domain + * under consideration. + * @this_cpu: Cpu at which we're currently performing load-balancing. + * @imbalance: Variable to store the imbalance. + * + * Returns 1 if there is potential to perform power-savings balance. + * Else returns 0. + */ +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, + int this_cpu, unsigned long *imbalance) +{ + if (!sds->power_savings_balance) + return 0; + + if (sds->this != sds->group_leader || + sds->group_leader == sds->group_min) + return 0; + + *imbalance = sds->min_load_per_task; + sds->busiest = sds->group_min; + + if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { + cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = + group_first_cpu(sds->group_leader); + } + + return 1; + +} +#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ +static inline void init_sd_power_savings_stats(struct sched_domain *sd, + struct sd_lb_stats *sds, enum cpu_idle_type idle) +{ + return; +} + +static inline void update_sd_power_savings_stats(struct sched_group *group, + struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) +{ + return; +} + +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, + int this_cpu, unsigned long *imbalance) +{ + return 0; +} +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ + + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @group: sched_group whose statistics are to be updated. @@ -3385,19 +3530,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, struct sg_lb_stats sgs; int load_idx; -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - /* - * Busy processors will not participate in power savings - * balance. - */ - if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) - sds->power_savings_balance = 0; - else { - sds->power_savings_balance = 1; - sds->min_nr_running = ULONG_MAX; - sds->leader_nr_running = 0; - } -#endif + init_sd_power_savings_stats(sd, sds, idle); load_idx = get_sd_load_idx(sd, idle); do { @@ -3430,61 +3563,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, sds->group_imb = sgs.group_imb; } -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - - if (!sds->power_savings_balance) - goto group_next; - - /* - * If the local group is idle or completely loaded - * no need to do power savings balance at this domain - */ - if (local_group && - (sds->this_nr_running >= sgs.group_capacity || - !sds->this_nr_running)) - sds->power_savings_balance = 0; - - /* - * If a group is already running at full capacity or idle, - * don't include that group in power savings calculations - */ - if (!sds->power_savings_balance || - sgs.sum_nr_running >= sgs.group_capacity || - !sgs.sum_nr_running) - goto group_next; - - /* - * Calculate the group which has the least non-idle load. - * This is the group from where we need to pick up the load - * for saving power - */ - if ((sgs.sum_nr_running < sds->min_nr_running) || - (sgs.sum_nr_running == sds->min_nr_running && - group_first_cpu(group) > - group_first_cpu(sds->group_min))) { - sds->group_min = group; - sds->min_nr_running = sgs.sum_nr_running; - sds->min_load_per_task = sgs.sum_weighted_load / - sgs.sum_nr_running; - } - - /* - * Calculate the group which is almost near its - * capacity but still has some space to pick up some load - * from other group and save more power - */ - if (sgs.sum_nr_running > sgs.group_capacity - 1) - goto group_next; - - if (sgs.sum_nr_running > sds->leader_nr_running || - (sgs.sum_nr_running == sds->leader_nr_running && - group_first_cpu(group) < - group_first_cpu(sds->group_leader))) { - sds->group_leader = group; - sds->leader_nr_running = sgs.sum_nr_running; - } -group_next: -#endif + update_sd_power_savings_stats(group, sds, local_group, &sgs); group = group->next; } while (group != sd->groups); @@ -3655,21 +3734,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, return sds.busiest; out_balanced: -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - if (!sds.power_savings_balance) - goto ret; - - if (sds.this != sds.group_leader || sds.group_leader == sds.group_min) - goto ret; - - *imbalance = sds.min_load_per_task; - if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { - cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = - group_first_cpu(sds.group_leader); - } - return sds.group_min; - -#endif + /* + * There is no obvious imbalance. But check if we can do some balancing + * to save power. + */ + if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) + return sds.busiest; ret: *imbalance = 0; return NULL; -- cgit v1.2.2 From b7bb4c9bb01941fe8feb653f3410e7ed0c9bb786 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:44:27 +0530 Subject: sched: Add comments to find_busiest_group() function Impact: cleanup Add /** style comments around find_busiest_group(). Also add a few explanatory comments. This concludes the find_busiest_group() cleanup. The function is now down to 72 lines from the original 313 lines. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao Cc: "Vaidyanathan Srinivasan" LKML-Reference: <20090325091427.13992.18933.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 50 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 8 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 5f21658b0f67..9f8506d68fdc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3676,10 +3676,30 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, } /******* find_busiest_group() helpers end here *********************/ -/* - * find_busiest_group finds and returns the busiest CPU group within the - * domain. It calculates and returns the amount of weighted load which - * should be moved to restore balance via the imbalance parameter. +/** + * find_busiest_group - Returns the busiest group within the sched_domain + * if there is an imbalance. If there isn't an imbalance, and + * the user has opted for power-savings, it returns a group whose + * CPUs can be put to idle by rebalancing those tasks elsewhere, if + * such a group exists. + * + * Also calculates the amount of weighted load which should be moved + * to restore balance. + * + * @sd: The sched_domain whose busiest group is to be returned. + * @this_cpu: The cpu for which load balancing is currently being performed. + * @imbalance: Variable which stores amount of weighted load which should + * be moved to restore balance/put a group to idle. + * @idle: The idle status of this_cpu. + * @sd_idle: The idleness of sd + * @cpus: The set of CPUs under consideration for load-balancing. + * @balance: Pointer to a variable indicating if this_cpu + * is the appropriate cpu to perform load balancing at this_level. + * + * Returns: - the busiest group if imbalance exists. + * - If no imbalance and user has opted for power-savings balance, + * return the least loaded group whose CPUs can be + * put to idle by rebalancing its tasks onto our group. */ static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, @@ -3697,17 +3717,31 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, balance, &sds); + /* Cases where imbalance does not exist from POV of this_cpu */ + /* 1) this_cpu is not the appropriate cpu to perform load balancing + * at this level. + * 2) There is no busy sibling group to pull from. + * 3) This group is the busiest group. + * 4) This group is more busy than the avg busieness at this + * sched_domain. + * 5) The imbalance is within the specified limit. + * 6) Any rebalance would lead to ping-pong + */ if (balance && !(*balance)) goto ret; - if (!sds.busiest || sds.this_load >= sds.max_load - || sds.busiest_nr_running == 0) + if (!sds.busiest || sds.busiest_nr_running == 0) + goto out_balanced; + + if (sds.this_load >= sds.max_load) goto out_balanced; sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; - if (sds.this_load >= sds.avg_load || - 100*sds.max_load <= sd->imbalance_pct * sds.this_load) + if (sds.this_load >= sds.avg_load) + goto out_balanced; + + if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) goto out_balanced; sds.busiest_load_per_task /= sds.busiest_nr_running; -- cgit v1.2.2 From d5ac537e5fb6fc12384c9f3ed6a15e912dfbbc2a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 28 Mar 2009 21:52:47 -0700 Subject: sched: fix errors in struct & function comments Fix kernel-doc errors in sched.c: the structs don't have kernel-doc notation and the short function description needs to be one line only. Error(kernel/sched.c:3197): cannot understand prototype: 'struct sd_lb_stats ' Error(kernel/sched.c:3228): cannot understand prototype: 'struct sg_lb_stats ' Error(kernel/sched.c:3375): duplicate section name 'Description' Signed-off-by: Randy Dunlap cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/sched.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f4c413bdd38d..5757e03cfac0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3190,7 +3190,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } /********** Helpers for find_busiest_group ************************/ -/** +/* * sd_lb_stats - Structure to store the statistics of a sched_domain * during load balancing. */ @@ -3222,7 +3222,7 @@ struct sd_lb_stats { #endif }; -/** +/* * sg_lb_stats - stats of a sched_group required for load_balancing */ struct sg_lb_stats { @@ -3360,16 +3360,17 @@ static inline void update_sd_power_savings_stats(struct sched_group *group, } /** - * check_power_save_busiest_group - Check if we have potential to perform - * some power-savings balance. If yes, set the busiest group to be - * the least loaded group in the sched_domain, so that it's CPUs can - * be put to idle. - * + * check_power_save_busiest_group - see if there is potential for some power-savings balance * @sds: Variable containing the statistics of the sched_domain * under consideration. * @this_cpu: Cpu at which we're currently performing load-balancing. * @imbalance: Variable to store the imbalance. * + * Description: + * Check if we have potential to perform some power-savings balance. + * If yes, set the busiest group to be the least loaded group in the + * sched_domain, so that it's CPUs can be put to idle. + * * Returns 1 if there is potential to perform power-savings balance. * Else returns 0. */ -- cgit v1.2.2 From 4ede816ac36e027db5fe0051ad9c73f76db63772 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Tue, 31 Mar 2009 15:24:20 -0700 Subject: epoll keyed wakeups: add __wake_up_locked_key() and __wake_up_sync_key() This patchset introduces wakeup hints for some of the most popular (from epoll POV) devices, so that epoll code can avoid spurious wakeups on its waiters. The problem with epoll is that the callback-based wakeups do not, ATM, carry any information about the events the wakeup is related to. So the only choice epoll has (not being able to call f_op->poll() from inside the callback), is to add the file* to a ready-list and resolve the real events later on, at epoll_wait() (or its own f_op->poll()) time. This can cause spurious wakeups, since the wake_up() itself might be for an event the caller is not interested into. The rate of these spurious wakeup can be pretty high in case of many network sockets being monitored. By allowing devices to report the events the wakeups refer to (at least the two major classes - POLLIN/POLLOUT), we are able to spare useless wakeups by proper handling inside the epoll's poll callback. Epoll will have in any case to call f_op->poll() on the file* later on, since the change to be done in order to have the full event set sent via wakeup, is too invasive for the way our f_op->poll() system works (the full event set is calculated inside the poll function - there are too many of them to even start thinking the change - also poll/select would need change too). Epoll is changed in a way that both devices which send event hints, and the ones that don't, are correctly handled. The former will gain some efficiency though. As a general rule for devices, would be to add an event mask by using key-aware wakeup macros, when making up poll wait queues. I tested it (together with the epoll's poll fix patch Andrew has in -mm) and wakeups for the supported devices are correctly filtered. Test program available here: http://www.xmailserver.org/epoll_test.c This patch: Nothing revolutionary here. Just using the available "key" that our wakeup core already support. The __wake_up_locked_key() was no brainer, since both __wake_up_locked() and __wake_up_locked_key() are thin wrappers around __wake_up_common(). The __wake_up_sync() function had a body, so the choice was between borrowing the body for __wake_up_sync_key() and calling it from __wake_up_sync(), or make an inline and calling it from both. I chose the former since in most archs it all resolves to "mov $0, REG; jmp ADDR". Signed-off-by: Davide Libenzi Cc: Alan Cox Cc: Ingo Molnar Cc: David Miller Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 196d48babbef..73513f4e19df 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5196,11 +5196,17 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) __wake_up_common(q, mode, 1, 0, NULL); } +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) +{ + __wake_up_common(q, mode, 1, 0, key); +} + /** - * __wake_up_sync - wake up threads blocked on a waitqueue. + * __wake_up_sync_key - wake up threads blocked on a waitqueue. * @q: the waitqueue * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: opaque value to be passed to wakeup targets * * The sync wakeup differs that the waker knows that it will schedule * away soon, so while the target thread will be woken up, it will not @@ -5209,8 +5215,8 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) * * On UP it can prevent extra preemption. */ -void -__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) { unsigned long flags; int sync = 1; @@ -5222,9 +5228,18 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) sync = 0; spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, sync, NULL); + __wake_up_common(q, mode, nr_exclusive, sync, key); spin_unlock_irqrestore(&q->lock, flags); } +EXPORT_SYMBOL_GPL(__wake_up_sync_key); + +/* + * __wake_up_sync - see __wake_up_sync_key() + */ +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ + __wake_up_sync_key(q, mode, nr_exclusive, NULL); +} EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ /** -- cgit v1.2.2