author     Nick Piggin <npiggin@suse.de>          2006-02-10 04:51:02 -0500
committer  Linus Torvalds <torvalds@g5.osdl.org>  2006-02-10 11:13:11 -0500
commit     a2000572ad511f5f43091ed7bd2cc3b913104a1e
tree       56279392dece8e11c33d3754854551aa6773095b /kernel/sched.c
parent     4b0955a6edb9b058ca1314ca210a92ee166c4d9a
[PATCH] sched: remove smpnice
I don't think the code is quite ready, which is why I asked for Peter's
additions to also be merged before I acked it (although it turned out that
it still isn't quite ready with his additions either).
Basically I have had observations similar to Suresh's, in that it does not
play nicely with the rest of the balancing infrastructure (and I raised
similar concerns in my review).
The samples (group of 4) I got for "maximum recorded imbalance" on a 2x2
SMP+HT Xeon are as follows:
           | Following boot | hackbench 20        | hackbench 40
-----------+----------------+---------------------+---------------------
2.6.16-rc2 | 30,37,100,112  | 5600,5530,6020,6090 | 6390,7090,8760,8470
+nosmpnice |  3, 2, 4, 2    |   28, 150, 294, 132 |  348, 348, 294, 347
Hackbench raw performance is down around 15% with smpnice (but that in
itself isn't a huge deal because it is just a benchmark). However, the
samples show that the imbalance passed into move_tasks is increased by
about a factor of 10-30. I think this would also go some way to explaining
latency blips turning up in the balancing code (though I haven't actually
measured that).
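To put a number on the bias itself: each queued nice-0 task adds
MAX_PRIO - NICE_TO_PRIO(0) = 140 - 120 = 20 to rq->prio_bias, so for a
runqueue of ordinary nice-0 tasks the old __source_load()/__target_load()
scaling reports roughly 20x the raw load.  A minimal userspace sketch of
that arithmetic (illustration only, not kernel code; the constants follow
the 2.6.16-era scheduler headers, and the exact SCHED_LOAD_SCALE value
cancels out of the ratio):

  /*
   * Illustration only -- not part of the patch.  Assumes 2.6.16-era
   * constants: MAX_RT_PRIO = 100, MAX_PRIO = 140, nice 0 -> static_prio
   * 120, so each nice-0 task contributes 20 to prio_bias.
   */
  #include <stdio.h>

  #define MAX_PRIO           140
  #define NICE_TO_PRIO(nice) (100 + (nice) + 20)
  #define SCHED_LOAD_SCALE   128UL  /* value immaterial; it cancels below */

  int main(void)
  {
          unsigned long running   = 4;  /* nice-0 tasks on the queue */
          unsigned long prio_bias = running * (MAX_PRIO - NICE_TO_PRIO(0));
          unsigned long load_now  = running * SCHED_LOAD_SCALE;

          /* the scaling the removed __source_load()/__target_load() applied */
          unsigned long biased = load_now * prio_bias / running;

          printf("raw load %lu, biased load %lu (x%lu)\n",
                 load_now, biased, biased / load_now);
          return 0;
  }

This prints "raw load 512, biased load 10240 (x20)", the same order of
magnitude as the imbalance blow-up in the table above.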
We'll probably have to revert this in the SUSE kernel.
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Peter Williams <pwil3058@bigpond.net.au>
Cc: "Martin J. Bligh" <mbligh@aracnet.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c | 129
1 file changed, 18 insertions(+), 111 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index bc38804e40dd..87d93be336a1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -215,7 +215,6 @@ struct runqueue {
 	 */
 	unsigned long nr_running;
 #ifdef CONFIG_SMP
-	unsigned long prio_bias;
 	unsigned long cpu_load[3];
 #endif
 	unsigned long long nr_switches;
@@ -669,68 +668,13 @@ static int effective_prio(task_t *p)
 	return prio;
 }
 
-#ifdef CONFIG_SMP
-static inline void inc_prio_bias(runqueue_t *rq, int prio)
-{
-	rq->prio_bias += MAX_PRIO - prio;
-}
-
-static inline void dec_prio_bias(runqueue_t *rq, int prio)
-{
-	rq->prio_bias -= MAX_PRIO - prio;
-}
-
-static inline void inc_nr_running(task_t *p, runqueue_t *rq)
-{
-	rq->nr_running++;
-	if (rt_task(p)) {
-		if (p != rq->migration_thread)
-			/*
-			 * The migration thread does the actual balancing. Do
-			 * not bias by its priority as the ultra high priority
-			 * will skew balancing adversely.
-			 */
-			inc_prio_bias(rq, p->prio);
-	} else
-		inc_prio_bias(rq, p->static_prio);
-}
-
-static inline void dec_nr_running(task_t *p, runqueue_t *rq)
-{
-	rq->nr_running--;
-	if (rt_task(p)) {
-		if (p != rq->migration_thread)
-			dec_prio_bias(rq, p->prio);
-	} else
-		dec_prio_bias(rq, p->static_prio);
-}
-#else
-static inline void inc_prio_bias(runqueue_t *rq, int prio)
-{
-}
-
-static inline void dec_prio_bias(runqueue_t *rq, int prio)
-{
-}
-
-static inline void inc_nr_running(task_t *p, runqueue_t *rq)
-{
-	rq->nr_running++;
-}
-
-static inline void dec_nr_running(task_t *p, runqueue_t *rq)
-{
-	rq->nr_running--;
-}
-#endif
-
 /*
  * __activate_task - move a task to the runqueue.
  */
 static inline void __activate_task(task_t *p, runqueue_t *rq)
 {
 	enqueue_task(p, rq->active);
-	inc_nr_running(p, rq);
+	rq->nr_running++;
 }
 
 /*
@@ -739,7 +683,7 @@ static inline void __activate_task(task_t *p, runqueue_t *rq)
 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 {
 	enqueue_task_head(p, rq->active);
-	inc_nr_running(p, rq);
+	rq->nr_running++;
 }
 
 static int recalc_task_prio(task_t *p, unsigned long long now)
@@ -863,7 +807,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
  */
 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
-	dec_nr_running(p, rq);
+	rq->nr_running--;
 	dequeue_task(p, p->array);
 	p->array = NULL;
 }
@@ -1007,61 +951,27 @@ void kick_process(task_t *p)
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static unsigned long __source_load(int cpu, int type, enum idle_type idle)
+static inline unsigned long source_load(int cpu, int type)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long running = rq->nr_running;
-	unsigned long source_load, cpu_load = rq->cpu_load[type-1],
-		load_now = running * SCHED_LOAD_SCALE;
-
+	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
 	if (type == 0)
-		source_load = load_now;
-	else
-		source_load = min(cpu_load, load_now);
-
-	if (running > 1 || (idle == NOT_IDLE && running))
-		/*
-		 * If we are busy rebalancing the load is biased by
-		 * priority to create 'nice' support across cpus. When
-		 * idle rebalancing we should only bias the source_load if
-		 * there is more than one task running on that queue to
-		 * prevent idle rebalance from trying to pull tasks from a
-		 * queue with only one running task.
-		 */
-		source_load = source_load * rq->prio_bias / running;
+		return load_now;
 
-	return source_load;
-}
-
-static inline unsigned long source_load(int cpu, int type)
-{
-	return __source_load(cpu, type, NOT_IDLE);
+	return min(rq->cpu_load[type-1], load_now);
 }
 
 /*
  * Return a high guess at the load of a migration-target cpu
  */
-static inline unsigned long __target_load(int cpu, int type, enum idle_type idle)
+static inline unsigned long target_load(int cpu, int type)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long running = rq->nr_running;
-	unsigned long target_load, cpu_load = rq->cpu_load[type-1],
-		load_now = running * SCHED_LOAD_SCALE;
-
+	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
 	if (type == 0)
-		target_load = load_now;
-	else
-		target_load = max(cpu_load, load_now);
+		return load_now;
 
-	if (running > 1 || (idle == NOT_IDLE && running))
-		target_load = target_load * rq->prio_bias / running;
-
-	return target_load;
-}
-
-static inline unsigned long target_load(int cpu, int type)
-{
-	return __target_load(cpu, type, NOT_IDLE);
+	return max(rq->cpu_load[type-1], load_now);
 }
 
 /*
@@ -1530,7 +1440,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
 				list_add_tail(&p->run_list, &current->run_list);
 				p->array = current->array;
 				p->array->nr_active++;
-				inc_nr_running(p, rq);
+				rq->nr_running++;
 			}
 			set_need_resched();
 		} else
@@ -1875,9 +1785,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
 	       runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
 {
 	dequeue_task(p, src_array);
-	dec_nr_running(p, src_rq);
+	src_rq->nr_running--;
 	set_task_cpu(p, this_cpu);
-	inc_nr_running(p, this_rq);
+	this_rq->nr_running++;
 	enqueue_task(p, this_array);
 	p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
 				+ this_rq->timestamp_last_tick;
@@ -2056,9 +1966,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
-				load = __target_load(i, load_idx, idle);
+				load = target_load(i, load_idx);
 			else
-				load = __source_load(i, load_idx, idle);
+				load = source_load(i, load_idx);
 
 			avg_load += load;
 		}
@@ -2171,7 +2081,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group,
 	int i;
 
 	for_each_cpu_mask(i, group->cpumask) {
-		load = __source_load(i, 0, idle);
+		load = source_load(i, 0);
 
 		if (load > max_load) {
 			max_load = load;
@@ -3571,10 +3481,8 @@ void set_user_nice(task_t *p, long nice)
 		goto out_unlock;
 	}
 	array = p->array;
-	if (array) {
+	if (array)
 		dequeue_task(p, array);
-		dec_prio_bias(rq, p->static_prio);
-	}
 
 	old_prio = p->prio;
 	new_prio = NICE_TO_PRIO(nice);
@@ -3584,7 +3492,6 @@ void set_user_nice(task_t *p, long nice)
 
 	if (array) {
 		enqueue_task(p, array);
-		inc_prio_bias(rq, p->static_prio);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU: