path: root/kernel/sched.c
author     Nick Piggin <nickpiggin@yahoo.com.au>      2005-06-25 17:57:09 -0400
committer  Linus Torvalds <torvalds@ppc970.osdl.org>  2005-06-25 19:24:41 -0400
commit     3950745131e23472fb5ace2ee4a2093e7590ec69 (patch)
tree       8b3e738f2c11ee3e4c60d8960e7bdd3c006f7154 /kernel/sched.c
parent     16cfb1c04c3cbe3759f339d3333e7e1e7d59712a (diff)
[PATCH] sched: fix SMT scheduling problems
SMT balancing has a couple of problems.

Firstly, active_load_balance is too complex - basically it should be a
dumb helper for when the periodic balancer has determined there is an
imbalance, but gets stuck because the task is running. So rip out all
its "smarts", and just make it move one task to the target CPU.

Secondly, the busy CPU's sched-domain tree was being used for active
balancing. This means that it may not see that nr_balance_failed has
reached a critical level. So use the target CPU's sched-domain tree for
this. We can do this because we hold its runqueue lock.

Lastly, reset nr_balance_failed to a point where we allow cache hot
migration. This will help ensure active load balancing is successful.

Thanks to Suresh Siddha for pointing out these issues.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
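For orientation, the reworked active_load_balance() boils down to the outline
below. This is a simplified sketch distilled from the hunk further down, not
the verbatim kernel code: the BUG_ON sanity check and the schedstat
alb_pushed/alb_failed accounting are condensed for readability.

static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
{
	struct sched_domain *sd;
	int target_cpu = busiest_rq->push_cpu;
	runqueue_t *target_rq = cpu_rq(target_cpu);

	if (busiest_rq->nr_running <= 1)
		return;				/* no task to move */

	/* move a task from busiest_rq to target_rq */
	double_lock_balance(busiest_rq, target_rq);

	/*
	 * Walk the *target* CPU's domain tree (safe because we hold
	 * target_rq->lock), so the nr_balance_failed count that kicked
	 * us is actually visible here.
	 */
	for_each_domain(target_cpu, sd)
		if ((sd->flags & SD_LOAD_BALANCE) &&
		    cpu_isset(busiest_cpu, sd->span))
			break;

	if (sd)
		move_tasks(target_rq, target_cpu, busiest_rq, 1, sd,
			   SCHED_IDLE, NULL);

	spin_unlock(&target_rq->lock);
}

The companion one-liner in load_balance() resets nr_balance_failed to
cache_nice_tries+1 instead of cache_nice_tries, so the failure count stays
past the cache-nice threshold and cache hot tasks remain eligible for
migration while active balancing is in progress.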
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c | 76
1 file changed, 31 insertions(+), 45 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 03d737791c1a..41e69b5ee652 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1995,7 +1995,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 			 * We've kicked active balancing, reset the failure
 			 * counter.
 			 */
-			sd->nr_balance_failed = sd->cache_nice_tries;
+			sd->nr_balance_failed = sd->cache_nice_tries+1;
 		}
 	} else
 		sd->nr_balance_failed = 0;
@@ -2106,56 +2106,42 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
 static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
 {
 	struct sched_domain *sd;
-	struct sched_group *cpu_group;
 	runqueue_t *target_rq;
-	cpumask_t visited_cpus;
-	int cpu;
+	int target_cpu = busiest_rq->push_cpu;
+
+	if (busiest_rq->nr_running <= 1)
+		/* no task to move */
+		return;
+
+	target_rq = cpu_rq(target_cpu);
 
 	/*
-	 * Search for suitable CPUs to push tasks to in successively higher
-	 * domains with SD_LOAD_BALANCE set.
+	 * This condition is "impossible", if it occurs
+	 * we need to fix it. Originally reported by
+	 * Bjorn Helgaas on a 128-cpu setup.
 	 */
-	visited_cpus = CPU_MASK_NONE;
-	for_each_domain(busiest_cpu, sd) {
-		if (!(sd->flags & SD_LOAD_BALANCE))
-			/* no more domains to search */
-			break;
+	BUG_ON(busiest_rq == target_rq);
 
-		schedstat_inc(sd, alb_cnt);
+	/* move a task from busiest_rq to target_rq */
+	double_lock_balance(busiest_rq, target_rq);
 
-		cpu_group = sd->groups;
-		do {
-			for_each_cpu_mask(cpu, cpu_group->cpumask) {
-				if (busiest_rq->nr_running <= 1)
-					/* no more tasks left to move */
-					return;
-				if (cpu_isset(cpu, visited_cpus))
-					continue;
-				cpu_set(cpu, visited_cpus);
-				if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu)
-					continue;
-
-				target_rq = cpu_rq(cpu);
-				/*
-				 * This condition is "impossible", if it occurs
-				 * we need to fix it. Originally reported by
-				 * Bjorn Helgaas on a 128-cpu setup.
-				 */
-				BUG_ON(busiest_rq == target_rq);
-
-				/* move a task from busiest_rq to target_rq */
-				double_lock_balance(busiest_rq, target_rq);
-				if (move_tasks(target_rq, cpu, busiest_rq,
-						1, sd, SCHED_IDLE, NULL)) {
-					schedstat_inc(sd, alb_pushed);
-				} else {
-					schedstat_inc(sd, alb_failed);
-				}
-				spin_unlock(&target_rq->lock);
-			}
-			cpu_group = cpu_group->next;
-		} while (cpu_group != sd->groups);
-	}
+	/* Search for an sd spanning us and the target CPU. */
+	for_each_domain(target_cpu, sd)
+		if ((sd->flags & SD_LOAD_BALANCE) &&
+			cpu_isset(busiest_cpu, sd->span))
+				break;
+
+	if (unlikely(sd == NULL))
+		goto out;
+
+	schedstat_inc(sd, alb_cnt);
+
+	if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
+		schedstat_inc(sd, alb_pushed);
+	else
+		schedstat_inc(sd, alb_failed);
+out:
+	spin_unlock(&target_rq->lock);
 }
 
 /*