author     Nick Piggin <nickpiggin@yahoo.com.au>      2005-06-25 17:57:19 -0400
committer  Linus Torvalds <torvalds@ppc970.osdl.org>  2005-06-25 19:24:42 -0400
commit     147cbb4bbe991452698f0772d8292f22825710ba (patch)
tree       cb86550d7e440e7dfbe22b0af6d2cfc991cb76cf /kernel/sched.c
parent     cafb20c1f9976a70d633bb1e1c8c24eab00e4e80 (diff)
[PATCH] sched: balance on fork
Reimplement the balance-on-exec balancing to be sched-domains aware.  Use this
to also do balance-on-fork balancing.  Make x86_64 do balance on fork over the
NUMA domain.

The problem with the non-sched-domains-aware balancing became apparent on
dual-core, multi-socket Opterons.  What we want is for new tasks to be sent to
a different socket, but more often than not we would first load up our sibling
core, or fill two cores of a single remote socket, before selecting a new one.

This gives large improvements to STREAM on such systems.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
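For readers skimming the diff below, here is a rough, self-contained userspace sketch of the two-step placement policy the patch introduces: pick the least busy CPU group within the domain, then the least busy CPU inside that group.  It is illustrative only and is not part of the patch; the toy types, the pick_group()/pick_cpu() helpers, the group layout, the per-CPU loads, and the 125 imbalance cutoff are all invented for the example, and details such as cpu_power scaling and the cpus_allowed check are omitted.

/* Illustrative sketch only -- not kernel code. */
#include <limits.h>
#include <stdio.h>

#define NR_CPUS 8

struct toy_group {
        int cpus[NR_CPUS + 1];          /* CPU ids, terminated by -1 */
};

/* Invented per-CPU load figures: CPUs 0-1 (the "local" socket) are busy. */
static unsigned long cpu_load[NR_CPUS] = { 90, 80, 10, 15, 70, 75, 5, 0 };

/* Average load of a group; the real code also scales by group->cpu_power. */
static unsigned long group_load(const struct toy_group *g)
{
        unsigned long sum = 0;
        int i, n = 0;

        for (i = 0; g->cpus[i] >= 0; i++, n++)
                sum += cpu_load[g->cpus[i]];
        return n ? sum / n : ULONG_MAX;
}

/* Step 1: least busy group, in the spirit of find_idlest_group(). */
static const struct toy_group *pick_group(const struct toy_group *groups,
                                          int nr, int this_group, int imbalance)
{
        unsigned long min_load = ULONG_MAX, this_load = 0;
        const struct toy_group *idlest = NULL;
        int i;

        for (i = 0; i < nr; i++) {
                unsigned long load = group_load(&groups[i]);

                if (i == this_group)
                        this_load = load;
                else if (load < min_load) {
                        min_load = load;
                        idlest = &groups[i];
                }
        }
        /* Stay local unless a remote group is clearly less loaded. */
        if (!idlest || 100 * this_load < imbalance * min_load)
                return NULL;
        return idlest;
}

/* Step 2: least busy CPU in that group, in the spirit of find_idlest_cpu(). */
static int pick_cpu(const struct toy_group *g)
{
        unsigned long min_load = ULONG_MAX;
        int i, idlest = -1;

        for (i = 0; g->cpus[i] >= 0; i++)
                if (cpu_load[g->cpus[i]] < min_load) {
                        min_load = cpu_load[g->cpus[i]];
                        idlest = g->cpus[i];
                }
        return idlest;
}

int main(void)
{
        /* Four sockets with two cores each; the forking parent sits in group 0. */
        struct toy_group groups[] = {
                { { 0, 1, -1 } },
                { { 2, 3, -1 } },
                { { 4, 5, -1 } },
                { { 6, 7, -1 } },
        };
        const struct toy_group *g = pick_group(groups, 4, 0, 125);
        int cpu = g ? pick_cpu(g) : 0;  /* fall back to the local CPU */

        printf("place forked child on cpu %d\n", cpu);
        return 0;
}

With the example loads the child lands on CPU 7, the emptiest core of the least loaded remote socket, which is the behaviour the commit message asks for on dual-core, multi-socket systems.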
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  164
1 file changed, 109 insertions(+), 55 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 396724a2519f..7ecc237e2aab 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -893,6 +893,79 @@ static inline unsigned long target_load(int cpu, int type)
         return max(rq->cpu_load[type-1], load_now);
 }
 
+/*
+ * find_idlest_group finds and returns the least busy CPU group within the
+ * domain.
+ */
+static struct sched_group *
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+{
+        struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+        unsigned long min_load = ULONG_MAX, this_load = 0;
+        int load_idx = sd->forkexec_idx;
+        int imbalance = 100 + (sd->imbalance_pct-100)/2;
+
+        do {
+                unsigned long load, avg_load;
+                int local_group;
+                int i;
+
+                local_group = cpu_isset(this_cpu, group->cpumask);
+                /* XXX: put a cpus allowed check */
+
+                /* Tally up the load of all CPUs in the group */
+                avg_load = 0;
+
+                for_each_cpu_mask(i, group->cpumask) {
+                        /* Bias balancing toward cpus of our domain */
+                        if (local_group)
+                                load = source_load(i, load_idx);
+                        else
+                                load = target_load(i, load_idx);
+
+                        avg_load += load;
+                }
+
+                /* Adjust by relative CPU power of the group */
+                avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+
+                if (local_group) {
+                        this_load = avg_load;
+                        this = group;
+                } else if (avg_load < min_load) {
+                        min_load = avg_load;
+                        idlest = group;
+                }
+                group = group->next;
+        } while (group != sd->groups);
+
+        if (!idlest || 100*this_load < imbalance*min_load)
+                return NULL;
+        return idlest;
+}
+
+/*
+ * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ */
+static int find_idlest_cpu(struct sched_group *group, int this_cpu)
+{
+        unsigned long load, min_load = ULONG_MAX;
+        int idlest = -1;
+        int i;
+
+        for_each_cpu_mask(i, group->cpumask) {
+                load = source_load(i, 0);
+
+                if (load < min_load || (load == min_load && i == this_cpu)) {
+                        min_load = load;
+                        idlest = i;
+                }
+        }
+
+        return idlest;
+}
+
+
 #endif
 
 /*
@@ -1107,11 +1180,6 @@ int fastcall wake_up_state(task_t *p, unsigned int state)
         return try_to_wake_up(p, state, 0);
 }
 
-#ifdef CONFIG_SMP
-static int find_idlest_cpu(struct task_struct *p, int this_cpu,
-                        struct sched_domain *sd);
-#endif
-
 /*
  * Perform scheduler related setup for a newly forked process p.
  * p is forked by current.
@@ -1181,12 +1249,38 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
         unsigned long flags;
         int this_cpu, cpu;
         runqueue_t *rq, *this_rq;
+#ifdef CONFIG_SMP
+        struct sched_domain *tmp, *sd = NULL;
+#endif
 
         rq = task_rq_lock(p, &flags);
-        cpu = task_cpu(p);
+        BUG_ON(p->state != TASK_RUNNING);
         this_cpu = smp_processor_id();
+        cpu = task_cpu(p);
 
-        BUG_ON(p->state != TASK_RUNNING);
+#ifdef CONFIG_SMP
+        for_each_domain(cpu, tmp)
+                if (tmp->flags & SD_BALANCE_FORK)
+                        sd = tmp;
+
+        if (sd) {
+                struct sched_group *group;
+
+                cpu = task_cpu(p);
+                group = find_idlest_group(sd, p, cpu);
+                if (group) {
+                        int new_cpu;
+                        new_cpu = find_idlest_cpu(group, cpu);
+                        if (new_cpu != -1 && new_cpu != cpu &&
+                                        cpu_isset(new_cpu, p->cpus_allowed)) {
+                                set_task_cpu(p, new_cpu);
+                                task_rq_unlock(rq, &flags);
+                                rq = task_rq_lock(p, &flags);
+                                cpu = task_cpu(p);
+                        }
+                }
+        }
+#endif
 
         /*
          * We decrease the sleep average of forking parents
@@ -1481,51 +1575,6 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
 }
 
 /*
- * find_idlest_cpu - find the least busy runqueue.
- */
-static int find_idlest_cpu(struct task_struct *p, int this_cpu,
-                        struct sched_domain *sd)
-{
-        unsigned long load, min_load, this_load;
-        int i, min_cpu;
-        cpumask_t mask;
-
-        min_cpu = UINT_MAX;
-        min_load = ULONG_MAX;
-
-        cpus_and(mask, sd->span, p->cpus_allowed);
-
-        for_each_cpu_mask(i, mask) {
-                load = target_load(i, sd->wake_idx);
-
-                if (load < min_load) {
-                        min_cpu = i;
-                        min_load = load;
-
-                        /* break out early on an idle CPU: */
-                        if (!min_load)
-                                break;
-                }
-        }
-
-        /* add +1 to account for the new task */
-        this_load = source_load(this_cpu, sd->wake_idx) + SCHED_LOAD_SCALE;
-
-        /*
-         * Would with the addition of the new task to the
-         * current CPU there be an imbalance between this
-         * CPU and the idlest CPU?
-         *
-         * Use half of the balancing threshold - new-context is
-         * a good opportunity to balance.
-         */
-        if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
-                return min_cpu;
-
-        return this_cpu;
-}
-
-/*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
  * allow dest_cpu, which will force the cpu onto dest_cpu.  Then
@@ -1578,8 +1627,15 @@ void sched_exec(void)
                         sd = tmp;
 
         if (sd) {
+                struct sched_group *group;
                 schedstat_inc(sd, sbe_attempts);
-                new_cpu = find_idlest_cpu(current, this_cpu, sd);
+                group = find_idlest_group(sd, current, this_cpu);
+                if (!group)
+                        goto out;
+                new_cpu = find_idlest_cpu(group, this_cpu);
+                if (new_cpu == -1)
+                        goto out;
+
                 if (new_cpu != this_cpu) {
                         schedstat_inc(sd, sbe_pushed);
                         put_cpu();
@@ -1792,12 +1848,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                 if (local_group) {
                         this_load = avg_load;
                         this = group;
-                        goto nextgroup;
                 } else if (avg_load > max_load) {
                         max_load = avg_load;
                         busiest = group;
                 }
-nextgroup:
                 group = group->next;
         } while (group != sd->groups);
 