aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorNick Piggin <nickpiggin@yahoo.com.au>2005-06-25 17:57:19 -0400
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-06-25 19:24:42 -0400
commit147cbb4bbe991452698f0772d8292f22825710ba (patch)
treecb86550d7e440e7dfbe22b0af6d2cfc991cb76cf /kernel
parentcafb20c1f9976a70d633bb1e1c8c24eab00e4e80 (diff)
[PATCH] sched: balance on fork
Reimplement the balance-on-exec balancing to be sched-domains aware. Use this to also do balance-on-fork balancing. Make x86_64 do balance on fork over the NUMA domain. The problem with the non-sched-domains-aware balancing became apparent on dual-core, multi-socket Opterons. What we want is for the new tasks to be sent to a different socket, but more often than not, we would first load up our sibling core, or fill two cores of a single remote socket before selecting a new one. This gives large improvements to STREAM on such systems. Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/sched.c164
1 files changed, 109 insertions, 55 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 396724a2519f..7ecc237e2aab 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -893,6 +893,79 @@ static inline unsigned long target_load(int cpu, int type)
893 return max(rq->cpu_load[type-1], load_now); 893 return max(rq->cpu_load[type-1], load_now);
894} 894}
895 895
896/*
897 * find_idlest_group finds and returns the least busy CPU group within the
898 * domain.
899 */
900static struct sched_group *
901find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
902{
903 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
904 unsigned long min_load = ULONG_MAX, this_load = 0;
905 int load_idx = sd->forkexec_idx;
906 int imbalance = 100 + (sd->imbalance_pct-100)/2;
907
908 do {
909 unsigned long load, avg_load;
910 int local_group;
911 int i;
912
913 local_group = cpu_isset(this_cpu, group->cpumask);
914 /* XXX: put a cpus allowed check */
915
916 /* Tally up the load of all CPUs in the group */
917 avg_load = 0;
918
919 for_each_cpu_mask(i, group->cpumask) {
920 /* Bias balancing toward cpus of our domain */
921 if (local_group)
922 load = source_load(i, load_idx);
923 else
924 load = target_load(i, load_idx);
925
926 avg_load += load;
927 }
928
929 /* Adjust by relative CPU power of the group */
930 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
931
932 if (local_group) {
933 this_load = avg_load;
934 this = group;
935 } else if (avg_load < min_load) {
936 min_load = avg_load;
937 idlest = group;
938 }
939 group = group->next;
940 } while (group != sd->groups);
941
942 if (!idlest || 100*this_load < imbalance*min_load)
943 return NULL;
944 return idlest;
945}
946
947/*
948 * find_idlest_cpu - find the idlest cpu among the cpus in group.
949 */
950static int find_idlest_cpu(struct sched_group *group, int this_cpu)
951{
952 unsigned long load, min_load = ULONG_MAX;
953 int idlest = -1;
954 int i;
955
956 for_each_cpu_mask(i, group->cpumask) {
957 load = source_load(i, 0);
958
959 if (load < min_load || (load == min_load && i == this_cpu)) {
960 min_load = load;
961 idlest = i;
962 }
963 }
964
965 return idlest;
966}
967
968
896#endif 969#endif
897 970
898/* 971/*
@@ -1107,11 +1180,6 @@ int fastcall wake_up_state(task_t *p, unsigned int state)
1107 return try_to_wake_up(p, state, 0); 1180 return try_to_wake_up(p, state, 0);
1108} 1181}
1109 1182
1110#ifdef CONFIG_SMP
1111static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1112 struct sched_domain *sd);
1113#endif
1114
1115/* 1183/*
1116 * Perform scheduler related setup for a newly forked process p. 1184 * Perform scheduler related setup for a newly forked process p.
1117 * p is forked by current. 1185 * p is forked by current.
@@ -1181,12 +1249,38 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
1181 unsigned long flags; 1249 unsigned long flags;
1182 int this_cpu, cpu; 1250 int this_cpu, cpu;
1183 runqueue_t *rq, *this_rq; 1251 runqueue_t *rq, *this_rq;
1252#ifdef CONFIG_SMP
1253 struct sched_domain *tmp, *sd = NULL;
1254#endif
1184 1255
1185 rq = task_rq_lock(p, &flags); 1256 rq = task_rq_lock(p, &flags);
1186 cpu = task_cpu(p); 1257 BUG_ON(p->state != TASK_RUNNING);
1187 this_cpu = smp_processor_id(); 1258 this_cpu = smp_processor_id();
1259 cpu = task_cpu(p);
1188 1260
1189 BUG_ON(p->state != TASK_RUNNING); 1261#ifdef CONFIG_SMP
1262 for_each_domain(cpu, tmp)
1263 if (tmp->flags & SD_BALANCE_FORK)
1264 sd = tmp;
1265
1266 if (sd) {
1267 struct sched_group *group;
1268
1269 cpu = task_cpu(p);
1270 group = find_idlest_group(sd, p, cpu);
1271 if (group) {
1272 int new_cpu;
1273 new_cpu = find_idlest_cpu(group, cpu);
1274 if (new_cpu != -1 && new_cpu != cpu &&
1275 cpu_isset(new_cpu, p->cpus_allowed)) {
1276 set_task_cpu(p, new_cpu);
1277 task_rq_unlock(rq, &flags);
1278 rq = task_rq_lock(p, &flags);
1279 cpu = task_cpu(p);
1280 }
1281 }
1282 }
1283#endif
1190 1284
1191 /* 1285 /*
1192 * We decrease the sleep average of forking parents 1286 * We decrease the sleep average of forking parents
@@ -1481,51 +1575,6 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1481} 1575}
1482 1576
1483/* 1577/*
1484 * find_idlest_cpu - find the least busy runqueue.
1485 */
1486static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1487 struct sched_domain *sd)
1488{
1489 unsigned long load, min_load, this_load;
1490 int i, min_cpu;
1491 cpumask_t mask;
1492
1493 min_cpu = UINT_MAX;
1494 min_load = ULONG_MAX;
1495
1496 cpus_and(mask, sd->span, p->cpus_allowed);
1497
1498 for_each_cpu_mask(i, mask) {
1499 load = target_load(i, sd->wake_idx);
1500
1501 if (load < min_load) {
1502 min_cpu = i;
1503 min_load = load;
1504
1505 /* break out early on an idle CPU: */
1506 if (!min_load)
1507 break;
1508 }
1509 }
1510
1511 /* add +1 to account for the new task */
1512 this_load = source_load(this_cpu, sd->wake_idx) + SCHED_LOAD_SCALE;
1513
1514 /*
1515 * Would with the addition of the new task to the
1516 * current CPU there be an imbalance between this
1517 * CPU and the idlest CPU?
1518 *
1519 * Use half of the balancing threshold - new-context is
1520 * a good opportunity to balance.
1521 */
1522 if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
1523 return min_cpu;
1524
1525 return this_cpu;
1526}
1527
1528/*
1529 * If dest_cpu is allowed for this process, migrate the task to it. 1578 * If dest_cpu is allowed for this process, migrate the task to it.
1530 * This is accomplished by forcing the cpu_allowed mask to only 1579 * This is accomplished by forcing the cpu_allowed mask to only
1531 * allow dest_cpu, which will force the cpu onto dest_cpu. Then 1580 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@ -1578,8 +1627,15 @@ void sched_exec(void)
1578 sd = tmp; 1627 sd = tmp;
1579 1628
1580 if (sd) { 1629 if (sd) {
1630 struct sched_group *group;
1581 schedstat_inc(sd, sbe_attempts); 1631 schedstat_inc(sd, sbe_attempts);
1582 new_cpu = find_idlest_cpu(current, this_cpu, sd); 1632 group = find_idlest_group(sd, current, this_cpu);
1633 if (!group)
1634 goto out;
1635 new_cpu = find_idlest_cpu(group, this_cpu);
1636 if (new_cpu == -1)
1637 goto out;
1638
1583 if (new_cpu != this_cpu) { 1639 if (new_cpu != this_cpu) {
1584 schedstat_inc(sd, sbe_pushed); 1640 schedstat_inc(sd, sbe_pushed);
1585 put_cpu(); 1641 put_cpu();
@@ -1792,12 +1848,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1792 if (local_group) { 1848 if (local_group) {
1793 this_load = avg_load; 1849 this_load = avg_load;
1794 this = group; 1850 this = group;
1795 goto nextgroup;
1796 } else if (avg_load > max_load) { 1851 } else if (avg_load > max_load) {
1797 max_load = avg_load; 1852 max_load = avg_load;
1798 busiest = group; 1853 busiest = group;
1799 } 1854 }
1800nextgroup:
1801 group = group->next; 1855 group = group->next;
1802 } while (group != sd->groups); 1856 } while (group != sd->groups);
1803 1857