author     Nick Piggin <nickpiggin@yahoo.com.au>      2005-06-25 17:57:19 -0400
committer  Linus Torvalds <torvalds@ppc970.osdl.org>  2005-06-25 19:24:42 -0400
commit     147cbb4bbe991452698f0772d8292f22825710ba (patch)
tree       cb86550d7e440e7dfbe22b0af6d2cfc991cb76cf /kernel/sched.c
parent     cafb20c1f9976a70d633bb1e1c8c24eab00e4e80 (diff)
[PATCH] sched: balance on fork
Reimplement balance-on-exec balancing to be sched-domains aware. Use this
to also do balance-on-fork balancing. Make x86_64 balance on fork over the
NUMA domain.
The problem with the old, non-sched-domains-aware balancing became apparent
on dual-core, multi-socket Opterons. What we want is for new tasks to be
sent to a different socket, but more often than not we would first load up
our sibling core, or fill two cores of a single remote socket, before
selecting a new one.
This gives large improvements to STREAM on such systems.
Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
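
The shape of the change: at fork (and exec) time the scheduler now makes a two-step choice, first the least busy CPU group (e.g. a socket) within the highest domain that has the relevant balance flag set, then the least busy CPU inside that group, and it only leaves the local group when the local group is clearly busier. The toy program below is not from the patch; its topology, load numbers and the imbalance_pct of 125 are made-up illustration values, but the selection mirrors the new find_idlest_group()/find_idlest_cpu() pair from the diff on a two-socket, two-cores-per-socket machine like the Opterons mentioned above.

/*
 * Toy model of the balance-on-fork decision: pick the least-loaded group
 * (socket) first, then the least-loaded CPU inside it.  Topology and load
 * values are invented for illustration; the real code walks struct
 * sched_group lists, uses scaled rq->cpu_load[] averages and also divides
 * the group load by group->cpu_power.
 */
#include <stdio.h>
#include <limits.h>

#define NR_CPUS   4
#define NR_GROUPS 2                     /* two sockets, two cores each */

static const int group_cpus[NR_GROUPS][2] = { { 0, 1 }, { 2, 3 } };
/* CPU 0 runs the forking parent and is busy; the rest are nearly idle */
static const unsigned long cpu_load[NR_CPUS] = { 1024, 128, 0, 0 };

/* least busy group, or -1 to stay local (mirrors find_idlest_group()) */
static int pick_group(int this_cpu)
{
        unsigned long min_load = ULONG_MAX, this_load = 0;
        int imbalance = 100 + (125 - 100) / 2;  /* imbalance_pct 125 -> 112 */
        int idlest = -1;

        for (int g = 0; g < NR_GROUPS; g++) {
                unsigned long avg_load = 0;
                int local = 0;

                for (int i = 0; i < 2; i++) {
                        if (group_cpus[g][i] == this_cpu)
                                local = 1;
                        avg_load += cpu_load[group_cpus[g][i]];
                }
                if (local)
                        this_load = avg_load;
                else if (avg_load < min_load) {
                        min_load = avg_load;
                        idlest = g;
                }
        }
        /* only leave the local group if it is clearly busier */
        if (idlest < 0 || 100 * this_load < (unsigned long)imbalance * min_load)
                return -1;
        return idlest;
}

/* least busy CPU in the chosen group (mirrors find_idlest_cpu()) */
static int pick_cpu(int group, int this_cpu)
{
        unsigned long min_load = ULONG_MAX;
        int idlest = -1;

        for (int i = 0; i < 2; i++) {
                int cpu = group_cpus[group][i];

                if (cpu_load[cpu] < min_load ||
                    (cpu_load[cpu] == min_load && cpu == this_cpu)) {
                        min_load = cpu_load[cpu];
                        idlest = cpu;
                }
        }
        return idlest;
}

int main(void)
{
        int this_cpu = 0;               /* parent forks on CPU 0 */
        int group = pick_group(this_cpu);

        if (group < 0)
                printf("child stays on CPU %d\n", this_cpu);
        else
                printf("child goes to CPU %d\n", pick_cpu(group, this_cpu));
        return 0;
}

With CPU 0 loaded, this prints that the child goes to CPU 2, i.e. the empty remote socket, rather than to the sibling core CPU 1, which is the behaviour the patch is after.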
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c | 164
1 file changed, 109 insertions(+), 55 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 396724a2519f..7ecc237e2aab 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -893,6 +893,79 @@ static inline unsigned long target_load(int cpu, int type)
         return max(rq->cpu_load[type-1], load_now);
 }
 
+/*
+ * find_idlest_group finds and returns the least busy CPU group within the
+ * domain.
+ */
+static struct sched_group *
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+{
+        struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+        unsigned long min_load = ULONG_MAX, this_load = 0;
+        int load_idx = sd->forkexec_idx;
+        int imbalance = 100 + (sd->imbalance_pct-100)/2;
+
+        do {
+                unsigned long load, avg_load;
+                int local_group;
+                int i;
+
+                local_group = cpu_isset(this_cpu, group->cpumask);
+                /* XXX: put a cpus allowed check */
+
+                /* Tally up the load of all CPUs in the group */
+                avg_load = 0;
+
+                for_each_cpu_mask(i, group->cpumask) {
+                        /* Bias balancing toward cpus of our domain */
+                        if (local_group)
+                                load = source_load(i, load_idx);
+                        else
+                                load = target_load(i, load_idx);
+
+                        avg_load += load;
+                }
+
+                /* Adjust by relative CPU power of the group */
+                avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+
+                if (local_group) {
+                        this_load = avg_load;
+                        this = group;
+                } else if (avg_load < min_load) {
+                        min_load = avg_load;
+                        idlest = group;
+                }
+                group = group->next;
+        } while (group != sd->groups);
+
+        if (!idlest || 100*this_load < imbalance*min_load)
+                return NULL;
+        return idlest;
+}
+
+/*
+ * find_idlest_queue - find the idlest runqueue among the cpus in group.
+ */
+static int find_idlest_cpu(struct sched_group *group, int this_cpu)
+{
+        unsigned long load, min_load = ULONG_MAX;
+        int idlest = -1;
+        int i;
+
+        for_each_cpu_mask(i, group->cpumask) {
+                load = source_load(i, 0);
+
+                if (load < min_load || (load == min_load && i == this_cpu)) {
+                        min_load = load;
+                        idlest = i;
+                }
+        }
+
+        return idlest;
+}
+
+
 #endif
 
 /*
@@ -1107,11 +1180,6 @@ int fastcall wake_up_state(task_t *p, unsigned int state)
         return try_to_wake_up(p, state, 0);
 }
 
-#ifdef CONFIG_SMP
-static int find_idlest_cpu(struct task_struct *p, int this_cpu,
-                           struct sched_domain *sd);
-#endif
-
 /*
  * Perform scheduler related setup for a newly forked process p.
  * p is forked by current.
@@ -1181,12 +1249,38 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
         unsigned long flags;
         int this_cpu, cpu;
         runqueue_t *rq, *this_rq;
+#ifdef CONFIG_SMP
+        struct sched_domain *tmp, *sd = NULL;
+#endif
 
         rq = task_rq_lock(p, &flags);
-        cpu = task_cpu(p);
+        BUG_ON(p->state != TASK_RUNNING);
         this_cpu = smp_processor_id();
+        cpu = task_cpu(p);
 
-        BUG_ON(p->state != TASK_RUNNING);
+#ifdef CONFIG_SMP
+        for_each_domain(cpu, tmp)
+                if (tmp->flags & SD_BALANCE_FORK)
+                        sd = tmp;
+
+        if (sd) {
+                struct sched_group *group;
+
+                cpu = task_cpu(p);
+                group = find_idlest_group(sd, p, cpu);
+                if (group) {
+                        int new_cpu;
+                        new_cpu = find_idlest_cpu(group, cpu);
+                        if (new_cpu != -1 && new_cpu != cpu &&
+                                        cpu_isset(new_cpu, p->cpus_allowed)) {
+                                set_task_cpu(p, new_cpu);
+                                task_rq_unlock(rq, &flags);
+                                rq = task_rq_lock(p, &flags);
+                                cpu = task_cpu(p);
+                        }
+                }
+        }
+#endif
 
         /*
          * We decrease the sleep average of forking parents
@@ -1481,51 +1575,6 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
 }
 
 /*
- * find_idlest_cpu - find the least busy runqueue.
- */
-static int find_idlest_cpu(struct task_struct *p, int this_cpu,
-                           struct sched_domain *sd)
-{
-        unsigned long load, min_load, this_load;
-        int i, min_cpu;
-        cpumask_t mask;
-
-        min_cpu = UINT_MAX;
-        min_load = ULONG_MAX;
-
-        cpus_and(mask, sd->span, p->cpus_allowed);
-
-        for_each_cpu_mask(i, mask) {
-                load = target_load(i, sd->wake_idx);
-
-                if (load < min_load) {
-                        min_cpu = i;
-                        min_load = load;
-
-                        /* break out early on an idle CPU: */
-                        if (!min_load)
-                                break;
-                }
-        }
-
-        /* add +1 to account for the new task */
-        this_load = source_load(this_cpu, sd->wake_idx) + SCHED_LOAD_SCALE;
-
-        /*
-         * Would with the addition of the new task to the
-         * current CPU there be an imbalance between this
-         * CPU and the idlest CPU?
-         *
-         * Use half of the balancing threshold - new-context is
-         * a good opportunity to balance.
-         */
-        if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
-                return min_cpu;
-
-        return this_cpu;
-}
-
-/*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
  * allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@ -1578,8 +1627,15 @@ void sched_exec(void)
                 sd = tmp;
 
         if (sd) {
+                struct sched_group *group;
                 schedstat_inc(sd, sbe_attempts);
-                new_cpu = find_idlest_cpu(current, this_cpu, sd);
+                group = find_idlest_group(sd, current, this_cpu);
+                if (!group)
+                        goto out;
+                new_cpu = find_idlest_cpu(group, this_cpu);
+                if (new_cpu == -1)
+                        goto out;
+
                 if (new_cpu != this_cpu) {
                         schedstat_inc(sd, sbe_pushed);
                         put_cpu();
@@ -1792,12 +1848,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                 if (local_group) {
                         this_load = avg_load;
                         this = group;
-                        goto nextgroup;
                 } else if (avg_load > max_load) {
                         max_load = avg_load;
                         busiest = group;
                 }
-nextgroup:
                 group = group->next;
         } while (group != sd->groups);
 
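
One closing note on the cutoff in the new find_idlest_group() above: imbalance = 100 + (sd->imbalance_pct-100)/2 is half of the usual balancing margin, in line with the removed comment that a new context is a good opportunity to balance. Taking imbalance_pct = 125 purely as an illustrative value:

imbalance = 100 + (125 - 100)/2 = 112
stay local (return NULL) while 100*this_load < 112*min_load
move the new task only once this_load >= 1.12*min_load, i.e. once the local
group is at least about 12% busier than the idlest group.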