author     Eric Dumazet <dada1@cosmosbay.com>        2007-05-08 03:32:57 -0400
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>        2007-05-08 14:15:17 -0400
commit     5517d86bea237c1d7078840182d9ebc0fe4c1afc
tree       67f1999895313878bfa904c66dffb7066f3c8d91
parent     46cb4b7c88fa5517f64b5bee42939ea3614cddcb
Speed up divides by cpu_power in scheduler
I noticed expensive divides done in try_to_wake_up() and
find_busiest_group() on a machine with two dual-core Opterons (4 cores total),
moderately loaded (15,000 context switches per second).

oprofile numbers:
CPU: AMD64 processors, speed 2600.05 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Cycles outside of halt state) with a unit
mask of 0x00 (No unit mask) count 50000
samples % symbol name
...
613914 1.0498 try_to_wake_up
834 0.0013 :ffffffff80227ae1: div %rcx
77513 0.1191 :ffffffff80227ae4: mov %rax,%r11
608893 1.0413 find_busiest_group
1841 0.0031 :ffffffff802260bf: div %rdi
140109 0.2394 :ffffffff802260c2: test %sil,%sil
Some of these divides can use the reciprocal-divide technique we introduced
some time ago (currently used in slab, AFAIK).

We can assume a load fits in a 32-bit number: even with SCHED_LOAD_SCALE = 128,
that still leaves a theoretical limit of 2^32 / 128 = 33554432.

When/if we reach this limit one day, CPUs will probably have a fast hardware
divide and we can drop the reciprocal-divide trick.
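For reference, here is a minimal standalone sketch of the reciprocal-divide
trick, modeled on the include/linux/reciprocal_div.h helpers this patch starts
using; the userspace wrapper and the example numbers are illustrative, not
kernel code:

/*
 * Sketch of the reciprocal-divide idea: precompute R ~= 2^32 / B once,
 * then A / B becomes a 64-bit multiply plus a shift instead of a hardware
 * divide.  The result is accurate for the operand ranges used here (the
 * patch assumes loads fit in 32 bits); it is not guaranteed exact for
 * arbitrary A and B.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t reciprocal_value(uint32_t b)
{
	return (uint32_t)((((uint64_t)1 << 32) + b - 1) / b);
}

static uint32_t reciprocal_divide(uint32_t a, uint32_t r)
{
	return (uint32_t)(((uint64_t)a * r) >> 32);
}

int main(void)
{
	/* e.g. a two-CPU group with SCHED_LOAD_SCALE = 128 */
	uint32_t cpu_power = 2 * 128;
	uint32_t r = reciprocal_value(cpu_power);	/* computed once, at setup */
	uint32_t load = 1000 * 128;

	printf("exact: %u  reciprocal: %u\n",
	       load / cpu_power, reciprocal_divide(load, r));
	return 0;
}

Both values print 500 here: the multiply-and-shift gives the same answer as
the div instructions showing up in the profile above.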
Ingo suggested renaming cpu_power to __cpu_power to make it clear that it must
not be modified without updating its reciprocal value too.

I did not convert the divide in cpu_avg_load_per_task(), because tracking
nr_running changes there may not be worth it: we could use a static table of 32
reciprocal values, but that would add a conditional branch and a table lookup.
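Purely for illustration, that rejected alternative might look roughly like the
sketch below; it reuses reciprocal_value()/reciprocal_divide() from the sketch
above, and the table size of 32 and the fallback divide are assumptions taken
from the sentence above, not code from this patch:

/*
 * Hypothetical sketch of the rejected alternative: a static table of
 * reciprocals for nr_running = 1..32, filled once at startup, plus the
 * extra conditional branch and table lookup mentioned above.
 */
static uint32_t nr_running_reciprocal[33];

static void init_nr_running_reciprocals(void)
{
	uint32_t i;

	for (i = 1; i <= 32; i++)
		nr_running_reciprocal[i] = reciprocal_value(i);
}

static uint32_t avg_load_per_task_sketch(uint32_t load, uint32_t nr_running)
{
	if (nr_running >= 1 && nr_running <= 32)	/* branch + table lookup */
		return reciprocal_divide(load, nr_running_reciprocal[nr_running]);

	return nr_running ? load / nr_running : 0;	/* fall back to a real divide */
}

Whether the extra branch and table are cheaper than one div is exactly the
trade-off left open above.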
[akpm@linux-foundation.org: !SMP build fix]
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
 include/linux/sched.h |  8
 kernel/sched.c        | 83
 2 files changed, 61 insertions(+), 30 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 15ab3e039535..3d95c480f58d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -680,8 +680,14 @@ struct sched_group {
 	/*
 	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
 	 * single CPU. This is read only (except for setup, hotplug CPU).
+	 * Note : Never change cpu_power without recompute its reciprocal
 	 */
-	unsigned long cpu_power;
+	unsigned int __cpu_power;
+	/*
+	 * reciprocal value of cpu_power to avoid expensive divides
+	 * (see include/linux/reciprocal_div.h)
+	 */
+	u32 reciprocal_cpu_power;
 };
 
 struct sched_domain {
diff --git a/kernel/sched.c b/kernel/sched.c
index 74599286230c..e4a5888549a5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -52,8 +52,9 @@
 #include <linux/tsacct_kern.h>
 #include <linux/kprobes.h>
 #include <linux/delayacct.h>
-#include <asm/tlb.h>
+#include <linux/reciprocal_div.h>
 
+#include <asm/tlb.h>
 #include <asm/unistd.h>
 
 /*
@@ -181,6 +182,27 @@ static unsigned int static_prio_timeslice(int static_prio)
 	return SCALE_PRIO(DEF_TIMESLICE, static_prio);
 }
 
+#ifdef CONFIG_SMP
+/*
+ * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
+ * Since cpu_power is a 'constant', we can use a reciprocal divide.
+ */
+static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
+{
+	return reciprocal_divide(load, sg->reciprocal_cpu_power);
+}
+
+/*
+ * Each time a sched group cpu_power is changed,
+ * we must compute its reciprocal value
+ */
+static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
+{
+	sg->__cpu_power += val;
+	sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
+}
+#endif
+
 /*
  * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
  * to time slice values: [800ms ... 100ms ... 5ms]
@@ -1256,7 +1278,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 		}
 
 		/* Adjust by relative CPU power of the group */
-		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+		avg_load = sg_div_cpu_power(group,
+				avg_load * SCHED_LOAD_SCALE);
 
 		if (local_group) {
 			this_load = avg_load;
@@ -2367,12 +2390,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		}
 
 		total_load += avg_load;
-		total_pwr += group->cpu_power;
+		total_pwr += group->__cpu_power;
 
 		/* Adjust by relative CPU power of the group */
-		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+		avg_load = sg_div_cpu_power(group,
+				avg_load * SCHED_LOAD_SCALE);
 
-		group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
+		group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
 
 		if (local_group) {
 			this_load = avg_load;
@@ -2483,8 +2507,8 @@ group_next:
 	max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
 
 	/* How much load to actually move to equalise the imbalance */
-	*imbalance = min(max_pull * busiest->cpu_power,
-			(avg_load - this_load) * this->cpu_power)
+	*imbalance = min(max_pull * busiest->__cpu_power,
+			(avg_load - this_load) * this->__cpu_power)
 			/ SCHED_LOAD_SCALE;
 
 	/*
@@ -2518,28 +2542,29 @@ small_imbalance:
 		 * moving them.
 		 */
 
-		pwr_now += busiest->cpu_power *
+		pwr_now += busiest->__cpu_power *
 				min(busiest_load_per_task, max_load);
-		pwr_now += this->cpu_power *
+		pwr_now += this->__cpu_power *
 				min(this_load_per_task, this_load);
 		pwr_now /= SCHED_LOAD_SCALE;
 
 		/* Amount of load we'd subtract */
-		tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
-			busiest->cpu_power;
+		tmp = sg_div_cpu_power(busiest,
+				busiest_load_per_task * SCHED_LOAD_SCALE);
 		if (max_load > tmp)
-			pwr_move += busiest->cpu_power *
+			pwr_move += busiest->__cpu_power *
 				min(busiest_load_per_task, max_load - tmp);
 
 		/* Amount of load we'd add */
-		if (max_load * busiest->cpu_power <
+		if (max_load * busiest->__cpu_power <
 				busiest_load_per_task * SCHED_LOAD_SCALE)
-			tmp = max_load * busiest->cpu_power / this->cpu_power;
+			tmp = sg_div_cpu_power(this,
+					max_load * busiest->__cpu_power);
 		else
-			tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
-				this->cpu_power;
-		pwr_move += this->cpu_power *
+			tmp = sg_div_cpu_power(this,
+				busiest_load_per_task * SCHED_LOAD_SCALE);
+		pwr_move += this->__cpu_power *
 				min(this_load_per_task, this_load + tmp);
 		pwr_move /= SCHED_LOAD_SCALE;
 
 		/* Move if we gain throughput */
@@ -5501,7 +5526,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 			break;
 		}
 
-		if (!group->cpu_power) {
+		if (!group->__cpu_power) {
 			printk("\n");
 			printk(KERN_ERR "ERROR: domain->cpu_power not "
 					"set\n");
@@ -5678,7 +5703,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
 			continue;
 
 		sg->cpumask = CPU_MASK_NONE;
-		sg->cpu_power = 0;
+		sg->__cpu_power = 0;
 
 		for_each_cpu_mask(j, span) {
 			if (group_fn(j, cpu_map, NULL) != group)
@@ -6367,7 +6392,7 @@ next_sg:
 				continue;
 			}
 
-			sg->cpu_power += sd->groups->cpu_power;
+			sg_inc_cpu_power(sg, sd->groups->__cpu_power);
 		}
 		sg = sg->next;
 		if (sg != group_head)
@@ -6442,6 +6467,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 
 	child = sd->child;
 
+	sd->groups->__cpu_power = 0;
+
 	/*
 	 * For perf policy, if the groups in child domain share resources
 	 * (for example cores sharing some portions of the cache hierarchy
@@ -6452,18 +6479,16 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 	if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
 		       (child->flags &
 			(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
-		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+		sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
 		return;
 	}
 
-	sd->groups->cpu_power = 0;
-
 	/*
 	 * add cpu_power of each child group to this groups cpu_power
 	 */
 	group = child->groups;
 	do {
-		sd->groups->cpu_power += group->cpu_power;
+		sg_inc_cpu_power(sd->groups, group->__cpu_power);
 		group = group->next;
 	} while (group != child->groups);
 }
@@ -6623,7 +6648,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 			sd = &per_cpu(node_domains, j);
 			sd->groups = sg;
 		}
-		sg->cpu_power = 0;
+		sg->__cpu_power = 0;
 		sg->cpumask = nodemask;
 		sg->next = sg;
 		cpus_or(covered, covered, nodemask);
@@ -6651,7 +6676,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 					"Can not alloc domain group for node %d\n", j);
 				goto error;
 			}
-			sg->cpu_power = 0;
+			sg->__cpu_power = 0;
 			sg->cpumask = tmp;
 			sg->next = prev->next;
 			cpus_or(covered, covered, tmp);