author    Eric Dumazet <dada1@cosmosbay.com>  2007-05-08 03:32:57 -0400
committer Linus Torvalds <torvalds@woody.linux-foundation.org>  2007-05-08 14:15:17 -0400
commit    5517d86bea237c1d7078840182d9ebc0fe4c1afc
tree      67f1999895313878bfa904c66dffb7066f3c8d91
parent    46cb4b7c88fa5517f64b5bee42939ea3614cddcb
Speed up divides by cpu_power in scheduler
I noticed expensive divides done in try_to_wake_up() and
find_busiest_group() on a dual-socket, dual-core Opteron machine (4 cores
total), moderately loaded (about 15,000 context switches per second).

oprofile numbers:

CPU: AMD64 processors, speed 2600.05 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Cycles outside of halt state) with a unit
mask of 0x00 (No unit mask) count 50000
samples  %        symbol name
...
613914   1.0498   try_to_wake_up
   834   0.0013   :ffffffff80227ae1:   div    %rcx
 77513   0.1191   :ffffffff80227ae4:   mov    %rax,%r11
608893   1.0413   find_busiest_group
  1841   0.0031   :ffffffff802260bf:   div    %rdi
140109   0.2394   :ffffffff802260c2:   test   %sil,%sil

Some of these divides can use the reciprocal divides we introduced some
time ago (currently used in slab, AFAIK).

We can assume a load fits in a 32-bit number, because with
SCHED_LOAD_SCALE = 128 the theoretical limit is still 33554432.  When/if we
reach this limit one day, CPUs will probably have a fast hardware divide
and we can drop the reciprocal-divide trick.

Ingo suggested renaming cpu_power to __cpu_power to make it clear that it
should not be modified without updating its reciprocal value too.

I did not convert the divide in cpu_avg_load_per_task(), because tracking
nr_running changes there may not be worth it.  We could use a static table
of 32 reciprocal values, but that would add a conditional branch and a
table lookup.

[akpm@linux-foundation.org: !SMP build fix]
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
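For reference, a minimal user-space sketch of the reciprocal-divide trick
relied on above: precompute a 32-bit reciprocal of the divisor once, then
replace each divide by a multiply and a shift.  This is modeled loosely on
include/linux/reciprocal_div.h (the in-kernel reciprocal_value() uses
do_div() rather than a plain 64-bit division), so treat it as an
illustration under those assumptions, not the kernel code itself.

/* Standalone sketch of the reciprocal-divide trick (not kernel code). */
#include <assert.h>
#include <stdint.h>

/* Precompute R = ceil(2^32 / k); valid for k > 0. */
static uint32_t reciprocal_value(uint32_t k)
{
	uint64_t val = (1ULL << 32) + (k - 1);
	return (uint32_t)(val / k);
}

/* Approximate a / k as (a * R) >> 32: one multiply instead of a divide. */
static uint32_t reciprocal_divide(uint32_t a, uint32_t r)
{
	return (uint32_t)(((uint64_t)a * r) >> 32);
}

int main(void)
{
	uint32_t cpu_power = 128;		/* e.g. SCHED_LOAD_SCALE */
	uint32_t r = reciprocal_value(cpu_power);
	uint32_t load = 33554432;		/* the theoretical limit quoted above */

	/*
	 * Exact here because cpu_power is a power of two; for other divisors
	 * the result may be off by one, which is acceptable for load balancing.
	 */
	assert(reciprocal_divide(load, r) == load / cpu_power);
	return 0;
}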
 include/linux/sched.h |  8
 kernel/sched.c        | 83
 2 files changed, 61 insertions(+), 30 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 15ab3e039535..3d95c480f58d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -680,8 +680,14 @@ struct sched_group {
 	/*
 	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
 	 * single CPU. This is read only (except for setup, hotplug CPU).
+	 * Note : Never change cpu_power without recompute its reciprocal
 	 */
-	unsigned long cpu_power;
+	unsigned int __cpu_power;
+	/*
+	 * reciprocal value of cpu_power to avoid expensive divides
+	 * (see include/linux/reciprocal_div.h)
+	 */
+	u32 reciprocal_cpu_power;
 };
 
 struct sched_domain {
diff --git a/kernel/sched.c b/kernel/sched.c
index 74599286230c..e4a5888549a5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -52,8 +52,9 @@
 #include <linux/tsacct_kern.h>
 #include <linux/kprobes.h>
 #include <linux/delayacct.h>
-#include <asm/tlb.h>
+#include <linux/reciprocal_div.h>
 
+#include <asm/tlb.h>
 #include <asm/unistd.h>
 
 /*
@@ -181,6 +182,27 @@ static unsigned int static_prio_timeslice(int static_prio)
 	return SCALE_PRIO(DEF_TIMESLICE, static_prio);
 }
 
+#ifdef CONFIG_SMP
+/*
+ * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
+ * Since cpu_power is a 'constant', we can use a reciprocal divide.
+ */
+static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
+{
+	return reciprocal_divide(load, sg->reciprocal_cpu_power);
+}
+
+/*
+ * Each time a sched group cpu_power is changed,
+ * we must compute its reciprocal value
+ */
+static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
+{
+	sg->__cpu_power += val;
+	sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
+}
+#endif
+
 /*
  * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
  * to time slice values: [800ms ... 100ms ... 5ms]
@@ -1256,7 +1278,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 		}
 
 		/* Adjust by relative CPU power of the group */
-		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+		avg_load = sg_div_cpu_power(group,
+				avg_load * SCHED_LOAD_SCALE);
 
 		if (local_group) {
 			this_load = avg_load;
@@ -2367,12 +2390,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		}
 
 		total_load += avg_load;
-		total_pwr += group->cpu_power;
+		total_pwr += group->__cpu_power;
 
 		/* Adjust by relative CPU power of the group */
-		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+		avg_load = sg_div_cpu_power(group,
+				avg_load * SCHED_LOAD_SCALE);
 
-		group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
+		group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
 
 		if (local_group) {
 			this_load = avg_load;
@@ -2483,8 +2507,8 @@ group_next:
 	max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
 
 	/* How much load to actually move to equalise the imbalance */
-	*imbalance = min(max_pull * busiest->cpu_power,
-			(avg_load - this_load) * this->cpu_power)
+	*imbalance = min(max_pull * busiest->__cpu_power,
+			(avg_load - this_load) * this->__cpu_power)
 			/ SCHED_LOAD_SCALE;
 
 	/*
@@ -2518,28 +2542,29 @@ small_imbalance:
 	 * moving them.
 	 */
 
-	pwr_now += busiest->cpu_power *
+	pwr_now += busiest->__cpu_power *
 			min(busiest_load_per_task, max_load);
-	pwr_now += this->cpu_power *
+	pwr_now += this->__cpu_power *
 			min(this_load_per_task, this_load);
 	pwr_now /= SCHED_LOAD_SCALE;
 
 	/* Amount of load we'd subtract */
-	tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
-		busiest->cpu_power;
+	tmp = sg_div_cpu_power(busiest,
+			busiest_load_per_task * SCHED_LOAD_SCALE);
 	if (max_load > tmp)
-		pwr_move += busiest->cpu_power *
+		pwr_move += busiest->__cpu_power *
 			min(busiest_load_per_task, max_load - tmp);
 
 	/* Amount of load we'd add */
-	if (max_load * busiest->cpu_power <
+	if (max_load * busiest->__cpu_power <
 			busiest_load_per_task * SCHED_LOAD_SCALE)
-		tmp = max_load * busiest->cpu_power / this->cpu_power;
+		tmp = sg_div_cpu_power(this,
+				max_load * busiest->__cpu_power);
 	else
-		tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
-			this->cpu_power;
-	pwr_move += this->cpu_power *
+		tmp = sg_div_cpu_power(this,
+			busiest_load_per_task * SCHED_LOAD_SCALE);
+	pwr_move += this->__cpu_power *
 			min(this_load_per_task, this_load + tmp);
 	pwr_move /= SCHED_LOAD_SCALE;
 
 	/* Move if we gain throughput */
@@ -5501,7 +5526,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 			break;
 		}
 
-		if (!group->cpu_power) {
+		if (!group->__cpu_power) {
 			printk("\n");
 			printk(KERN_ERR "ERROR: domain->cpu_power not "
 					"set\n");
@@ -5678,7 +5703,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
 			continue;
 
 		sg->cpumask = CPU_MASK_NONE;
-		sg->cpu_power = 0;
+		sg->__cpu_power = 0;
 
 		for_each_cpu_mask(j, span) {
 			if (group_fn(j, cpu_map, NULL) != group)
@@ -6367,7 +6392,7 @@ next_sg:
 			continue;
 		}
 
-		sg->cpu_power += sd->groups->cpu_power;
+		sg_inc_cpu_power(sg, sd->groups->__cpu_power);
 	}
 	sg = sg->next;
 	if (sg != group_head)
@@ -6442,6 +6467,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 
 	child = sd->child;
 
+	sd->groups->__cpu_power = 0;
+
 	/*
 	 * For perf policy, if the groups in child domain share resources
 	 * (for example cores sharing some portions of the cache hierarchy
@@ -6452,18 +6479,16 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 	if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
 		       (child->flags &
 			(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
-		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+		sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
 		return;
 	}
 
-	sd->groups->cpu_power = 0;
-
 	/*
 	 * add cpu_power of each child group to this groups cpu_power
 	 */
 	group = child->groups;
 	do {
-		sd->groups->cpu_power += group->cpu_power;
+		sg_inc_cpu_power(sd->groups, group->__cpu_power);
 		group = group->next;
 	} while (group != child->groups);
 }
@@ -6623,7 +6648,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 			sd = &per_cpu(node_domains, j);
 			sd->groups = sg;
 		}
-		sg->cpu_power = 0;
+		sg->__cpu_power = 0;
 		sg->cpumask = nodemask;
 		sg->next = sg;
 		cpus_or(covered, covered, nodemask);
@@ -6651,7 +6676,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6651 "Can not alloc domain group for node %d\n", j); 6676 "Can not alloc domain group for node %d\n", j);
6652 goto error; 6677 goto error;
6653 } 6678 }
6654 sg->cpu_power = 0; 6679 sg->__cpu_power = 0;
6655 sg->cpumask = tmp; 6680 sg->cpumask = tmp;
6656 sg->next = prev->next; 6681 sg->next = prev->next;
6657 cpus_or(covered, covered, tmp); 6682 cpus_or(covered, covered, tmp);