aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
author: Michael Neuling <mikey@neuling.org> 2010-06-08 00:57:02 -0400
committer: Ingo Molnar <mingo@elte.hu> 2010-06-09 04:34:55 -0400
commit: 532cb4c401e225b084c14d6bd6a2f8ee561de2f1 (patch)
tree: 0ce57c2e21cd12ee05561ab2b9c4b66729da8e5a /kernel
parent: 9d5efe05eb0c904545a28b19c18b949f23334de0 (diff)
sched: Add asymmetric group packing option for sibling domain
Check to see if the group is packed in a sched domain. This is primarily intended to be used at the sibling level. Some cores like POWER7 prefer to use lower numbered SMT threads. In the case of POWER7, it can move to lower SMT modes only when higher threads are idle. When in lower SMT modes, the threads will perform better since they share less core resources. Hence when we have idle threads, we want them to be the higher ones. This adds a hook into f_b_g() (find_busiest_group()) called check_asym_packing() to check the packing. This packing function is run on idle threads. It checks to see if the busiest CPU in this domain (core in the P7 case) has a higher CPU number than the CPU the packing function is being run on. If it is, calculate the imbalance and return the higher busier thread as the busiest group to f_b_g(). Here we are assuming a lower CPU number will be equivalent to a lower SMT thread number. It also creates a new SD_ASYM_PACKING flag to enable this feature at any scheduler domain level. It also creates an arch hook to enable this feature at the sibling level. The default function doesn't enable this feature. Based heavily on a patch from Peter Zijlstra. Fixes from Srivatsa Vaddagiri. Signed-off-by: Michael Neuling <mikey@neuling.org> Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Arjan van de Ven <arjan@linux.intel.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Thomas Gleixner <tglx@linutronix.de> LKML-Reference: <20100608045702.2936CCC897@localhost.localdomain> Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/sched_fair.c 139
1 files changed, 122 insertions, 17 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index b9b3462483b7..593424f91a8a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2458,11 +2458,53 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2458} 2458}
2459 2459
2460/** 2460/**
2461 * update_sd_pick_busiest - return 1 on busiest group
2462 * @sd: sched_domain whose statistics are to be checked
2463 * @sds: sched_domain statistics
2464 * @sg: sched_group candidate to be checked for being the busiest
2465 * @sgs: sched_group statistics
2466 *
2467 * Determine if @sg is a busier group than the previously selected
2468 * busiest group.
2469 */
2470static bool update_sd_pick_busiest(struct sched_domain *sd,
2471 struct sd_lb_stats *sds,
2472 struct sched_group *sg,
2473 struct sg_lb_stats *sgs,
2474 int this_cpu)
2475{
2476 if (sgs->avg_load <= sds->max_load)
2477 return false;
2478
2479 if (sgs->sum_nr_running > sgs->group_capacity)
2480 return true;
2481
2482 if (sgs->group_imb)
2483 return true;
2484
2485 /*
2486 * ASYM_PACKING needs to move all the work to the lowest
2487 * numbered CPUs in the group, therefore mark all groups
2488 * higher than ourself as busy.
2489 */
2490 if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
2491 this_cpu < group_first_cpu(sg)) {
2492 if (!sds->busiest)
2493 return true;
2494
2495 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
2496 return true;
2497 }
2498
2499 return false;
2500}
2501
2502/**
2461 * update_sd_lb_stats - Update sched_group's statistics for load balancing. 2503 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
2462 * @sd: sched_domain whose statistics are to be updated. 2504 * @sd: sched_domain whose statistics are to be updated.
2463 * @this_cpu: Cpu for which load balance is currently performed. 2505 * @this_cpu: Cpu for which load balance is currently performed.
2464 * @idle: Idle status of this_cpu 2506 * @idle: Idle status of this_cpu
2465 * @sd_idle: Idle status of the sched_domain containing group. 2507 * @sd_idle: Idle status of the sched_domain containing sg.
2466 * @cpus: Set of cpus considered for load balancing. 2508 * @cpus: Set of cpus considered for load balancing.
2467 * @balance: Should we balance. 2509 * @balance: Should we balance.
2468 * @sds: variable to hold the statistics for this sched_domain. 2510 * @sds: variable to hold the statistics for this sched_domain.
@@ -2473,7 +2515,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2473 struct sd_lb_stats *sds) 2515 struct sd_lb_stats *sds)
2474{ 2516{
2475 struct sched_domain *child = sd->child; 2517 struct sched_domain *child = sd->child;
2476 struct sched_group *group = sd->groups; 2518 struct sched_group *sg = sd->groups;
2477 struct sg_lb_stats sgs; 2519 struct sg_lb_stats sgs;
2478 int load_idx, prefer_sibling = 0; 2520 int load_idx, prefer_sibling = 0;
2479 2521
@@ -2486,21 +2528,20 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2486 do { 2528 do {
2487 int local_group; 2529 int local_group;
2488 2530
2489 local_group = cpumask_test_cpu(this_cpu, 2531 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
2490 sched_group_cpus(group));
2491 memset(&sgs, 0, sizeof(sgs)); 2532 memset(&sgs, 0, sizeof(sgs));
2492 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, 2533 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
2493 local_group, cpus, balance, &sgs); 2534 local_group, cpus, balance, &sgs);
2494 2535
2495 if (local_group && !(*balance)) 2536 if (local_group && !(*balance))
2496 return; 2537 return;
2497 2538
2498 sds->total_load += sgs.group_load; 2539 sds->total_load += sgs.group_load;
2499 sds->total_pwr += group->cpu_power; 2540 sds->total_pwr += sg->cpu_power;
2500 2541
2501 /* 2542 /*
2502 * In case the child domain prefers tasks go to siblings 2543 * In case the child domain prefers tasks go to siblings
2503 * first, lower the group capacity to one so that we'll try 2544 * first, lower the sg capacity to one so that we'll try
2504 * and move all the excess tasks away. 2545 * and move all the excess tasks away.
2505 */ 2546 */
2506 if (prefer_sibling) 2547 if (prefer_sibling)
@@ -2508,23 +2549,72 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2508 2549
2509 if (local_group) { 2550 if (local_group) {
2510 sds->this_load = sgs.avg_load; 2551 sds->this_load = sgs.avg_load;
2511 sds->this = group; 2552 sds->this = sg;
2512 sds->this_nr_running = sgs.sum_nr_running; 2553 sds->this_nr_running = sgs.sum_nr_running;
2513 sds->this_load_per_task = sgs.sum_weighted_load; 2554 sds->this_load_per_task = sgs.sum_weighted_load;
2514 } else if (sgs.avg_load > sds->max_load && 2555 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2515 (sgs.sum_nr_running > sgs.group_capacity ||
2516 sgs.group_imb)) {
2517 sds->max_load = sgs.avg_load; 2556 sds->max_load = sgs.avg_load;
2518 sds->busiest = group; 2557 sds->busiest = sg;
2519 sds->busiest_nr_running = sgs.sum_nr_running; 2558 sds->busiest_nr_running = sgs.sum_nr_running;
2520 sds->busiest_group_capacity = sgs.group_capacity; 2559 sds->busiest_group_capacity = sgs.group_capacity;
2521 sds->busiest_load_per_task = sgs.sum_weighted_load; 2560 sds->busiest_load_per_task = sgs.sum_weighted_load;
2522 sds->group_imb = sgs.group_imb; 2561 sds->group_imb = sgs.group_imb;
2523 } 2562 }
2524 2563
2525 update_sd_power_savings_stats(group, sds, local_group, &sgs); 2564 update_sd_power_savings_stats(sg, sds, local_group, &sgs);
2526 group = group->next; 2565 sg = sg->next;
2527 } while (group != sd->groups); 2566 } while (sg != sd->groups);
2567}
2568
2569int __weak arch_sd_sibiling_asym_packing(void)
2570{
2571 return 0*SD_ASYM_PACKING;
2572}
2573
2574/**
2575 * check_asym_packing - Check to see if the group is packed into the
2576 * sched domain.
2577 *
2578 * This is primarily intended to be used at the sibling level. Some
2579 * cores like POWER7 prefer to use lower numbered SMT threads. In the
2580 * case of POWER7, it can move to lower SMT modes only when higher
2581 * threads are idle. When in lower SMT modes, the threads will
2582 * perform better since they share less core resources. Hence when we
2583 * have idle threads, we want them to be the higher ones.
2584 *
2585 * This packing function is run on idle threads. It checks to see if
2586 * the busiest CPU in this domain (core in the P7 case) has a higher
2587 * CPU number than the CPU the packing function is being run on. Here
2588 * we are assuming a lower CPU number will be equivalent to a lower
2589 * SMT thread number.
2590 *
2591 * @sd: The sched_domain whose packing is to be checked.
2592 * @sds: Statistics of the sched_domain which is to be packed
2593 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
2594 * @imbalance: returns the amount of imbalance due to packing.
2595 *
2596 * Returns 1 when packing is required and a task should be moved to
2597 * this CPU. The amount of the imbalance is returned in *imbalance.
2598 */
2599static int check_asym_packing(struct sched_domain *sd,
2600 struct sd_lb_stats *sds,
2601 int this_cpu, unsigned long *imbalance)
2602{
2603 int busiest_cpu;
2604
2605 if (!(sd->flags & SD_ASYM_PACKING))
2606 return 0;
2607
2608 if (!sds->busiest)
2609 return 0;
2610
2611 busiest_cpu = group_first_cpu(sds->busiest);
2612 if (this_cpu > busiest_cpu)
2613 return 0;
2614
2615 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
2616 SCHED_LOAD_SCALE);
2617 return 1;
2528} 2618}
2529 2619
2530/** 2620/**
@@ -2719,6 +2809,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2719 if (!(*balance)) 2809 if (!(*balance))
2720 goto ret; 2810 goto ret;
2721 2811
2812 if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
2813 check_asym_packing(sd, &sds, this_cpu, imbalance))
2814 return sds.busiest;
2815
2722 if (!sds.busiest || sds.busiest_nr_running == 0) 2816 if (!sds.busiest || sds.busiest_nr_running == 0)
2723 goto out_balanced; 2817 goto out_balanced;
2724 2818
@@ -2808,9 +2902,19 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
2808/* Working cpumask for load_balance and load_balance_newidle. */ 2902/* Working cpumask for load_balance and load_balance_newidle. */
2809static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 2903static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2810 2904
2811static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) 2905static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
2906 int busiest_cpu, int this_cpu)
2812{ 2907{
2813 if (idle == CPU_NEWLY_IDLE) { 2908 if (idle == CPU_NEWLY_IDLE) {
2909
2910 /*
2911 * ASYM_PACKING needs to force migrate tasks from busy but
2912 * higher numbered CPUs in order to pack all tasks in the
2913 * lowest numbered CPUs.
2914 */
2915 if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
2916 return 1;
2917
2814 /* 2918 /*
2815 * The only task running in a non-idle cpu can be moved to this 2919 * The only task running in a non-idle cpu can be moved to this
2816 * cpu in an attempt to completely freeup the other CPU 2920 * cpu in an attempt to completely freeup the other CPU
@@ -2929,7 +3033,8 @@ redo:
2929 schedstat_inc(sd, lb_failed[idle]); 3033 schedstat_inc(sd, lb_failed[idle]);
2930 sd->nr_balance_failed++; 3034 sd->nr_balance_failed++;
2931 3035
2932 if (need_active_balance(sd, sd_idle, idle)) { 3036 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
3037 this_cpu)) {
2933 raw_spin_lock_irqsave(&busiest->lock, flags); 3038 raw_spin_lock_irqsave(&busiest->lock, flags);
2934 3039
2935 /* don't kick the active_load_balance_cpu_stop, 3040 /* don't kick the active_load_balance_cpu_stop,