-rw-r--r--   include/linux/sched.h     |   4
-rw-r--r--   include/linux/topology.h  |   1
-rw-r--r--   kernel/sched_fair.c       | 139
3 files changed, 126 insertions, 18 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c731296e5e93..ff154e10752b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -801,7 +801,7 @@ enum cpu_idle_type {
 #define SD_POWERSAVINGS_BALANCE	0x0100	/* Balance for power savings */
 #define SD_SHARE_PKG_RESOURCES	0x0200	/* Domain members share cpu pkg resources */
 #define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
-
+#define SD_ASYM_PACKING		0x0800  /* Place busy groups earlier in the domain */
 #define SD_PREFER_SIBLING	0x1000	/* Prefer to place tasks in a sibling domain */
 
 enum powersavings_balance_level {
@@ -836,6 +836,8 @@ static inline int sd_balance_for_package_power(void)
 	return SD_PREFER_SIBLING;
 }
 
+extern int __weak arch_sd_sibiling_asym_packing(void);
+
 /*
  * Optimise SD flags for power savings:
  * SD_BALANCE_NEWIDLE helps agressive task consolidation and power savings.
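
(Illustrative, not part of this patch: an architecture that wants sibling-level packing is expected to override the weak arch_sd_sibiling_asym_packing() hook declared above so that the sibling domain picks up SD_ASYM_PACKING. A minimal sketch, where cpu_has_feature() and the CPU_FTR_ASYM_SMT bit are assumed placeholders for whatever test the architecture actually uses:

int arch_sd_sibiling_asym_packing(void)
{
	/* Assumed feature bit: set on CPUs that prefer low-numbered SMT threads */
	if (cpu_has_feature(CPU_FTR_ASYM_SMT))
		return SD_ASYM_PACKING;

	return 0;
}

Returning the flag turns on asymmetric packing for the sibling domain; returning 0 keeps the existing behaviour.)
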
diff --git a/include/linux/topology.h b/include/linux/topology.h
index c44df50a05ab..cf57f30d0dcb 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -103,6 +103,7 @@ int arch_update_cpu_topology(void);
 				| 1*SD_SHARE_PKG_RESOURCES	\
 				| 0*SD_SERIALIZE		\
 				| 0*SD_PREFER_SIBLING		\
+				| arch_sd_sibiling_asym_packing() \
 				,				\
 	.last_balance		= jiffies,			\
 	.balance_interval	= 1,				\
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index b9b3462483b7..593424f91a8a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2458,11 +2458,53 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 }
 
 /**
+ * update_sd_pick_busiest - return true on busiest group
+ * @sd: sched_domain whose statistics are to be checked
+ * @sds: sched_domain statistics
+ * @sg: sched_group candidate to be checked for being the busiest
+ * @sgs: sched_group statistics
+ *
+ * Determine if @sg is a busier group than the previously selected
+ * busiest group.
+ */
+static bool update_sd_pick_busiest(struct sched_domain *sd,
+				   struct sd_lb_stats *sds,
+				   struct sched_group *sg,
+				   struct sg_lb_stats *sgs,
+				   int this_cpu)
+{
+	if (sgs->avg_load <= sds->max_load)
+		return false;
+
+	if (sgs->sum_nr_running > sgs->group_capacity)
+		return true;
+
+	if (sgs->group_imb)
+		return true;
+
+	/*
+	 * ASYM_PACKING needs to move all the work to the lowest
+	 * numbered CPUs in the group, therefore mark all groups
+	 * higher than ourself as busy.
+	 */
+	if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
+	    this_cpu < group_first_cpu(sg)) {
+		if (!sds->busiest)
+			return true;
+
+		if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
+			return true;
+	}
+
+	return false;
+}
+
+/**
  * update_sd_lb_stats - Update sched_group's statistics for load balancing.
  * @sd: sched_domain whose statistics are to be updated.
  * @this_cpu: Cpu for which load balance is currently performed.
  * @idle: Idle status of this_cpu
- * @sd_idle: Idle status of the sched_domain containing group.
+ * @sd_idle: Idle status of the sched_domain containing sg.
  * @cpus: Set of cpus considered for load balancing.
  * @balance: Should we balance.
  * @sds: variable to hold the statistics for this sched_domain.
@@ -2473,7 +2515,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 				   struct sd_lb_stats *sds)
 {
 	struct sched_domain *child = sd->child;
-	struct sched_group *group = sd->groups;
+	struct sched_group *sg = sd->groups;
 	struct sg_lb_stats sgs;
 	int load_idx, prefer_sibling = 0;
 
@@ -2486,21 +2528,20 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 	do {
 		int local_group;
 
-		local_group = cpumask_test_cpu(this_cpu,
-					       sched_group_cpus(group));
+		local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
 		memset(&sgs, 0, sizeof(sgs));
-		update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
+		update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
 				local_group, cpus, balance, &sgs);
 
 		if (local_group && !(*balance))
 			return;
 
 		sds->total_load += sgs.group_load;
-		sds->total_pwr += group->cpu_power;
+		sds->total_pwr += sg->cpu_power;
 
 		/*
 		 * In case the child domain prefers tasks go to siblings
-		 * first, lower the group capacity to one so that we'll try
+		 * first, lower the sg capacity to one so that we'll try
 		 * and move all the excess tasks away.
 		 */
 		if (prefer_sibling)
@@ -2508,23 +2549,72 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 
 		if (local_group) {
 			sds->this_load = sgs.avg_load;
-			sds->this = group;
+			sds->this = sg;
 			sds->this_nr_running = sgs.sum_nr_running;
 			sds->this_load_per_task = sgs.sum_weighted_load;
-		} else if (sgs.avg_load > sds->max_load &&
-			   (sgs.sum_nr_running > sgs.group_capacity ||
-			    sgs.group_imb)) {
+		} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
 			sds->max_load = sgs.avg_load;
-			sds->busiest = group;
+			sds->busiest = sg;
 			sds->busiest_nr_running = sgs.sum_nr_running;
 			sds->busiest_group_capacity = sgs.group_capacity;
 			sds->busiest_load_per_task = sgs.sum_weighted_load;
 			sds->group_imb = sgs.group_imb;
 		}
 
-		update_sd_power_savings_stats(group, sds, local_group, &sgs);
-		group = group->next;
-	} while (group != sd->groups);
+		update_sd_power_savings_stats(sg, sds, local_group, &sgs);
+		sg = sg->next;
+	} while (sg != sd->groups);
+}
+
+int __weak arch_sd_sibiling_asym_packing(void)
+{
+	return 0*SD_ASYM_PACKING;
+}
+
+/**
+ * check_asym_packing - Check to see if the group is packed into the
+ *			sched domain.
+ *
+ * This is primarily intended to be used at the sibling level.  Some
+ * cores like POWER7 prefer to use lower numbered SMT threads.  In the
+ * case of POWER7, it can move to lower SMT modes only when higher
+ * threads are idle.  When in lower SMT modes, the threads will
+ * perform better since they share less core resources.  Hence when we
+ * have idle threads, we want them to be the higher ones.
+ *
+ * This packing function is run on idle threads.  It checks to see if
+ * the busiest CPU in this domain (core in the P7 case) has a higher
+ * CPU number than the packing function is being run on.  Here we are
+ * assuming a lower CPU number will be equivalent to a lower SMT thread
+ * number.
+ *
+ * @sd: The sched_domain whose packing is to be checked.
+ * @sds: Statistics of the sched_domain which is to be packed
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: returns amount of imbalance due to packing.
+ *
+ * Returns 1 when packing is required and a task should be moved to
+ * this CPU.  The amount of the imbalance is returned in *imbalance.
+ */
+static int check_asym_packing(struct sched_domain *sd,
+			      struct sd_lb_stats *sds,
+			      int this_cpu, unsigned long *imbalance)
+{
+	int busiest_cpu;
+
+	if (!(sd->flags & SD_ASYM_PACKING))
+		return 0;
+
+	if (!sds->busiest)
+		return 0;
+
+	busiest_cpu = group_first_cpu(sds->busiest);
+	if (this_cpu > busiest_cpu)
+		return 0;
+
+	*imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
+				       SCHED_LOAD_SCALE);
+	return 1;
 }
 
 /**
@@ -2719,6 +2809,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (!(*balance))
 		goto ret;
 
+	if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
+	    check_asym_packing(sd, &sds, this_cpu, imbalance))
+		return sds.busiest;
+
 	if (!sds.busiest || sds.busiest_nr_running == 0)
 		goto out_balanced;
 
@@ -2808,9 +2902,19 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
 /* Working cpumask for load_balance and load_balance_newidle. */
 static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
 
-static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
+static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
+			       int busiest_cpu, int this_cpu)
 {
 	if (idle == CPU_NEWLY_IDLE) {
+
+		/*
+		 * ASYM_PACKING needs to force migrate tasks from busy but
+		 * higher numbered CPUs in order to pack all tasks in the
+		 * lowest numbered CPUs.
+		 */
+		if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
+			return 1;
+
 		/*
 		 * The only task running in a non-idle cpu can be moved to this
 		 * cpu in an attempt to completely freeup the other CPU
@@ -2929,7 +3033,8 @@ redo:
 		schedstat_inc(sd, lb_failed[idle]);
 		sd->nr_balance_failed++;
 
-		if (need_active_balance(sd, sd_idle, idle)) {
+		if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
+					this_cpu)) {
 			raw_spin_lock_irqsave(&busiest->lock, flags);
 
 			/* don't kick the active_load_balance_cpu_stop,
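
(Illustrative note, not part of this patch: the imbalance reported by check_asym_packing() is the busiest group's per-cpu_power average load scaled back to plain load units. A minimal worked example, assuming SCHED_LOAD_SCALE is its usual 1024 and using made-up numbers sds.max_load = 1536 and sds.busiest->cpu_power = 589:

	/* example values only */
	*imbalance = DIV_ROUND_CLOSEST(1536 * 589, 1024);	/* = 884 */

which is roughly the busiest group's entire weighted load, so find_busiest_group() asks the lower-numbered idle CPU to pull all of it, packing the work onto the lowest-numbered SMT threads.)
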