 include/linux/sched.h    |   4 +-
 include/linux/topology.h |   1 +
 kernel/sched_fair.c      | 139 ++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 126 insertions(+), 18 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c731296e5e93..ff154e10752b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -801,7 +801,7 @@ enum cpu_idle_type {
 #define SD_POWERSAVINGS_BALANCE	0x0100	/* Balance for power savings */
 #define SD_SHARE_PKG_RESOURCES	0x0200	/* Domain members share cpu pkg resources */
 #define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
-
+#define SD_ASYM_PACKING		0x0800	/* Place busy groups earlier in the domain */
 #define SD_PREFER_SIBLING	0x1000	/* Prefer to place tasks in a sibling domain */
 
 enum powersavings_balance_level {
@@ -836,6 +836,8 @@ static inline int sd_balance_for_package_power(void)
 	return SD_PREFER_SIBLING;
 }
 
+extern int __weak arch_sd_sibiling_asym_packing(void);
+
 /*
  * Optimise SD flags for power savings:
  * SD_BALANCE_NEWIDLE helps agressive task consolidation and power savings.
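The new flag only does something when an architecture overrides the __weak arch_sd_sibiling_asym_packing() hook declared above. A minimal sketch of such an override, modelled on the POWER7 case this patch targets; the CPU_FTR_ASYM_SMT feature test is an assumption about the powerpc side, not part of this patch:

int arch_sd_sibiling_asym_packing(void)
{
	/* Opt in only on cores that run faster with work packed onto
	 * low-numbered SMT threads, e.g. POWER7 (assumed feature bit). */
	if (cpu_has_feature(CPU_FTR_ASYM_SMT))
		return SD_ASYM_PACKING;

	return 0;
}

Returning the flag value itself (rather than a boolean) lets generic code OR the result straight into the sched_domain flags initialiser, as the topology.h hunk below does.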
diff --git a/include/linux/topology.h b/include/linux/topology.h
index c44df50a05ab..cf57f30d0dcb 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -103,6 +103,7 @@ int arch_update_cpu_topology(void);
 				| 1*SD_SHARE_PKG_RESOURCES	\
 				| 0*SD_SERIALIZE		\
 				| 0*SD_PREFER_SIBLING		\
+				| arch_sd_sibiling_asym_packing() \
 				,				\
 	.last_balance		= jiffies,			\
 	.balance_interval	= 1,				\
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index b9b3462483b7..593424f91a8a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2458,11 +2458,53 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 }
 
 /**
+ * update_sd_pick_busiest - return true if @sg is the busiest group
+ * @sd: sched_domain whose statistics are to be checked
+ * @sds: sched_domain statistics
+ * @sg: sched_group candidate to be checked for being the busiest
+ * @sgs: sched_group statistics
+ *
+ * Determine if @sg is a busier group than the previously selected
+ * busiest group.
+ */
+static bool update_sd_pick_busiest(struct sched_domain *sd,
+				   struct sd_lb_stats *sds,
+				   struct sched_group *sg,
+				   struct sg_lb_stats *sgs,
+				   int this_cpu)
+{
+	if (sgs->avg_load <= sds->max_load)
+		return false;
+
+	if (sgs->sum_nr_running > sgs->group_capacity)
+		return true;
+
+	if (sgs->group_imb)
+		return true;
+
+	/*
+	 * ASYM_PACKING needs to move all the work to the lowest
+	 * numbered CPUs in the group, therefore mark all groups
+	 * higher than ourself as busy.
+	 */
+	if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
+	    this_cpu < group_first_cpu(sg)) {
+		if (!sds->busiest)
+			return true;
+
+		if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
+			return true;
+	}
+
+	return false;
+}
+
+/**
  * update_sd_lb_stats - Update sched_group's statistics for load balancing.
  * @sd: sched_domain whose statistics are to be updated.
  * @this_cpu: Cpu for which load balance is currently performed.
  * @idle: Idle status of this_cpu
- * @sd_idle: Idle status of the sched_domain containing group.
+ * @sd_idle: Idle status of the sched_domain containing sg.
  * @cpus: Set of cpus considered for load balancing.
  * @balance: Should we balance.
  * @sds: variable to hold the statistics for this sched_domain.
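The ASYM_PACKING test above leans on group_first_cpu(), which at this point in the tree reduces to the first bit set in the group's cpumask (roughly, as defined in kernel/sched.c of this era):

static inline unsigned int group_first_cpu(struct sched_group *group)
{
	/* Lowest-numbered CPU in the group; a "lower" group is one
	 * whose value here is smaller. */
	return cpumask_first(sched_group_cpus(group));
}

So for packing purposes a group becomes the busiest candidate only when it has running tasks, its first CPU is numbered above this_cpu, and that first CPU is lower than the current busiest candidate's, which resolves ties toward packing the lowest-numbered CPUs.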
@@ -2473,7 +2515,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 			   struct sd_lb_stats *sds)
 {
 	struct sched_domain *child = sd->child;
-	struct sched_group *group = sd->groups;
+	struct sched_group *sg = sd->groups;
 	struct sg_lb_stats sgs;
 	int load_idx, prefer_sibling = 0;
 
@@ -2486,21 +2528,20 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 	do {
 		int local_group;
 
-		local_group = cpumask_test_cpu(this_cpu,
-					       sched_group_cpus(group));
+		local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
 		memset(&sgs, 0, sizeof(sgs));
-		update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
+		update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
 				   local_group, cpus, balance, &sgs);
 
 		if (local_group && !(*balance))
 			return;
 
 		sds->total_load += sgs.group_load;
-		sds->total_pwr += group->cpu_power;
+		sds->total_pwr += sg->cpu_power;
 
 		/*
 		 * In case the child domain prefers tasks go to siblings
-		 * first, lower the group capacity to one so that we'll try
+		 * first, lower the sg capacity to one so that we'll try
 		 * and move all the excess tasks away.
 		 */
 		if (prefer_sibling)
@@ -2508,23 +2549,72 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 
 		if (local_group) {
 			sds->this_load = sgs.avg_load;
-			sds->this = group;
+			sds->this = sg;
 			sds->this_nr_running = sgs.sum_nr_running;
 			sds->this_load_per_task = sgs.sum_weighted_load;
-		} else if (sgs.avg_load > sds->max_load &&
-			   (sgs.sum_nr_running > sgs.group_capacity ||
-			    sgs.group_imb)) {
+		} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
 			sds->max_load = sgs.avg_load;
-			sds->busiest = group;
+			sds->busiest = sg;
 			sds->busiest_nr_running = sgs.sum_nr_running;
 			sds->busiest_group_capacity = sgs.group_capacity;
 			sds->busiest_load_per_task = sgs.sum_weighted_load;
 			sds->group_imb = sgs.group_imb;
 		}
 
-		update_sd_power_savings_stats(group, sds, local_group, &sgs);
-		group = group->next;
-	} while (group != sd->groups);
+		update_sd_power_savings_stats(sg, sds, local_group, &sgs);
+		sg = sg->next;
+	} while (sg != sd->groups);
+}
+
+int __weak arch_sd_sibiling_asym_packing(void)
+{
+	return 0*SD_ASYM_PACKING;
+}
+
+/**
+ * check_asym_packing - Check to see if the group is packed into the
+ *			sched domain.
+ *
+ * This is primarily intended to be used at the sibling level.  Some
+ * cores like POWER7 prefer to use lower numbered SMT threads.  In the
+ * case of POWER7, it can move to lower SMT modes only when higher
+ * threads are idle.  When in lower SMT modes, the threads will
+ * perform better since they share less core resources.  Hence when we
+ * have idle threads, we want them to be the higher ones.
+ *
+ * This packing function is run on idle threads.  It checks to see if
+ * the busiest CPU in this domain (core in the P7 case) has a higher
+ * CPU number than the packing function is being run on.  Here we are
+ * assuming a lower CPU number will be equivalent to a lower SMT thread
+ * number.
+ *
+ * @sd: The sched_domain whose packing is to be checked.
+ * @sds: Statistics of the sched_domain which is to be packed
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: returns amount of imbalance due to packing.
+ *
+ * Returns 1 when packing is required and a task should be moved to
+ * this CPU.  The amount of the imbalance is returned in *imbalance.
+ */
+static int check_asym_packing(struct sched_domain *sd,
+			      struct sd_lb_stats *sds,
+			      int this_cpu, unsigned long *imbalance)
+{
+	int busiest_cpu;
+
+	if (!(sd->flags & SD_ASYM_PACKING))
+		return 0;
+
+	if (!sds->busiest)
+		return 0;
+
+	busiest_cpu = group_first_cpu(sds->busiest);
+	if (this_cpu > busiest_cpu)
+		return 0;
+
+	*imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
+				       SCHED_LOAD_SCALE);
+	return 1;
 }
 
 /**
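The imbalance computed above undoes the cpu_power normalisation of avg_load, converting it back into absolute weighted load. A self-contained sketch of the arithmetic, assuming SCHED_LOAD_SCALE == 1024 as in kernels of this era and a simplified DIV_ROUND_CLOSEST; the sample values are illustrative:

#include <stdio.h>

#define SCHED_LOAD_SCALE	1024UL
/* Simplified form of the kernel macro, valid for unsigned operands. */
#define DIV_ROUND_CLOSEST(x, d)	(((x) + ((d) / 2)) / (d))

int main(void)
{
	unsigned long max_load  = 1024;	/* busiest group: one nice-0 task */
	unsigned long cpu_power = 1024;	/* unscaled single-CPU group */

	unsigned long imbalance =
		DIV_ROUND_CLOSEST(max_load * cpu_power, SCHED_LOAD_SCALE);

	printf("imbalance = %lu\n", imbalance);	/* prints 1024 */
	return 0;
}

With one default-weight task on an unscaled CPU the result is 1024, i.e. exactly one task's worth of load for find_busiest_group() to pull toward the lower-numbered CPU.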
@@ -2719,6 +2809,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (!(*balance))
 		goto ret;
 
+	if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
+	    check_asym_packing(sd, &sds, this_cpu, imbalance))
+		return sds.busiest;
+
 	if (!sds.busiest || sds.busiest_nr_running == 0)
 		goto out_balanced;
 
@@ -2808,9 +2902,19 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
 /* Working cpumask for load_balance and load_balance_newidle. */
 static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
 
-static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
+static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
+			       int busiest_cpu, int this_cpu)
 {
 	if (idle == CPU_NEWLY_IDLE) {
+
+		/*
+		 * ASYM_PACKING needs to force migrate tasks from busy but
+		 * higher numbered CPUs in order to pack all tasks in the
+		 * lowest numbered CPUs.
+		 */
+		if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
+			return 1;
+
 		/*
 		 * The only task running in a non-idle cpu can be moved to this
 		 * cpu in an attempt to completely freeup the other CPU
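To see what the new escape hatch does, here is a standalone model of just the ASYM_PACKING branch added above; the flag and idle values are stand-ins, not the kernel's definitions:

#include <stdio.h>

#define SD_ASYM_PACKING	0x0800
#define CPU_NEWLY_IDLE	2	/* stand-in for the kernel enum value */

static int asym_needs_active_balance(int sd_flags, int idle,
				     int busiest_cpu, int this_cpu)
{
	/* Force migration only when a newly idle CPU sits below the
	 * busiest one, packing work toward low-numbered CPUs. */
	return idle == CPU_NEWLY_IDLE &&
	       (sd_flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu;
}

int main(void)
{
	printf("%d\n", asym_needs_active_balance(SD_ASYM_PACKING,
						 CPU_NEWLY_IDLE, 2, 0)); /* 1 */
	printf("%d\n", asym_needs_active_balance(SD_ASYM_PACKING,
						 CPU_NEWLY_IDLE, 0, 2)); /* 0 */
	return 0;
}

The asymmetry matters because a currently running task cannot be moved by the normal pull path; returning 1 here kicks the active-balance machinery, which can push the running task off the higher-numbered CPU.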
@@ -2929,7 +3033,8 @@ redo:
 		schedstat_inc(sd, lb_failed[idle]);
 		sd->nr_balance_failed++;
 
-		if (need_active_balance(sd, sd_idle, idle)) {
+		if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
+					this_cpu)) {
 			raw_spin_lock_irqsave(&busiest->lock, flags);
 
 			/* don't kick the active_load_balance_cpu_stop,