Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r-- | kernel/sched_fair.c | 532
1 file changed, 371 insertions, 161 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a878b5332daa..806d1b227a21 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2287,13 +2287,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) | |||
2287 | unsigned long power = SCHED_LOAD_SCALE; | 2287 | unsigned long power = SCHED_LOAD_SCALE; |
2288 | struct sched_group *sdg = sd->groups; | 2288 | struct sched_group *sdg = sd->groups; |
2289 | 2289 | ||
2290 | if (sched_feat(ARCH_POWER)) | ||
2291 | power *= arch_scale_freq_power(sd, cpu); | ||
2292 | else | ||
2293 | power *= default_scale_freq_power(sd, cpu); | ||
2294 | |||
2295 | power >>= SCHED_LOAD_SHIFT; | ||
2296 | |||
2297 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | 2290 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { |
2298 | if (sched_feat(ARCH_POWER)) | 2291 | if (sched_feat(ARCH_POWER)) |
2299 | power *= arch_scale_smt_power(sd, cpu); | 2292 | power *= arch_scale_smt_power(sd, cpu); |
@@ -2303,6 +2296,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) | |||
2303 | power >>= SCHED_LOAD_SHIFT; | 2296 | power >>= SCHED_LOAD_SHIFT; |
2304 | } | 2297 | } |
2305 | 2298 | ||
2299 | sdg->cpu_power_orig = power; | ||
2300 | |||
2301 | if (sched_feat(ARCH_POWER)) | ||
2302 | power *= arch_scale_freq_power(sd, cpu); | ||
2303 | else | ||
2304 | power *= default_scale_freq_power(sd, cpu); | ||
2305 | |||
2306 | power >>= SCHED_LOAD_SHIFT; | ||
2307 | |||
2306 | power *= scale_rt_power(cpu); | 2308 | power *= scale_rt_power(cpu); |
2307 | power >>= SCHED_LOAD_SHIFT; | 2309 | power >>= SCHED_LOAD_SHIFT; |
2308 | 2310 | ||
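The two hunks above reorder the scaling pipeline in update_cpu_power(): SMT scaling (only at SD_SHARE_CPUPOWER levels) is applied first, the intermediate result is recorded in the new cpu_power_orig field, and only then are the frequency and RT scalings applied. A minimal sketch of the resulting order of operations, with made-up scale factors and SCHED_LOAD_SCALE assumed to be 1024 (this helper is illustrative and not part of the patch):

/*
 * Illustrative only: restates the order of operations in the patched
 * update_cpu_power().  The scale factors are hypothetical inputs,
 * expressed against SCHED_LOAD_SCALE (1024 here).
 */
static unsigned long sketch_cpu_power(unsigned long smt_scale,
				      unsigned long freq_scale,
				      unsigned long rt_scale,
				      unsigned long *power_orig)
{
	unsigned long power = 1024;		/* SCHED_LOAD_SCALE */

	power = (power * smt_scale) >> 10;	/* SMT scaling, SD_SHARE_CPUPOWER only */
	*power_orig = power;			/* new: captured before freq/rt scaling */

	power = (power * freq_scale) >> 10;	/* arch/default freq scaling */
	power = (power * rt_scale) >> 10;	/* scale_rt_power() */

	return power ? power : 1;		/* update_cpu_power() never reports 0 */
}

Recording cpu_power_orig at this point is what lets fix_small_capacity() below compare the fully scaled cpu_power against the value a sibling had before frequency and RT pressure were factored in.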
@@ -2335,6 +2337,31 @@ static void update_group_power(struct sched_domain *sd, int cpu) | |||
2335 | sdg->cpu_power = power; | 2337 | sdg->cpu_power = power; |
2336 | } | 2338 | } |
2337 | 2339 | ||
2340 | /* | ||
2341 | * Try and fix up capacity for tiny siblings, this is needed when | ||
2342 | * things like SD_ASYM_PACKING need f_b_g to select another sibling | ||
2343 | * which on its own isn't powerful enough. | ||
2344 | * | ||
2345 | * See update_sd_pick_busiest() and check_asym_packing(). | ||
2346 | */ | ||
2347 | static inline int | ||
2348 | fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | ||
2349 | { | ||
2350 | /* | ||
2351 | * Only siblings can have significantly less than SCHED_LOAD_SCALE | ||
2352 | */ | ||
2353 | if (sd->level != SD_LV_SIBLING) | ||
2354 | return 0; | ||
2355 | |||
2356 | /* | ||
2357 | * If ~90% of the cpu_power is still there, we're good. | ||
2358 | */ | ||
2359 | if (group->cpu_power * 32 > group->cpu_power_orig * 29) | ||
2360 | return 1; | ||
2361 | |||
2362 | return 0; | ||
2363 | } | ||
2364 | |||
2338 | /** | 2365 | /** |
2339 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 2366 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
2340 | * @sd: The sched_domain whose statistics are to be updated. | 2367 | * @sd: The sched_domain whose statistics are to be updated. |
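The 29/32 ratio above works out to roughly 90.6%, so a sibling keeps a capacity of one as long as frequency and RT/irq scaling have taken less than about 10% of the power it had right after SMT scaling, and only at the SD_LV_SIBLING level. A worked example with assumed numbers (SMT4 with the default smt_gain of 1178; the figures are illustrative, not taken from this patch):

	cpu_power_orig = 1178 / 4 ~= 294
	DIV_ROUND_CLOSEST(294, 1024) = 0, so the group's capacity starts out as 0

	mostly idle sibling:    cpu_power ~= 294;  294 * 32 = 9408 > 294 * 29 = 8526  ->  capacity fixed up to 1
	~20% taken by RT/irq:   cpu_power ~= 235;  235 * 32 = 7520 < 8526             ->  capacity stays 0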
@@ -2400,14 +2427,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2400 | * domains. In the newly idle case, we will allow all the cpu's | 2427 | * domains. In the newly idle case, we will allow all the cpu's |
2401 | * to do the newly idle load balance. | 2428 | * to do the newly idle load balance. |
2402 | */ | 2429 | */ |
2403 | if (idle != CPU_NEWLY_IDLE && local_group && | 2430 | if (idle != CPU_NEWLY_IDLE && local_group) { |
2404 | balance_cpu != this_cpu) { | 2431 | if (balance_cpu != this_cpu) { |
2405 | *balance = 0; | 2432 | *balance = 0; |
2406 | return; | 2433 | return; |
2434 | } | ||
2435 | update_group_power(sd, this_cpu); | ||
2407 | } | 2436 | } |
2408 | 2437 | ||
2409 | update_group_power(sd, this_cpu); | ||
2410 | |||
2411 | /* Adjust by relative CPU power of the group */ | 2438 | /* Adjust by relative CPU power of the group */ |
2412 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; | 2439 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; |
2413 | 2440 | ||
@@ -2428,6 +2455,51 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2428 | 2455 | ||
2429 | sgs->group_capacity = | 2456 | sgs->group_capacity = |
2430 | DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | 2457 | DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); |
2458 | if (!sgs->group_capacity) | ||
2459 | sgs->group_capacity = fix_small_capacity(sd, group); | ||
2460 | } | ||
2461 | |||
2462 | /** | ||
2463 | * update_sd_pick_busiest - return 1 on busiest group | ||
2464 | * @sd: sched_domain whose statistics are to be checked | ||
2465 | * @sds: sched_domain statistics | ||
2466 | * @sg: sched_group candidate to be checked for being the busiest | ||
2467 | * @sgs: sched_group statistics | ||
2468 | * @this_cpu: the current cpu | ||
2469 | * | ||
2470 | * Determine if @sg is a busier group than the previously selected | ||
2471 | * busiest group. | ||
2472 | */ | ||
2473 | static bool update_sd_pick_busiest(struct sched_domain *sd, | ||
2474 | struct sd_lb_stats *sds, | ||
2475 | struct sched_group *sg, | ||
2476 | struct sg_lb_stats *sgs, | ||
2477 | int this_cpu) | ||
2478 | { | ||
2479 | if (sgs->avg_load <= sds->max_load) | ||
2480 | return false; | ||
2481 | |||
2482 | if (sgs->sum_nr_running > sgs->group_capacity) | ||
2483 | return true; | ||
2484 | |||
2485 | if (sgs->group_imb) | ||
2486 | return true; | ||
2487 | |||
2488 | /* | ||
2489 | * ASYM_PACKING needs to move all the work to the lowest | ||
2490 | * numbered CPUs in the group, therefore mark all groups | ||
2491 | * higher than ourself as busy. | ||
2492 | */ | ||
2493 | if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && | ||
2494 | this_cpu < group_first_cpu(sg)) { | ||
2495 | if (!sds->busiest) | ||
2496 | return true; | ||
2497 | |||
2498 | if (group_first_cpu(sds->busiest) > group_first_cpu(sg)) | ||
2499 | return true; | ||
2500 | } | ||
2501 | |||
2502 | return false; | ||
2431 | } | 2503 | } |
2432 | 2504 | ||
2433 | /** | 2505 | /** |
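Factoring the test out into update_sd_pick_busiest() also makes the SD_ASYM_PACKING tie-break easier to see: among loaded candidate groups whose first CPU is above this_cpu, the lowest-numbered one ends up as busiest. A toy, self-contained illustration of just that tie-break (hypothetical CPU numbers, and it ignores the avg_load/capacity/imbalance checks that run first):

#include <stdio.h>

/* Toy model: each candidate group is represented only by its first CPU. */
int main(void)
{
	int this_cpu = 0;
	int candidate_first_cpu[] = { 4, 2, 6 };	/* iteration order over groups */
	int busiest_first_cpu = -1;			/* -1 == no busiest yet */

	for (int i = 0; i < 3; i++) {
		int sg = candidate_first_cpu[i];

		if (this_cpu >= sg)
			continue;			/* only pack from higher-numbered CPUs */
		if (busiest_first_cpu < 0 || busiest_first_cpu > sg)
			busiest_first_cpu = sg;		/* keep the lowest-numbered one */
	}

	/* Prints 2: work is pulled from the lowest-numbered loaded group
	 * that still sits above this_cpu. */
	printf("busiest group starts at CPU %d\n", busiest_first_cpu);
	return 0;
}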
@@ -2435,7 +2507,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2435 | * @sd: sched_domain whose statistics are to be updated. | 2507 | * @sd: sched_domain whose statistics are to be updated. |
2436 | * @this_cpu: Cpu for which load balance is currently performed. | 2508 | * @this_cpu: Cpu for which load balance is currently performed. |
2437 | * @idle: Idle status of this_cpu | 2509 | * @idle: Idle status of this_cpu |
2438 | * @sd_idle: Idle status of the sched_domain containing group. | 2510 | * @sd_idle: Idle status of the sched_domain containing sg. |
2439 | * @cpus: Set of cpus considered for load balancing. | 2511 | * @cpus: Set of cpus considered for load balancing. |
2440 | * @balance: Should we balance. | 2512 | * @balance: Should we balance. |
2441 | * @sds: variable to hold the statistics for this sched_domain. | 2513 | * @sds: variable to hold the statistics for this sched_domain. |
@@ -2446,7 +2518,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
2446 | struct sd_lb_stats *sds) | 2518 | struct sd_lb_stats *sds) |
2447 | { | 2519 | { |
2448 | struct sched_domain *child = sd->child; | 2520 | struct sched_domain *child = sd->child; |
2449 | struct sched_group *group = sd->groups; | 2521 | struct sched_group *sg = sd->groups; |
2450 | struct sg_lb_stats sgs; | 2522 | struct sg_lb_stats sgs; |
2451 | int load_idx, prefer_sibling = 0; | 2523 | int load_idx, prefer_sibling = 0; |
2452 | 2524 | ||
@@ -2459,21 +2531,20 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
2459 | do { | 2531 | do { |
2460 | int local_group; | 2532 | int local_group; |
2461 | 2533 | ||
2462 | local_group = cpumask_test_cpu(this_cpu, | 2534 | local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); |
2463 | sched_group_cpus(group)); | ||
2464 | memset(&sgs, 0, sizeof(sgs)); | 2535 | memset(&sgs, 0, sizeof(sgs)); |
2465 | update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, | 2536 | update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, |
2466 | local_group, cpus, balance, &sgs); | 2537 | local_group, cpus, balance, &sgs); |
2467 | 2538 | ||
2468 | if (local_group && !(*balance)) | 2539 | if (local_group && !(*balance)) |
2469 | return; | 2540 | return; |
2470 | 2541 | ||
2471 | sds->total_load += sgs.group_load; | 2542 | sds->total_load += sgs.group_load; |
2472 | sds->total_pwr += group->cpu_power; | 2543 | sds->total_pwr += sg->cpu_power; |
2473 | 2544 | ||
2474 | /* | 2545 | /* |
2475 | * In case the child domain prefers tasks go to siblings | 2546 | * In case the child domain prefers tasks go to siblings |
2476 | * first, lower the group capacity to one so that we'll try | 2547 | * first, lower the sg capacity to one so that we'll try |
2477 | * and move all the excess tasks away. | 2548 | * and move all the excess tasks away. |
2478 | */ | 2549 | */ |
2479 | if (prefer_sibling) | 2550 | if (prefer_sibling) |
@@ -2481,23 +2552,72 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
2481 | 2552 | ||
2482 | if (local_group) { | 2553 | if (local_group) { |
2483 | sds->this_load = sgs.avg_load; | 2554 | sds->this_load = sgs.avg_load; |
2484 | sds->this = group; | 2555 | sds->this = sg; |
2485 | sds->this_nr_running = sgs.sum_nr_running; | 2556 | sds->this_nr_running = sgs.sum_nr_running; |
2486 | sds->this_load_per_task = sgs.sum_weighted_load; | 2557 | sds->this_load_per_task = sgs.sum_weighted_load; |
2487 | } else if (sgs.avg_load > sds->max_load && | 2558 | } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { |
2488 | (sgs.sum_nr_running > sgs.group_capacity || | ||
2489 | sgs.group_imb)) { | ||
2490 | sds->max_load = sgs.avg_load; | 2559 | sds->max_load = sgs.avg_load; |
2491 | sds->busiest = group; | 2560 | sds->busiest = sg; |
2492 | sds->busiest_nr_running = sgs.sum_nr_running; | 2561 | sds->busiest_nr_running = sgs.sum_nr_running; |
2493 | sds->busiest_group_capacity = sgs.group_capacity; | 2562 | sds->busiest_group_capacity = sgs.group_capacity; |
2494 | sds->busiest_load_per_task = sgs.sum_weighted_load; | 2563 | sds->busiest_load_per_task = sgs.sum_weighted_load; |
2495 | sds->group_imb = sgs.group_imb; | 2564 | sds->group_imb = sgs.group_imb; |
2496 | } | 2565 | } |
2497 | 2566 | ||
2498 | update_sd_power_savings_stats(group, sds, local_group, &sgs); | 2567 | update_sd_power_savings_stats(sg, sds, local_group, &sgs); |
2499 | group = group->next; | 2568 | sg = sg->next; |
2500 | } while (group != sd->groups); | 2569 | } while (sg != sd->groups); |
2570 | } | ||
2571 | |||
2572 | int __weak arch_sd_sibling_asym_packing(void) | ||
2573 | { | ||
2574 | return 0*SD_ASYM_PACKING; | ||
2575 | } | ||
2576 | |||
2577 | /** | ||
2578 | * check_asym_packing - Check to see if the group is packed into the | ||
2579 | * sched domain. | ||
2580 | * | ||
2581 | * This is primarily intended to be used at the sibling level. Some | ||
2582 | * cores like POWER7 prefer to use lower numbered SMT threads. In the | ||
2583 | * case of POWER7, it can move to lower SMT modes only when higher | ||
2584 | * threads are idle. When in lower SMT modes, the threads will | ||
2585 | * perform better since they share fewer core resources. Hence when we | ||
2586 | * have idle threads, we want them to be the higher ones. | ||
2587 | * | ||
2588 | * This packing function is run on idle threads. It checks to see if | ||
2589 | * the busiest CPU in this domain (core in the P7 case) has a higher | ||
2590 | * CPU number than the packing function is being run on. Here we are | ||
2591 | * assuming a lower CPU number will be equivalent to a lower SMT thread | ||
2592 | * number. | ||
2593 | * | ||
2594 | * Returns 1 when packing is required and a task should be moved to | ||
2595 | * this CPU. The amount of the imbalance is returned in *imbalance. | ||
2596 | * | ||
2597 | * @sd: The sched_domain whose packing is to be checked. | ||
2598 | * @sds: Statistics of the sched_domain which is to be packed | ||
2599 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | ||
2600 | * @imbalance: returns amount of imbalance due to packing. | ||
2601 | */ | ||
2602 | static int check_asym_packing(struct sched_domain *sd, | ||
2603 | struct sd_lb_stats *sds, | ||
2604 | int this_cpu, unsigned long *imbalance) | ||
2605 | { | ||
2606 | int busiest_cpu; | ||
2607 | |||
2608 | if (!(sd->flags & SD_ASYM_PACKING)) | ||
2609 | return 0; | ||
2610 | |||
2611 | if (!sds->busiest) | ||
2612 | return 0; | ||
2613 | |||
2614 | busiest_cpu = group_first_cpu(sds->busiest); | ||
2615 | if (this_cpu > busiest_cpu) | ||
2616 | return 0; | ||
2617 | |||
2618 | *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, | ||
2619 | SCHED_LOAD_SCALE); | ||
2620 | return 1; | ||
2501 | } | 2621 | } |
2502 | 2622 | ||
2503 | /** | 2623 | /** |
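Since arch_sd_sibling_asym_packing() is a __weak hook returning 0, SD_ASYM_PACKING stays disabled unless an architecture overrides it at the sibling level. An override would look roughly like the POWER7 one in powerpc arch code (sketched here for context; the exact implementation lives in the companion arch patch, not in this diff):

/* Sketch of an architecture override (powerpc/POWER7-style). */
int arch_sd_sibling_asym_packing(void)
{
	if (cpu_has_feature(CPU_FTR_ASYM_SMT))
		return SD_ASYM_PACKING;

	return 0;
}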
@@ -2692,6 +2812,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2692 | if (!(*balance)) | 2812 | if (!(*balance)) |
2693 | goto ret; | 2813 | goto ret; |
2694 | 2814 | ||
2815 | if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && | ||
2816 | check_asym_packing(sd, &sds, this_cpu, imbalance)) | ||
2817 | return sds.busiest; | ||
2818 | |||
2695 | if (!sds.busiest || sds.busiest_nr_running == 0) | 2819 | if (!sds.busiest || sds.busiest_nr_running == 0) |
2696 | goto out_balanced; | 2820 | goto out_balanced; |
2697 | 2821 | ||
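When the early return above fires, *imbalance has already been set by check_asym_packing() to the busiest group's whole load converted back from the per-cpu_power scale, so load_balance() will try to move everything off the higher-numbered sibling. With illustrative numbers:

	max_load = 2048, busiest->cpu_power = 589
	*imbalance = DIV_ROUND_CLOSEST(2048 * 589, 1024) = 1178

which is just the group's weighted load recovered by undoing the avg_load = group_load * SCHED_LOAD_SCALE / cpu_power scaling.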
@@ -2726,8 +2850,9 @@ ret: | |||
2726 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 2850 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
2727 | */ | 2851 | */ |
2728 | static struct rq * | 2852 | static struct rq * |
2729 | find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | 2853 | find_busiest_queue(struct sched_domain *sd, struct sched_group *group, |
2730 | unsigned long imbalance, const struct cpumask *cpus) | 2854 | enum cpu_idle_type idle, unsigned long imbalance, |
2855 | const struct cpumask *cpus) | ||
2731 | { | 2856 | { |
2732 | struct rq *busiest = NULL, *rq; | 2857 | struct rq *busiest = NULL, *rq; |
2733 | unsigned long max_load = 0; | 2858 | unsigned long max_load = 0; |
@@ -2738,6 +2863,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | |||
2738 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | 2863 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); |
2739 | unsigned long wl; | 2864 | unsigned long wl; |
2740 | 2865 | ||
2866 | if (!capacity) | ||
2867 | capacity = fix_small_capacity(sd, group); | ||
2868 | |||
2741 | if (!cpumask_test_cpu(i, cpus)) | 2869 | if (!cpumask_test_cpu(i, cpus)) |
2742 | continue; | 2870 | continue; |
2743 | 2871 | ||
@@ -2777,9 +2905,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | |||
2777 | /* Working cpumask for load_balance and load_balance_newidle. */ | 2905 | /* Working cpumask for load_balance and load_balance_newidle. */ |
2778 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 2906 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); |
2779 | 2907 | ||
2780 | static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) | 2908 | static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, |
2909 | int busiest_cpu, int this_cpu) | ||
2781 | { | 2910 | { |
2782 | if (idle == CPU_NEWLY_IDLE) { | 2911 | if (idle == CPU_NEWLY_IDLE) { |
2912 | |||
2913 | /* | ||
2914 | * ASYM_PACKING needs to force migrate tasks from busy but | ||
2915 | * higher numbered CPUs in order to pack all tasks in the | ||
2916 | * lowest numbered CPUs. | ||
2917 | */ | ||
2918 | if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) | ||
2919 | return 1; | ||
2920 | |||
2783 | /* | 2921 | /* |
2784 | * The only task running in a non-idle cpu can be moved to this | 2922 | * The only task running in a non-idle cpu can be moved to this |
2785 | * cpu in an attempt to completely freeup the other CPU | 2923 | * cpu in an attempt to completely freeup the other CPU |
@@ -2854,7 +2992,7 @@ redo: | |||
2854 | goto out_balanced; | 2992 | goto out_balanced; |
2855 | } | 2993 | } |
2856 | 2994 | ||
2857 | busiest = find_busiest_queue(group, idle, imbalance, cpus); | 2995 | busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); |
2858 | if (!busiest) { | 2996 | if (!busiest) { |
2859 | schedstat_inc(sd, lb_nobusyq[idle]); | 2997 | schedstat_inc(sd, lb_nobusyq[idle]); |
2860 | goto out_balanced; | 2998 | goto out_balanced; |
@@ -2898,7 +3036,8 @@ redo: | |||
2898 | schedstat_inc(sd, lb_failed[idle]); | 3036 | schedstat_inc(sd, lb_failed[idle]); |
2899 | sd->nr_balance_failed++; | 3037 | sd->nr_balance_failed++; |
2900 | 3038 | ||
2901 | if (need_active_balance(sd, sd_idle, idle)) { | 3039 | if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), |
3040 | this_cpu)) { | ||
2902 | raw_spin_lock_irqsave(&busiest->lock, flags); | 3041 | raw_spin_lock_irqsave(&busiest->lock, flags); |
2903 | 3042 | ||
2904 | /* don't kick the active_load_balance_cpu_stop, | 3043 | /* don't kick the active_load_balance_cpu_stop, |
@@ -3093,13 +3232,40 @@ out_unlock: | |||
3093 | } | 3232 | } |
3094 | 3233 | ||
3095 | #ifdef CONFIG_NO_HZ | 3234 | #ifdef CONFIG_NO_HZ |
3235 | |||
3236 | static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb); | ||
3237 | |||
3238 | static void trigger_sched_softirq(void *data) | ||
3239 | { | ||
3240 | raise_softirq_irqoff(SCHED_SOFTIRQ); | ||
3241 | } | ||
3242 | |||
3243 | static inline void init_sched_softirq_csd(struct call_single_data *csd) | ||
3244 | { | ||
3245 | csd->func = trigger_sched_softirq; | ||
3246 | csd->info = NULL; | ||
3247 | csd->flags = 0; | ||
3248 | csd->priv = 0; | ||
3249 | } | ||
3250 | |||
3251 | /* | ||
3252 | * idle load balancing details | ||
3253 | * - One of the idle CPUs nominates itself as idle load_balancer, while | ||
3254 | * entering idle. | ||
3255 | * - This idle load balancer CPU will also go into tickless mode when | ||
3256 | * it is idle, just like all other idle CPUs | ||
3257 | * - When one of the busy CPUs notices that there may be an idle rebalancing | ||
3258 | * needed, it will kick the idle load balancer, which then does idle | ||
3259 | * load balancing for all the idle CPUs. | ||
3260 | */ | ||
3096 | static struct { | 3261 | static struct { |
3097 | atomic_t load_balancer; | 3262 | atomic_t load_balancer; |
3098 | cpumask_var_t cpu_mask; | 3263 | atomic_t first_pick_cpu; |
3099 | cpumask_var_t ilb_grp_nohz_mask; | 3264 | atomic_t second_pick_cpu; |
3100 | } nohz ____cacheline_aligned = { | 3265 | cpumask_var_t idle_cpus_mask; |
3101 | .load_balancer = ATOMIC_INIT(-1), | 3266 | cpumask_var_t grp_idle_mask; |
3102 | }; | 3267 | unsigned long next_balance; /* in jiffy units */ |
3268 | } nohz ____cacheline_aligned; | ||
3103 | 3269 | ||
3104 | int get_nohz_load_balancer(void) | 3270 | int get_nohz_load_balancer(void) |
3105 | { | 3271 | { |
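Because "no CPU" is now encoded as nr_cpu_ids (which is not a compile-time constant), the static ATOMIC_INIT(-1) initializer is gone and the nohz fields have to be set up at boot instead. In this series that initialization is expected to live in sched_init() in kernel/sched.c, roughly along these lines (shown for context only; it is not part of this file's diff):

#ifdef CONFIG_NO_HZ
	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
	alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
	atomic_set(&nohz.load_balancer, nr_cpu_ids);
	atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
	atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
#endif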
@@ -3153,17 +3319,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | |||
3153 | */ | 3319 | */ |
3154 | static inline int is_semi_idle_group(struct sched_group *ilb_group) | 3320 | static inline int is_semi_idle_group(struct sched_group *ilb_group) |
3155 | { | 3321 | { |
3156 | cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, | 3322 | cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask, |
3157 | sched_group_cpus(ilb_group)); | 3323 | sched_group_cpus(ilb_group)); |
3158 | 3324 | ||
3159 | /* | 3325 | /* |
3160 | * A sched_group is semi-idle when it has atleast one busy cpu | 3326 | * A sched_group is semi-idle when it has atleast one busy cpu |
3161 | * and atleast one idle cpu. | 3327 | * and atleast one idle cpu. |
3162 | */ | 3328 | */ |
3163 | if (cpumask_empty(nohz.ilb_grp_nohz_mask)) | 3329 | if (cpumask_empty(nohz.grp_idle_mask)) |
3164 | return 0; | 3330 | return 0; |
3165 | 3331 | ||
3166 | if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) | 3332 | if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group))) |
3167 | return 0; | 3333 | return 0; |
3168 | 3334 | ||
3169 | return 1; | 3335 | return 1; |
@@ -3196,7 +3362,7 @@ static int find_new_ilb(int cpu) | |||
3196 | * Optimize for the case when we have no idle CPUs or only one | 3362 | * Optimize for the case when we have no idle CPUs or only one |
3197 | * idle CPU. Don't walk the sched_domain hierarchy in such cases | 3363 | * idle CPU. Don't walk the sched_domain hierarchy in such cases |
3198 | */ | 3364 | */ |
3199 | if (cpumask_weight(nohz.cpu_mask) < 2) | 3365 | if (cpumask_weight(nohz.idle_cpus_mask) < 2) |
3200 | goto out_done; | 3366 | goto out_done; |
3201 | 3367 | ||
3202 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | 3368 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { |
@@ -3204,7 +3370,7 @@ static int find_new_ilb(int cpu) | |||
3204 | 3370 | ||
3205 | do { | 3371 | do { |
3206 | if (is_semi_idle_group(ilb_group)) | 3372 | if (is_semi_idle_group(ilb_group)) |
3207 | return cpumask_first(nohz.ilb_grp_nohz_mask); | 3373 | return cpumask_first(nohz.grp_idle_mask); |
3208 | 3374 | ||
3209 | ilb_group = ilb_group->next; | 3375 | ilb_group = ilb_group->next; |
3210 | 3376 | ||
@@ -3212,98 +3378,116 @@ static int find_new_ilb(int cpu) | |||
3212 | } | 3378 | } |
3213 | 3379 | ||
3214 | out_done: | 3380 | out_done: |
3215 | return cpumask_first(nohz.cpu_mask); | 3381 | return nr_cpu_ids; |
3216 | } | 3382 | } |
3217 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | 3383 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ |
3218 | static inline int find_new_ilb(int call_cpu) | 3384 | static inline int find_new_ilb(int call_cpu) |
3219 | { | 3385 | { |
3220 | return cpumask_first(nohz.cpu_mask); | 3386 | return nr_cpu_ids; |
3221 | } | 3387 | } |
3222 | #endif | 3388 | #endif |
3223 | 3389 | ||
3224 | /* | 3390 | /* |
3391 | * Kick a CPU to do the nohz balancing, if it is time for it. We pick the | ||
3392 | * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle | ||
3393 | * CPU (if there is one). | ||
3394 | */ | ||
3395 | static void nohz_balancer_kick(int cpu) | ||
3396 | { | ||
3397 | int ilb_cpu; | ||
3398 | |||
3399 | nohz.next_balance++; | ||
3400 | |||
3401 | ilb_cpu = get_nohz_load_balancer(); | ||
3402 | |||
3403 | if (ilb_cpu >= nr_cpu_ids) { | ||
3404 | ilb_cpu = cpumask_first(nohz.idle_cpus_mask); | ||
3405 | if (ilb_cpu >= nr_cpu_ids) | ||
3406 | return; | ||
3407 | } | ||
3408 | |||
3409 | if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { | ||
3410 | struct call_single_data *cp; | ||
3411 | |||
3412 | cpu_rq(ilb_cpu)->nohz_balance_kick = 1; | ||
3413 | cp = &per_cpu(remote_sched_softirq_cb, cpu); | ||
3414 | __smp_call_function_single(ilb_cpu, cp, 0); | ||
3415 | } | ||
3416 | return; | ||
3417 | } | ||
3418 | |||
3419 | /* | ||
3225 | * This routine will try to nominate the ilb (idle load balancing) | 3420 | * This routine will try to nominate the ilb (idle load balancing) |
3226 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle | 3421 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle |
3227 | * load balancing on behalf of all those cpus. If all the cpus in the system | 3422 | * load balancing on behalf of all those cpus. |
3228 | * go into this tickless mode, then there will be no ilb owner (as there is | ||
3229 | * no need for one) and all the cpus will sleep till the next wakeup event | ||
3230 | * arrives... | ||
3231 | * | ||
3232 | * For the ilb owner, tick is not stopped. And this tick will be used | ||
3233 | * for idle load balancing. ilb owner will still be part of | ||
3234 | * nohz.cpu_mask.. | ||
3235 | * | 3423 | * |
3236 | * While stopping the tick, this cpu will become the ilb owner if there | 3424 | * When the ilb owner becomes busy, we will not have new ilb owner until some |
3237 | * is no other owner. And will be the owner till that cpu becomes busy | 3425 | * idle CPU wakes up and goes back to idle or some busy CPU tries to kick |
3238 | * or if all cpus in the system stop their ticks at which point | 3426 | * idle load balancing by kicking one of the idle CPUs. |
3239 | * there is no need for ilb owner. | ||
3240 | * | 3427 | * |
3241 | * When the ilb owner becomes busy, it nominates another owner, during the | 3428 | * Ticks are stopped for the ilb owner as well, with busy CPU kicking this |
3242 | * next busy scheduler_tick() | 3429 | * ilb owner CPU in future (when there is a need for idle load balancing on |
3430 | * behalf of all idle CPUs). | ||
3243 | */ | 3431 | */ |
3244 | int select_nohz_load_balancer(int stop_tick) | 3432 | void select_nohz_load_balancer(int stop_tick) |
3245 | { | 3433 | { |
3246 | int cpu = smp_processor_id(); | 3434 | int cpu = smp_processor_id(); |
3247 | 3435 | ||
3248 | if (stop_tick) { | 3436 | if (stop_tick) { |
3249 | cpu_rq(cpu)->in_nohz_recently = 1; | ||
3250 | |||
3251 | if (!cpu_active(cpu)) { | 3437 | if (!cpu_active(cpu)) { |
3252 | if (atomic_read(&nohz.load_balancer) != cpu) | 3438 | if (atomic_read(&nohz.load_balancer) != cpu) |
3253 | return 0; | 3439 | return; |
3254 | 3440 | ||
3255 | /* | 3441 | /* |
3256 | * If we are going offline and still the leader, | 3442 | * If we are going offline and still the leader, |
3257 | * give up! | 3443 | * give up! |
3258 | */ | 3444 | */ |
3259 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | 3445 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, |
3446 | nr_cpu_ids) != cpu) | ||
3260 | BUG(); | 3447 | BUG(); |
3261 | 3448 | ||
3262 | return 0; | 3449 | return; |
3263 | } | 3450 | } |
3264 | 3451 | ||
3265 | cpumask_set_cpu(cpu, nohz.cpu_mask); | 3452 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); |
3266 | 3453 | ||
3267 | /* time for ilb owner also to sleep */ | 3454 | if (atomic_read(&nohz.first_pick_cpu) == cpu) |
3268 | if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { | 3455 | atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); |
3269 | if (atomic_read(&nohz.load_balancer) == cpu) | 3456 | if (atomic_read(&nohz.second_pick_cpu) == cpu) |
3270 | atomic_set(&nohz.load_balancer, -1); | 3457 | atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); |
3271 | return 0; | ||
3272 | } | ||
3273 | 3458 | ||
3274 | if (atomic_read(&nohz.load_balancer) == -1) { | 3459 | if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { |
3275 | /* make me the ilb owner */ | ||
3276 | if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) | ||
3277 | return 1; | ||
3278 | } else if (atomic_read(&nohz.load_balancer) == cpu) { | ||
3279 | int new_ilb; | 3460 | int new_ilb; |
3280 | 3461 | ||
3281 | if (!(sched_smt_power_savings || | 3462 | /* make me the ilb owner */ |
3282 | sched_mc_power_savings)) | 3463 | if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, |
3283 | return 1; | 3464 | cpu) != nr_cpu_ids) |
3465 | return; | ||
3466 | |||
3284 | /* | 3467 | /* |
3285 | * Check to see if there is a more power-efficient | 3468 | * Check to see if there is a more power-efficient |
3286 | * ilb. | 3469 | * ilb. |
3287 | */ | 3470 | */ |
3288 | new_ilb = find_new_ilb(cpu); | 3471 | new_ilb = find_new_ilb(cpu); |
3289 | if (new_ilb < nr_cpu_ids && new_ilb != cpu) { | 3472 | if (new_ilb < nr_cpu_ids && new_ilb != cpu) { |
3290 | atomic_set(&nohz.load_balancer, -1); | 3473 | atomic_set(&nohz.load_balancer, nr_cpu_ids); |
3291 | resched_cpu(new_ilb); | 3474 | resched_cpu(new_ilb); |
3292 | return 0; | 3475 | return; |
3293 | } | 3476 | } |
3294 | return 1; | 3477 | return; |
3295 | } | 3478 | } |
3296 | } else { | 3479 | } else { |
3297 | if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) | 3480 | if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) |
3298 | return 0; | 3481 | return; |
3299 | 3482 | ||
3300 | cpumask_clear_cpu(cpu, nohz.cpu_mask); | 3483 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); |
3301 | 3484 | ||
3302 | if (atomic_read(&nohz.load_balancer) == cpu) | 3485 | if (atomic_read(&nohz.load_balancer) == cpu) |
3303 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | 3486 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, |
3487 | nr_cpu_ids) != cpu) | ||
3304 | BUG(); | 3488 | BUG(); |
3305 | } | 3489 | } |
3306 | return 0; | 3490 | return; |
3307 | } | 3491 | } |
3308 | #endif | 3492 | #endif |
3309 | 3493 | ||
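Throughout the rewritten select_nohz_load_balancer(), the old -1 sentinel for "no ilb owner" is replaced by nr_cpu_ids, and ownership is still claimed with a cmpxchg so that only one idle CPU can win the slot. A minimal userspace model of that claim, with a GCC builtin standing in for the kernel's atomic_cmpxchg() (toy code, not kernel code):

#include <stdio.h>

#define NR_CPU_IDS 8			/* stand-in for nr_cpu_ids */

static int load_balancer = NR_CPU_IDS;	/* NR_CPU_IDS means "no owner" */

/* Returns 1 if @cpu became the idle load balancer, 0 if someone else owns it. */
static int claim_ilb(int cpu)
{
	int expected = NR_CPU_IDS;

	/* Kernel equivalent: atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, cpu) */
	return __atomic_compare_exchange_n(&load_balancer, &expected, cpu,
					   0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}

int main(void)
{
	printf("cpu 3 claims: %d\n", claim_ilb(3));	/* 1: CPU 3 becomes the owner */
	printf("cpu 5 claims: %d\n", claim_ilb(5));	/* 0: CPU 3 already owns it */
	return 0;
}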
@@ -3385,11 +3569,102 @@ out: | |||
3385 | rq->next_balance = next_balance; | 3569 | rq->next_balance = next_balance; |
3386 | } | 3570 | } |
3387 | 3571 | ||
3572 | #ifdef CONFIG_NO_HZ | ||
3388 | /* | 3573 | /* |
3389 | * run_rebalance_domains is triggered when needed from the scheduler tick. | 3574 | * In CONFIG_NO_HZ case, the idle balance kickee will do the |
3390 | * In CONFIG_NO_HZ case, the idle load balance owner will do the | ||
3391 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | 3575 | * rebalancing for all the cpus for whom scheduler ticks are stopped. |
3392 | */ | 3576 | */ |
3577 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | ||
3578 | { | ||
3579 | struct rq *this_rq = cpu_rq(this_cpu); | ||
3580 | struct rq *rq; | ||
3581 | int balance_cpu; | ||
3582 | |||
3583 | if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) | ||
3584 | return; | ||
3585 | |||
3586 | for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { | ||
3587 | if (balance_cpu == this_cpu) | ||
3588 | continue; | ||
3589 | |||
3590 | /* | ||
3591 | * If this cpu gets work to do, stop the load balancing | ||
3592 | * work being done for other cpus. Next load | ||
3593 | * balancing owner will pick it up. | ||
3594 | */ | ||
3595 | if (need_resched()) { | ||
3596 | this_rq->nohz_balance_kick = 0; | ||
3597 | break; | ||
3598 | } | ||
3599 | |||
3600 | raw_spin_lock_irq(&this_rq->lock); | ||
3601 | update_rq_clock(this_rq); | ||
3602 | update_cpu_load(this_rq); | ||
3603 | raw_spin_unlock_irq(&this_rq->lock); | ||
3604 | |||
3605 | rebalance_domains(balance_cpu, CPU_IDLE); | ||
3606 | |||
3607 | rq = cpu_rq(balance_cpu); | ||
3608 | if (time_after(this_rq->next_balance, rq->next_balance)) | ||
3609 | this_rq->next_balance = rq->next_balance; | ||
3610 | } | ||
3611 | nohz.next_balance = this_rq->next_balance; | ||
3612 | this_rq->nohz_balance_kick = 0; | ||
3613 | } | ||
3614 | |||
3615 | /* | ||
3616 | * Current heuristic for kicking the idle load balancer | ||
3617 | * - first_pick_cpu is one of the busy CPUs. It will kick | ||
3618 | * idle load balancer when it has more than one process active. This | ||
3619 | * eliminates the need for idle load balancing altogether when we have | ||
3620 | * only one running process in the system (common case). | ||
3621 | * - If there is more than one busy CPU, the idle load balancer may have | ||
3622 | * to run for active_load_balance to happen (i.e., two busy CPUs are | ||
3623 | * SMT or core siblings and can run better if they move to different | ||
3624 | * physical CPUs). So, second_pick_cpu is the second of the busy CPUs | ||
3625 | * which will kick idle load balancer as soon as it has any load. | ||
3626 | */ | ||
3627 | static inline int nohz_kick_needed(struct rq *rq, int cpu) | ||
3628 | { | ||
3629 | unsigned long now = jiffies; | ||
3630 | int ret; | ||
3631 | int first_pick_cpu, second_pick_cpu; | ||
3632 | |||
3633 | if (time_before(now, nohz.next_balance)) | ||
3634 | return 0; | ||
3635 | |||
3636 | if (!rq->nr_running) | ||
3637 | return 0; | ||
3638 | |||
3639 | first_pick_cpu = atomic_read(&nohz.first_pick_cpu); | ||
3640 | second_pick_cpu = atomic_read(&nohz.second_pick_cpu); | ||
3641 | |||
3642 | if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && | ||
3643 | second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu) | ||
3644 | return 0; | ||
3645 | |||
3646 | ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); | ||
3647 | if (ret == nr_cpu_ids || ret == cpu) { | ||
3648 | atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); | ||
3649 | if (rq->nr_running > 1) | ||
3650 | return 1; | ||
3651 | } else { | ||
3652 | ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); | ||
3653 | if (ret == nr_cpu_ids || ret == cpu) { | ||
3654 | if (rq->nr_running) | ||
3655 | return 1; | ||
3656 | } | ||
3657 | } | ||
3658 | return 0; | ||
3659 | } | ||
3660 | #else | ||
3661 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } | ||
3662 | #endif | ||
3663 | |||
3664 | /* | ||
3665 | * run_rebalance_domains is triggered when needed from the scheduler tick. | ||
3666 | * Also triggered for nohz idle balancing (with nohz_balancing_kick set). | ||
3667 | */ | ||
3393 | static void run_rebalance_domains(struct softirq_action *h) | 3668 | static void run_rebalance_domains(struct softirq_action *h) |
3394 | { | 3669 | { |
3395 | int this_cpu = smp_processor_id(); | 3670 | int this_cpu = smp_processor_id(); |
@@ -3399,37 +3674,12 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
3399 | 3674 | ||
3400 | rebalance_domains(this_cpu, idle); | 3675 | rebalance_domains(this_cpu, idle); |
3401 | 3676 | ||
3402 | #ifdef CONFIG_NO_HZ | ||
3403 | /* | 3677 | /* |
3404 | * If this cpu is the owner for idle load balancing, then do the | 3678 | * If this cpu has a pending nohz_balance_kick, then do the |
3405 | * balancing on behalf of the other idle cpus whose ticks are | 3679 | * balancing on behalf of the other idle cpus whose ticks are |
3406 | * stopped. | 3680 | * stopped. |
3407 | */ | 3681 | */ |
3408 | if (this_rq->idle_at_tick && | 3682 | nohz_idle_balance(this_cpu, idle); |
3409 | atomic_read(&nohz.load_balancer) == this_cpu) { | ||
3410 | struct rq *rq; | ||
3411 | int balance_cpu; | ||
3412 | |||
3413 | for_each_cpu(balance_cpu, nohz.cpu_mask) { | ||
3414 | if (balance_cpu == this_cpu) | ||
3415 | continue; | ||
3416 | |||
3417 | /* | ||
3418 | * If this cpu gets work to do, stop the load balancing | ||
3419 | * work being done for other cpus. Next load | ||
3420 | * balancing owner will pick it up. | ||
3421 | */ | ||
3422 | if (need_resched()) | ||
3423 | break; | ||
3424 | |||
3425 | rebalance_domains(balance_cpu, CPU_IDLE); | ||
3426 | |||
3427 | rq = cpu_rq(balance_cpu); | ||
3428 | if (time_after(this_rq->next_balance, rq->next_balance)) | ||
3429 | this_rq->next_balance = rq->next_balance; | ||
3430 | } | ||
3431 | } | ||
3432 | #endif | ||
3433 | } | 3683 | } |
3434 | 3684 | ||
3435 | static inline int on_null_domain(int cpu) | 3685 | static inline int on_null_domain(int cpu) |
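Putting nohz_kick_needed() and nohz_balancer_kick() together, at most two busy CPUs hold the "pick" slots at any time: the first kicks the idle load balancer only once it has more than one runnable task, the second kicks as soon as it has any load. A compressed walk-through under assumed conditions (CPUs 2 and 5 busy, all slots starting at nr_cpu_ids, and jiffies already past nohz.next_balance):

	CPU 2 tick:  claims first_pick_cpu (nr_cpu_ids -> 2); nr_running == 1, so no kick yet
	CPU 5 tick:  first_pick_cpu is already 2, so it claims second_pick_cpu (-> 5);
	             it has load, so it kicks the ilb CPU via nohz_balancer_kick()
	CPU 2 tick:  nr_running has grown to 3; as first_pick_cpu it now kicks as well
	third busy CPU:  both slots are held by other CPUs, so nohz_kick_needed() returns 0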
@@ -3439,57 +3689,17 @@ static inline int on_null_domain(int cpu) | |||
3439 | 3689 | ||
3440 | /* | 3690 | /* |
3441 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. | 3691 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. |
3442 | * | ||
3443 | * In case of CONFIG_NO_HZ, this is the place where we nominate a new | ||
3444 | * idle load balancing owner or decide to stop the periodic load balancing, | ||
3445 | * if the whole system is idle. | ||
3446 | */ | 3692 | */ |
3447 | static inline void trigger_load_balance(struct rq *rq, int cpu) | 3693 | static inline void trigger_load_balance(struct rq *rq, int cpu) |
3448 | { | 3694 | { |
3449 | #ifdef CONFIG_NO_HZ | ||
3450 | /* | ||
3451 | * If we were in the nohz mode recently and busy at the current | ||
3452 | * scheduler tick, then check if we need to nominate new idle | ||
3453 | * load balancer. | ||
3454 | */ | ||
3455 | if (rq->in_nohz_recently && !rq->idle_at_tick) { | ||
3456 | rq->in_nohz_recently = 0; | ||
3457 | |||
3458 | if (atomic_read(&nohz.load_balancer) == cpu) { | ||
3459 | cpumask_clear_cpu(cpu, nohz.cpu_mask); | ||
3460 | atomic_set(&nohz.load_balancer, -1); | ||
3461 | } | ||
3462 | |||
3463 | if (atomic_read(&nohz.load_balancer) == -1) { | ||
3464 | int ilb = find_new_ilb(cpu); | ||
3465 | |||
3466 | if (ilb < nr_cpu_ids) | ||
3467 | resched_cpu(ilb); | ||
3468 | } | ||
3469 | } | ||
3470 | |||
3471 | /* | ||
3472 | * If this cpu is idle and doing idle load balancing for all the | ||
3473 | * cpus with ticks stopped, is it time for that to stop? | ||
3474 | */ | ||
3475 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && | ||
3476 | cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { | ||
3477 | resched_cpu(cpu); | ||
3478 | return; | ||
3479 | } | ||
3480 | |||
3481 | /* | ||
3482 | * If this cpu is idle and the idle load balancing is done by | ||
3483 | * someone else, then no need raise the SCHED_SOFTIRQ | ||
3484 | */ | ||
3485 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && | ||
3486 | cpumask_test_cpu(cpu, nohz.cpu_mask)) | ||
3487 | return; | ||
3488 | #endif | ||
3489 | /* Don't need to rebalance while attached to NULL domain */ | 3695 | /* Don't need to rebalance while attached to NULL domain */ |
3490 | if (time_after_eq(jiffies, rq->next_balance) && | 3696 | if (time_after_eq(jiffies, rq->next_balance) && |
3491 | likely(!on_null_domain(cpu))) | 3697 | likely(!on_null_domain(cpu))) |
3492 | raise_softirq(SCHED_SOFTIRQ); | 3698 | raise_softirq(SCHED_SOFTIRQ); |
3699 | #ifdef CONFIG_NO_HZ | ||
3700 | else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) | ||
3701 | nohz_balancer_kick(cpu); | ||
3702 | #endif | ||
3493 | } | 3703 | } |
3494 | 3704 | ||
3495 | static void rq_online_fair(struct rq *rq) | 3705 | static void rq_online_fair(struct rq *rq) |