Diffstat (limited to 'kernel/sched_fair.c')
 kernel/sched_fair.c | 534 ++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 373 insertions(+), 161 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index eed35eded602..806d1b227a21 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1240,6 +1240,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
          * effect of the currently running task from the load
          * of the current CPU:
          */
+        rcu_read_lock();
         if (sync) {
                 tg = task_group(current);
                 weight = current->se.load.weight;
@@ -1275,6 +1276,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
                 balanced = this_eff_load <= prev_eff_load;
         } else
                 balanced = true;
+        rcu_read_unlock();
 
         /*
          * If the currently running task will sleep within
@@ -2285,13 +2287,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
         unsigned long power = SCHED_LOAD_SCALE;
         struct sched_group *sdg = sd->groups;
 
-        if (sched_feat(ARCH_POWER))
-                power *= arch_scale_freq_power(sd, cpu);
-        else
-                power *= default_scale_freq_power(sd, cpu);
-
-        power >>= SCHED_LOAD_SHIFT;
-
         if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
                 if (sched_feat(ARCH_POWER))
                         power *= arch_scale_smt_power(sd, cpu);
@@ -2301,6 +2296,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
                 power >>= SCHED_LOAD_SHIFT;
         }
 
+        sdg->cpu_power_orig = power;
+
+        if (sched_feat(ARCH_POWER))
+                power *= arch_scale_freq_power(sd, cpu);
+        else
+                power *= default_scale_freq_power(sd, cpu);
+
+        power >>= SCHED_LOAD_SHIFT;
+
         power *= scale_rt_power(cpu);
         power >>= SCHED_LOAD_SHIFT;
 
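For reference (stand-alone sketch, not part of the patch): after this change the SMT factor is applied first, the result is recorded in cpu_power_orig, and only then are the frequency and RT/IRQ factors folded in. With SCHED_LOAD_SCALE = 1024 the fixed-point pipeline looks as follows; the individual scale factors are made-up example values.

/* Illustration only: the cpu_power fixed-point pipeline, assumed example factors. */
#include <stdio.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)

int main(void)
{
        unsigned long power = SCHED_LOAD_SCALE;
        unsigned long smt_scale = 589;   /* assumed: one thread of a 2-way SMT core */
        unsigned long freq_scale = 1024; /* assumed: running at full frequency */
        unsigned long rt_scale = 921;    /* assumed: ~10% of time taken by RT/IRQ */
        unsigned long cpu_power_orig;

        power = (power * smt_scale) >> SCHED_LOAD_SHIFT;
        cpu_power_orig = power;          /* recorded before freq/RT scaling, as above */

        power = (power * freq_scale) >> SCHED_LOAD_SHIFT;
        power = (power * rt_scale) >> SCHED_LOAD_SHIFT;

        printf("cpu_power_orig=%lu cpu_power=%lu\n", cpu_power_orig, power);
        return 0;
}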
@@ -2333,6 +2337,31 @@ static void update_group_power(struct sched_domain *sd, int cpu)
         sdg->cpu_power = power;
 }
 
+/*
+ * Try and fix up capacity for tiny siblings, this is needed when
+ * things like SD_ASYM_PACKING need f_b_g to select another sibling
+ * which on its own isn't powerful enough.
+ *
+ * See update_sd_pick_busiest() and check_asym_packing().
+ */
+static inline int
+fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
+{
+        /*
+         * Only siblings can have significantly less than SCHED_LOAD_SCALE
+         */
+        if (sd->level != SD_LV_SIBLING)
+                return 0;
+
+        /*
+         * If ~90% of the cpu_power is still there, we're good.
+         */
+        if (group->cpu_power * 32 > group->cpu_power_orig * 29)
+                return 1;
+
+        return 0;
+}
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @sd: The sched_domain whose statistics are to be updated.
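For reference, the 32/29 comparison in fix_small_capacity() above is a fixed-point way of asking whether roughly 90% (29/32 ≈ 90.6%) of the group's original power survived the frequency and RT scaling. A stand-alone sketch with assumed numbers:

/* Illustration only: the ~90% test from fix_small_capacity() as plain arithmetic. */
#include <stdio.h>

static int capacity_mostly_intact(unsigned long cpu_power, unsigned long cpu_power_orig)
{
        /* cpu_power * 32 > cpu_power_orig * 29  <=>  cpu_power > ~90.6% of cpu_power_orig */
        return cpu_power * 32 > cpu_power_orig * 29;
}

int main(void)
{
        /* assumed values for an SMT sibling whose capacity rounded down to 0 */
        printf("%d\n", capacity_mostly_intact(535, 589)); /* ~90.8%: bumped to capacity 1 */
        printf("%d\n", capacity_mostly_intact(480, 589)); /* ~81.5%: left at capacity 0 */
        return 0;
}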
@@ -2398,14 +2427,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
          * domains. In the newly idle case, we will allow all the cpu's
          * to do the newly idle load balance.
          */
-        if (idle != CPU_NEWLY_IDLE && local_group &&
-            balance_cpu != this_cpu) {
-                *balance = 0;
-                return;
+        if (idle != CPU_NEWLY_IDLE && local_group) {
+                if (balance_cpu != this_cpu) {
+                        *balance = 0;
+                        return;
+                }
+                update_group_power(sd, this_cpu);
         }
 
-        update_group_power(sd, this_cpu);
-
         /* Adjust by relative CPU power of the group */
         sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
 
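For reference, avg_load is the group's raw load normalized by its cpu_power, so groups of different size or strength compare on the same SCHED_LOAD_SCALE footing; after this hunk the group power refresh also happens only on the local group's balance CPU. A stand-alone sketch of the normalization with assumed numbers:

/* Illustration only: avg_load normalization, SCHED_LOAD_SCALE = 1024. */
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

int main(void)
{
        /* assumed: 3072 units of load on a group worth 2048 cpu_power (two full CPUs) */
        unsigned long group_load = 3072, cpu_power = 2048;
        unsigned long avg_load = (group_load * SCHED_LOAD_SCALE) / cpu_power;

        printf("avg_load=%lu\n", avg_load); /* 1536, i.e. 1.5 CPUs worth of load */
        return 0;
}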
@@ -2426,6 +2455,51 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 
         sgs->group_capacity =
                 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+        if (!sgs->group_capacity)
+                sgs->group_capacity = fix_small_capacity(sd, group);
+}
+
+/**
+ * update_sd_pick_busiest - return 1 on busiest group
+ * @sd: sched_domain whose statistics are to be checked
+ * @sds: sched_domain statistics
+ * @sg: sched_group candidate to be checked for being the busiest
+ * @sgs: sched_group statistics
+ * @this_cpu: the current cpu
+ *
+ * Determine if @sg is a busier group than the previously selected
+ * busiest group.
+ */
+static bool update_sd_pick_busiest(struct sched_domain *sd,
+                                   struct sd_lb_stats *sds,
+                                   struct sched_group *sg,
+                                   struct sg_lb_stats *sgs,
+                                   int this_cpu)
+{
+        if (sgs->avg_load <= sds->max_load)
+                return false;
+
+        if (sgs->sum_nr_running > sgs->group_capacity)
+                return true;
+
+        if (sgs->group_imb)
+                return true;
+
+        /*
+         * ASYM_PACKING needs to move all the work to the lowest
+         * numbered CPUs in the group, therefore mark all groups
+         * higher than ourself as busy.
+         */
+        if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
+            this_cpu < group_first_cpu(sg)) {
+                if (!sds->busiest)
+                        return true;
+
+                if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
+                        return true;
+        }
+
+        return false;
 }
 
 /**
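For reference, update_sd_pick_busiest() above factors out the old inline "busier than the current busiest" test and adds the ASYM_PACKING rule. The stand-alone model below mirrors its decision order with stripped-down types; it is an illustration, not the kernel's implementation.

/* Illustration only: the decision order of update_sd_pick_busiest(), simplified. */
#include <stdbool.h>
#include <stdio.h>

struct sgs { unsigned long avg_load, sum_nr_running, group_capacity; int group_imb; };

static bool pick_busiest(unsigned long max_load_so_far, const struct sgs *sgs,
                         int asym_packing, int this_cpu, int group_first_cpu,
                         int have_busiest, int busiest_first_cpu)
{
        if (sgs->avg_load <= max_load_so_far)
                return false;
        if (sgs->sum_nr_running > sgs->group_capacity)
                return true;
        if (sgs->group_imb)
                return true;
        /* ASYM_PACKING: a loaded group of higher-numbered CPUs than us counts as busy */
        if (asym_packing && sgs->sum_nr_running && this_cpu < group_first_cpu) {
                if (!have_busiest)
                        return true;
                if (busiest_first_cpu > group_first_cpu)
                        return true;
        }
        return false;
}

int main(void)
{
        /* assumed scenario: CPU 0 balancing, candidate group starting at CPU 2 */
        struct sgs s = { .avg_load = 900, .sum_nr_running = 1, .group_capacity = 1 };

        printf("%d\n", pick_busiest(0, &s, 1, 0, 2, 0, -1)); /* 1: pack work toward CPU 0 */
        return 0;
}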
@@ -2433,7 +2507,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
  * @sd: sched_domain whose statistics are to be updated.
  * @this_cpu: Cpu for which load balance is currently performed.
  * @idle: Idle status of this_cpu
- * @sd_idle: Idle status of the sched_domain containing group.
+ * @sd_idle: Idle status of the sched_domain containing sg.
  * @cpus: Set of cpus considered for load balancing.
  * @balance: Should we balance.
  * @sds: variable to hold the statistics for this sched_domain.
@@ -2444,7 +2518,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
                         struct sd_lb_stats *sds)
 {
         struct sched_domain *child = sd->child;
-        struct sched_group *group = sd->groups;
+        struct sched_group *sg = sd->groups;
         struct sg_lb_stats sgs;
         int load_idx, prefer_sibling = 0;
 
@@ -2457,21 +2531,20 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
         do {
                 int local_group;
 
-                local_group = cpumask_test_cpu(this_cpu,
-                                               sched_group_cpus(group));
+                local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
                 memset(&sgs, 0, sizeof(sgs));
-                update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
+                update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
                                    local_group, cpus, balance, &sgs);
 
                 if (local_group && !(*balance))
                         return;
 
                 sds->total_load += sgs.group_load;
-                sds->total_pwr += group->cpu_power;
+                sds->total_pwr += sg->cpu_power;
 
                 /*
                  * In case the child domain prefers tasks go to siblings
-                 * first, lower the group capacity to one so that we'll try
+                 * first, lower the sg capacity to one so that we'll try
                  * and move all the excess tasks away.
                  */
                 if (prefer_sibling)
@@ -2479,23 +2552,72 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 
                 if (local_group) {
                         sds->this_load = sgs.avg_load;
-                        sds->this = group;
+                        sds->this = sg;
                         sds->this_nr_running = sgs.sum_nr_running;
                         sds->this_load_per_task = sgs.sum_weighted_load;
-                } else if (sgs.avg_load > sds->max_load &&
-                           (sgs.sum_nr_running > sgs.group_capacity ||
-                            sgs.group_imb)) {
+                } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
                         sds->max_load = sgs.avg_load;
-                        sds->busiest = group;
+                        sds->busiest = sg;
                         sds->busiest_nr_running = sgs.sum_nr_running;
                         sds->busiest_group_capacity = sgs.group_capacity;
                         sds->busiest_load_per_task = sgs.sum_weighted_load;
                         sds->group_imb = sgs.group_imb;
                 }
 
-                update_sd_power_savings_stats(group, sds, local_group, &sgs);
-                group = group->next;
-        } while (group != sd->groups);
+                update_sd_power_savings_stats(sg, sds, local_group, &sgs);
+                sg = sg->next;
+        } while (sg != sd->groups);
+}
+
+int __weak arch_sd_sibling_asym_packing(void)
+{
+        return 0*SD_ASYM_PACKING;
+}
+
+/**
+ * check_asym_packing - Check to see if the group is packed into the
+ *                      sched doman.
+ *
+ * This is primarily intended to used at the sibling level.  Some
+ * cores like POWER7 prefer to use lower numbered SMT threads.  In the
+ * case of POWER7, it can move to lower SMT modes only when higher
+ * threads are idle.  When in lower SMT modes, the threads will
+ * perform better since they share less core resources.  Hence when we
+ * have idle threads, we want them to be the higher ones.
+ *
+ * This packing function is run on idle threads.  It checks to see if
+ * the busiest CPU in this domain (core in the P7 case) has a higher
+ * CPU number than the packing function is being run on.  Here we are
+ * assuming lower CPU number will be equivalent to lower a SMT thread
+ * number.
+ *
+ * Returns 1 when packing is required and a task should be moved to
+ * this CPU.  The amount of the imbalance is returned in *imbalance.
+ *
+ * @sd: The sched_domain whose packing is to be checked.
+ * @sds: Statistics of the sched_domain which is to be packed
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: returns amount of imbalanced due to packing.
+ */
+static int check_asym_packing(struct sched_domain *sd,
+                              struct sd_lb_stats *sds,
+                              int this_cpu, unsigned long *imbalance)
+{
+        int busiest_cpu;
+
+        if (!(sd->flags & SD_ASYM_PACKING))
+                return 0;
+
+        if (!sds->busiest)
+                return 0;
+
+        busiest_cpu = group_first_cpu(sds->busiest);
+        if (this_cpu > busiest_cpu)
+                return 0;
+
+        *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
+                                       SCHED_LOAD_SCALE);
+        return 1;
 }
 
 /**
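For reference, the imbalance reported by check_asym_packing() is simply the busiest group's avg_load converted back into load units by scaling with that group's cpu_power, rounded to the nearest integer. A stand-alone sketch with assumed numbers:

/* Illustration only: the imbalance arithmetic from check_asym_packing(). */
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

int main(void)
{
        /* assumed: busiest group avg_load 1536, group cpu_power 589 */
        unsigned long max_load = 1536, busiest_power = 589;
        unsigned long imbalance = DIV_ROUND_CLOSEST(max_load * busiest_power,
                                                    SCHED_LOAD_SCALE);

        printf("imbalance=%lu\n", imbalance); /* 884: load to pull toward the lower CPU */
        return 0;
}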
@@ -2690,6 +2812,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
         if (!(*balance))
                 goto ret;
 
+        if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
+            check_asym_packing(sd, &sds, this_cpu, imbalance))
+                return sds.busiest;
+
         if (!sds.busiest || sds.busiest_nr_running == 0)
                 goto out_balanced;
 
@@ -2724,8 +2850,9 @@ ret:
  * find_busiest_queue - find the busiest runqueue among the cpus in group.
  */
 static struct rq *
-find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
-                   unsigned long imbalance, const struct cpumask *cpus)
+find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
+                   enum cpu_idle_type idle, unsigned long imbalance,
+                   const struct cpumask *cpus)
 {
         struct rq *busiest = NULL, *rq;
         unsigned long max_load = 0;
@@ -2736,6 +2863,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
                 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
                 unsigned long wl;
 
+                if (!capacity)
+                        capacity = fix_small_capacity(sd, group);
+
                 if (!cpumask_test_cpu(i, cpus))
                         continue;
 
@@ -2775,9 +2905,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
 /* Working cpumask for load_balance and load_balance_newidle. */
 static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
 
-static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
+static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
+                               int busiest_cpu, int this_cpu)
 {
         if (idle == CPU_NEWLY_IDLE) {
+
+                /*
+                 * ASYM_PACKING needs to force migrate tasks from busy but
+                 * higher numbered CPUs in order to pack all tasks in the
+                 * lowest numbered CPUs.
+                 */
+                if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
+                        return 1;
+
                 /*
                  * The only task running in a non-idle cpu can be moved to this
                  * cpu in an attempt to completely freeup the other CPU
@@ -2852,7 +2992,7 @@ redo:
                 goto out_balanced;
         }
 
-        busiest = find_busiest_queue(group, idle, imbalance, cpus);
+        busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
         if (!busiest) {
                 schedstat_inc(sd, lb_nobusyq[idle]);
                 goto out_balanced;
@@ -2896,7 +3036,8 @@ redo:
                 schedstat_inc(sd, lb_failed[idle]);
                 sd->nr_balance_failed++;
 
-                if (need_active_balance(sd, sd_idle, idle)) {
+                if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
+                                        this_cpu)) {
                         raw_spin_lock_irqsave(&busiest->lock, flags);
 
                         /* don't kick the active_load_balance_cpu_stop,
@@ -3091,13 +3232,40 @@ out_unlock:
 }
 
 #ifdef CONFIG_NO_HZ
+
+static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
+
+static void trigger_sched_softirq(void *data)
+{
+        raise_softirq_irqoff(SCHED_SOFTIRQ);
+}
+
+static inline void init_sched_softirq_csd(struct call_single_data *csd)
+{
+        csd->func = trigger_sched_softirq;
+        csd->info = NULL;
+        csd->flags = 0;
+        csd->priv = 0;
+}
+
+/*
+ * idle load balancing details
+ * - One of the idle CPUs nominates itself as idle load_balancer, while
+ *   entering idle.
+ * - This idle load balancer CPU will also go into tickless mode when
+ *   it is idle, just like all other idle CPUs
+ * - When one of the busy CPUs notice that there may be an idle rebalancing
+ *   needed, they will kick the idle load balancer, which then does idle
+ *   load balancing for all the idle CPUs.
+ */
 static struct {
         atomic_t load_balancer;
-        cpumask_var_t cpu_mask;
-        cpumask_var_t ilb_grp_nohz_mask;
-} nohz ____cacheline_aligned = {
-        .load_balancer = ATOMIC_INIT(-1),
-};
+        atomic_t first_pick_cpu;
+        atomic_t second_pick_cpu;
+        cpumask_var_t idle_cpus_mask;
+        cpumask_var_t grp_idle_mask;
+        unsigned long next_balance;     /* in jiffy units */
+} nohz ____cacheline_aligned;
 
 int get_nohz_load_balancer(void)
 {
@@ -3151,17 +3319,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
  */
 static inline int is_semi_idle_group(struct sched_group *ilb_group)
 {
-        cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+        cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
                     sched_group_cpus(ilb_group));
 
         /*
          * A sched_group is semi-idle when it has atleast one busy cpu
          * and atleast one idle cpu.
          */
-        if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+        if (cpumask_empty(nohz.grp_idle_mask))
                 return 0;
 
-        if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+        if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
                 return 0;
 
         return 1;
@@ -3194,7 +3362,7 @@ static int find_new_ilb(int cpu)
          * Optimize for the case when we have no idle CPUs or only one
          * idle CPU. Don't walk the sched_domain hierarchy in such cases
          */
-        if (cpumask_weight(nohz.cpu_mask) < 2)
+        if (cpumask_weight(nohz.idle_cpus_mask) < 2)
                 goto out_done;
 
         for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
@@ -3202,7 +3370,7 @@ static int find_new_ilb(int cpu)
 
         do {
                 if (is_semi_idle_group(ilb_group))
-                        return cpumask_first(nohz.ilb_grp_nohz_mask);
+                        return cpumask_first(nohz.grp_idle_mask);
 
                 ilb_group = ilb_group->next;
 
@@ -3210,98 +3378,116 @@ static int find_new_ilb(int cpu)
         }
 
 out_done:
-        return cpumask_first(nohz.cpu_mask);
+        return nr_cpu_ids;
 }
 #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
 static inline int find_new_ilb(int call_cpu)
 {
-        return cpumask_first(nohz.cpu_mask);
+        return nr_cpu_ids;
 }
 #endif
 
 /*
+ * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
+ * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
+ * CPU (if there is one).
+ */
+static void nohz_balancer_kick(int cpu)
+{
+        int ilb_cpu;
+
+        nohz.next_balance++;
+
+        ilb_cpu = get_nohz_load_balancer();
+
+        if (ilb_cpu >= nr_cpu_ids) {
+                ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
+                if (ilb_cpu >= nr_cpu_ids)
+                        return;
+        }
+
+        if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
+                struct call_single_data *cp;
+
+                cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
+                cp = &per_cpu(remote_sched_softirq_cb, cpu);
+                __smp_call_function_single(ilb_cpu, cp, 0);
+        }
+        return;
+}
+
+/*
  * This routine will try to nominate the ilb (idle load balancing)
  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
- * load balancing on behalf of all those cpus. If all the cpus in the system
- * go into this tickless mode, then there will be no ilb owner (as there is
- * no need for one) and all the cpus will sleep till the next wakeup event
- * arrives...
- *
- * For the ilb owner, tick is not stopped. And this tick will be used
- * for idle load balancing. ilb owner will still be part of
- * nohz.cpu_mask..
+ * load balancing on behalf of all those cpus.
  *
- * While stopping the tick, this cpu will become the ilb owner if there
- * is no other owner. And will be the owner till that cpu becomes busy
- * or if all cpus in the system stop their ticks at which point
- * there is no need for ilb owner.
+ * When the ilb owner becomes busy, we will not have new ilb owner until some
+ * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
+ * idle load balancing by kicking one of the idle CPUs.
  *
- * When the ilb owner becomes busy, it nominates another owner, during the
- * next busy scheduler_tick()
+ * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
+ * ilb owner CPU in future (when there is a need for idle load balancing on
+ * behalf of all idle CPUs).
  */
-int select_nohz_load_balancer(int stop_tick)
+void select_nohz_load_balancer(int stop_tick)
 {
         int cpu = smp_processor_id();
 
         if (stop_tick) {
-                cpu_rq(cpu)->in_nohz_recently = 1;
-
                 if (!cpu_active(cpu)) {
                         if (atomic_read(&nohz.load_balancer) != cpu)
-                                return 0;
+                                return;
 
                         /*
                          * If we are going offline and still the leader,
                          * give up!
                          */
-                        if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+                        if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+                                           nr_cpu_ids) != cpu)
                                 BUG();
 
-                        return 0;
+                        return;
                 }
 
-                cpumask_set_cpu(cpu, nohz.cpu_mask);
+                cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
 
-                /* time for ilb owner also to sleep */
-                if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
-                        if (atomic_read(&nohz.load_balancer) == cpu)
-                                atomic_set(&nohz.load_balancer, -1);
-                        return 0;
-                }
+                if (atomic_read(&nohz.first_pick_cpu) == cpu)
+                        atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
+                if (atomic_read(&nohz.second_pick_cpu) == cpu)
+                        atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
 
-                if (atomic_read(&nohz.load_balancer) == -1) {
-                        /* make me the ilb owner */
-                        if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
-                                return 1;
-                } else if (atomic_read(&nohz.load_balancer) == cpu) {
+                if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
                         int new_ilb;
 
-                        if (!(sched_smt_power_savings ||
-                              sched_mc_power_savings))
-                                return 1;
+                        /* make me the ilb owner */
+                        if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
+                                           cpu) != nr_cpu_ids)
+                                return;
+
                         /*
                          * Check to see if there is a more power-efficient
                          * ilb.
                          */
                         new_ilb = find_new_ilb(cpu);
                         if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
-                                atomic_set(&nohz.load_balancer, -1);
+                                atomic_set(&nohz.load_balancer, nr_cpu_ids);
                                 resched_cpu(new_ilb);
-                                return 0;
+                                return;
                         }
-                        return 1;
+                        return;
                 }
         } else {
-                if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
-                        return 0;
+                if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+                        return;
 
-                cpumask_clear_cpu(cpu, nohz.cpu_mask);
+                cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
 
                 if (atomic_read(&nohz.load_balancer) == cpu)
-                        if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+                        if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+                                           nr_cpu_ids) != cpu)
                                 BUG();
         }
-        return 0;
+        return;
 }
 #endif
 
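For reference, with the return value dropped, a CPU becomes the idle load balancer above only if its atomic_cmpxchg() moves nohz.load_balancer from nr_cpu_ids to its own id; losing the race simply returns. A user-space model of that claim using C11 atomics (NR_CPU_IDS and the helper name are assumptions of the sketch, not kernel code):

/* Illustration only: lock-free "become the ilb owner" claim. */
#include <stdatomic.h>
#include <stdio.h>

#define NR_CPU_IDS 8 /* assumed */

static atomic_int load_balancer; /* NR_CPU_IDS means "nobody" */

static int try_claim_ilb(int cpu)
{
        int expected = NR_CPU_IDS;

        /* succeeds only if no one owned it; otherwise 'expected' holds the owner */
        return atomic_compare_exchange_strong(&load_balancer, &expected, cpu);
}

int main(void)
{
        atomic_store(&load_balancer, NR_CPU_IDS);

        printf("cpu 3 claims: %d\n", try_claim_ilb(3)); /* 1: now the ilb owner */
        printf("cpu 5 claims: %d\n", try_claim_ilb(5)); /* 0: cpu 3 already owns it */
        return 0;
}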
@@ -3383,11 +3569,102 @@ out:
         rq->next_balance = next_balance;
 }
 
+#ifdef CONFIG_NO_HZ
 /*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
- * In CONFIG_NO_HZ case, the idle load balance owner will do the
+ * In CONFIG_NO_HZ case, the idle balance kickee will do the
  * rebalancing for all the cpus for whom scheduler ticks are stopped.
  */
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
+{
+        struct rq *this_rq = cpu_rq(this_cpu);
+        struct rq *rq;
+        int balance_cpu;
+
+        if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
+                return;
+
+        for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+                if (balance_cpu == this_cpu)
+                        continue;
+
+                /*
+                 * If this cpu gets work to do, stop the load balancing
+                 * work being done for other cpus. Next load
+                 * balancing owner will pick it up.
+                 */
+                if (need_resched()) {
+                        this_rq->nohz_balance_kick = 0;
+                        break;
+                }
+
+                raw_spin_lock_irq(&this_rq->lock);
+                update_rq_clock(this_rq);
+                update_cpu_load(this_rq);
+                raw_spin_unlock_irq(&this_rq->lock);
+
+                rebalance_domains(balance_cpu, CPU_IDLE);
+
+                rq = cpu_rq(balance_cpu);
+                if (time_after(this_rq->next_balance, rq->next_balance))
+                        this_rq->next_balance = rq->next_balance;
+        }
+        nohz.next_balance = this_rq->next_balance;
+        this_rq->nohz_balance_kick = 0;
+}
+
+/*
+ * Current heuristic for kicking the idle load balancer
+ * - first_pick_cpu is the one of the busy CPUs. It will kick
+ *   idle load balancer when it has more than one process active. This
+ *   eliminates the need for idle load balancing altogether when we have
+ *   only one running process in the system (common case).
+ * - If there are more than one busy CPU, idle load balancer may have
+ *   to run for active_load_balance to happen (i.e., two busy CPUs are
+ *   SMT or core siblings and can run better if they move to different
+ *   physical CPUs). So, second_pick_cpu is the second of the busy CPUs
+ *   which will kick idle load balancer as soon as it has any load.
+ */
+static inline int nohz_kick_needed(struct rq *rq, int cpu)
+{
+        unsigned long now = jiffies;
+        int ret;
+        int first_pick_cpu, second_pick_cpu;
+
+        if (time_before(now, nohz.next_balance))
+                return 0;
+
+        if (!rq->nr_running)
+                return 0;
+
+        first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
+        second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
+
+        if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
+            second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
+                return 0;
+
+        ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
+        if (ret == nr_cpu_ids || ret == cpu) {
+                atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
+                if (rq->nr_running > 1)
+                        return 1;
+        } else {
+                ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
+                if (ret == nr_cpu_ids || ret == cpu) {
+                        if (rq->nr_running)
+                                return 1;
+                }
+        }
+        return 0;
+}
+#else
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
+#endif
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
+ */
 static void run_rebalance_domains(struct softirq_action *h)
 {
         int this_cpu = smp_processor_id();
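For reference, nohz_kick_needed() lets the first busy CPU claim first_pick_cpu and a second busy CPU claim second_pick_cpu, each via cmpxchg so no lock is taken on the tick path; the first kicks only when it has more than one runnable task, the second as soon as it has any load. The user-space model below keeps just that claim protocol and deliberately ignores the time throttle and the second_pick reset:

/* Illustration only: the first_pick/second_pick claim protocol, simplified. */
#include <stdatomic.h>
#include <stdio.h>

#define NR_CPU_IDS 8 /* assumed */

static atomic_int first_pick, second_pick;

static int cmpxchg_int(atomic_int *v, int old, int new_val)
{
        int expected = old;

        atomic_compare_exchange_strong(v, &expected, new_val);
        return expected;                /* like atomic_cmpxchg(): returns the old value */
}

static int kick_needed(int cpu, unsigned int nr_running)
{
        int ret;

        if (!nr_running)
                return 0;

        ret = cmpxchg_int(&first_pick, NR_CPU_IDS, cpu);
        if (ret == NR_CPU_IDS || ret == cpu)
                return nr_running > 1;  /* first busy CPU: kick on more than one task */

        ret = cmpxchg_int(&second_pick, NR_CPU_IDS, cpu);
        if (ret == NR_CPU_IDS || ret == cpu)
                return nr_running > 0;  /* second busy CPU: kick on any load */

        return 0;
}

int main(void)
{
        atomic_store(&first_pick, NR_CPU_IDS);
        atomic_store(&second_pick, NR_CPU_IDS);

        printf("%d\n", kick_needed(0, 1)); /* 0: only one task in the system */
        printf("%d\n", kick_needed(0, 2)); /* 1: first pick with two tasks */
        printf("%d\n", kick_needed(3, 1)); /* 1: a second busy CPU with any load */
        return 0;
}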
@@ -3397,37 +3674,12 @@ static void run_rebalance_domains(struct softirq_action *h)
 
         rebalance_domains(this_cpu, idle);
 
-#ifdef CONFIG_NO_HZ
         /*
-         * If this cpu is the owner for idle load balancing, then do the
+         * If this cpu has a pending nohz_balance_kick, then do the
          * balancing on behalf of the other idle cpus whose ticks are
          * stopped.
          */
-        if (this_rq->idle_at_tick &&
-            atomic_read(&nohz.load_balancer) == this_cpu) {
-                struct rq *rq;
-                int balance_cpu;
-
-                for_each_cpu(balance_cpu, nohz.cpu_mask) {
-                        if (balance_cpu == this_cpu)
-                                continue;
-
-                        /*
-                         * If this cpu gets work to do, stop the load balancing
-                         * work being done for other cpus. Next load
-                         * balancing owner will pick it up.
-                         */
-                        if (need_resched())
-                                break;
-
-                        rebalance_domains(balance_cpu, CPU_IDLE);
-
-                        rq = cpu_rq(balance_cpu);
-                        if (time_after(this_rq->next_balance, rq->next_balance))
-                                this_rq->next_balance = rq->next_balance;
-                }
-        }
-#endif
+        nohz_idle_balance(this_cpu, idle);
 }
 
 static inline int on_null_domain(int cpu)
@@ -3437,57 +3689,17 @@ static inline int on_null_domain(int cpu)
 
 /*
  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
- *
- * In case of CONFIG_NO_HZ, this is the place where we nominate a new
- * idle load balancing owner or decide to stop the periodic load balancing,
- * if the whole system is idle.
  */
 static inline void trigger_load_balance(struct rq *rq, int cpu)
 {
-#ifdef CONFIG_NO_HZ
-        /*
-         * If we were in the nohz mode recently and busy at the current
-         * scheduler tick, then check if we need to nominate new idle
-         * load balancer.
-         */
-        if (rq->in_nohz_recently && !rq->idle_at_tick) {
-                rq->in_nohz_recently = 0;
-
-                if (atomic_read(&nohz.load_balancer) == cpu) {
-                        cpumask_clear_cpu(cpu, nohz.cpu_mask);
-                        atomic_set(&nohz.load_balancer, -1);
-                }
-
-                if (atomic_read(&nohz.load_balancer) == -1) {
-                        int ilb = find_new_ilb(cpu);
-
-                        if (ilb < nr_cpu_ids)
-                                resched_cpu(ilb);
-                }
-        }
-
-        /*
-         * If this cpu is idle and doing idle load balancing for all the
-         * cpus with ticks stopped, is it time for that to stop?
-         */
-        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
-            cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
-                resched_cpu(cpu);
-                return;
-        }
-
-        /*
-         * If this cpu is idle and the idle load balancing is done by
-         * someone else, then no need raise the SCHED_SOFTIRQ
-         */
-        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
-            cpumask_test_cpu(cpu, nohz.cpu_mask))
-                return;
-#endif
         /* Don't need to rebalance while attached to NULL domain */
         if (time_after_eq(jiffies, rq->next_balance) &&
             likely(!on_null_domain(cpu)))
                 raise_softirq(SCHED_SOFTIRQ);
+#ifdef CONFIG_NO_HZ
+        else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
+                nohz_balancer_kick(cpu);
+#endif
 }
 
 static void rq_online_fair(struct rq *rq)
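For reference, periodic balancing is still gated by time_after_eq(jiffies, rq->next_balance), which stays correct across a jiffies wrap because it compares through a signed difference; the nohz kick above is only attempted when that periodic test does not fire. A stand-alone demonstration (the macro mirrors the kernel's definition minus the type checking):

/* Illustration only: wrap-safe jiffies comparison. */
#include <stdio.h>
#include <limits.h>

#define time_after_eq(a, b) ((long)((a) - (b)) >= 0)

int main(void)
{
        unsigned long next_balance = ULONG_MAX - 5;  /* assumed: just before the wrap */
        unsigned long now = 10;                      /* jiffies already wrapped past it */

        printf("due: %d\n", time_after_eq(now, next_balance));      /* 1 */
        printf("not yet: %d\n", time_after_eq(next_balance, now));  /* 0 */
        return 0;
}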