Diffstat (limited to 'kernel/sched_fair.c')
 kernel/sched_fair.c | 545 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 377 insertions(+), 168 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a878b5332daa..db3f674ca49d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -54,13 +54,13 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
  * Minimal preemption granularity for CPU-bound tasks:
  * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
-unsigned int sysctl_sched_min_granularity = 2000000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL;
+unsigned int sysctl_sched_min_granularity = 750000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
 
 /*
  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
  */
-static unsigned int sched_nr_latency = 3;
+static unsigned int sched_nr_latency = 8;
 
 /*
  * After fork, child runs first. If set to 0 (default) then
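Illustration (not part of the patch): the new sched_nr_latency default of 8 is just the ratio of the two tunables named in the comment above. A minimal user-space sketch, assuming the default sysctl_sched_latency of 6 ms (not shown in this hunk):

#include <stdio.h>

int main(void)
{
	/* Assumed default; sysctl_sched_latency does not appear in this hunk. */
	unsigned long long sched_latency_ns = 6000000ULL;	/* 6 ms */
	unsigned long long min_granularity_ns = 750000ULL;	/* new default above */

	/* "is kept at sysctl_sched_latency / sysctl_sched_min_granularity" */
	printf("sched_nr_latency = %llu\n",
	       sched_latency_ns / min_granularity_ns);		/* prints 8 */
	return 0;
}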
@@ -1313,7 +1313,7 @@ static struct sched_group *
 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 		  int this_cpu, int load_idx)
 {
-	struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+	struct sched_group *idlest = NULL, *group = sd->groups;
 	unsigned long min_load = ULONG_MAX, this_load = 0;
 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
@@ -1348,7 +1348,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 
 		if (local_group) {
 			this_load = avg_load;
-			this = group;
 		} else if (avg_load < min_load) {
 			min_load = avg_load;
 			idlest = group;
@@ -2268,8 +2267,6 @@ unsigned long scale_rt_power(int cpu)
 	struct rq *rq = cpu_rq(cpu);
 	u64 total, available;
 
-	sched_avg_update(rq);
-
 	total = sched_avg_period() + (rq->clock - rq->age_stamp);
 	available = total - rq->rt_avg;
 
@@ -2287,13 +2284,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
 	unsigned long power = SCHED_LOAD_SCALE;
 	struct sched_group *sdg = sd->groups;
 
-	if (sched_feat(ARCH_POWER))
-		power *= arch_scale_freq_power(sd, cpu);
-	else
-		power *= default_scale_freq_power(sd, cpu);
-
-	power >>= SCHED_LOAD_SHIFT;
-
 	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
 		if (sched_feat(ARCH_POWER))
 			power *= arch_scale_smt_power(sd, cpu);
@@ -2303,6 +2293,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
 		power >>= SCHED_LOAD_SHIFT;
 	}
 
+	sdg->cpu_power_orig = power;
+
+	if (sched_feat(ARCH_POWER))
+		power *= arch_scale_freq_power(sd, cpu);
+	else
+		power *= default_scale_freq_power(sd, cpu);
+
+	power >>= SCHED_LOAD_SHIFT;
+
 	power *= scale_rt_power(cpu);
 	power >>= SCHED_LOAD_SHIFT;
 
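Illustration (not part of the patch): the reordering above records sdg->cpu_power_orig after SMT scaling but before frequency and RT scaling, so fix_small_capacity() can later compare the two. A standalone arithmetic sketch with purely hypothetical scale factors:

#include <stdio.h>

#define SCHED_LOAD_SCALE	1024UL
#define SCHED_LOAD_SHIFT	10

int main(void)
{
	unsigned long power = SCHED_LOAD_SCALE;
	unsigned long smt_scale  = 589;		/* hypothetical per-thread SMT factor */
	unsigned long freq_scale = 1024;	/* hypothetical: full frequency */
	unsigned long rt_scale   = 920;		/* hypothetical: some time eaten by RT */
	unsigned long cpu_power_orig;

	power = (power * smt_scale) >> SCHED_LOAD_SHIFT;	/* 589 */
	cpu_power_orig = power;		/* recorded here, before freq/RT scaling */

	power = (power * freq_scale) >> SCHED_LOAD_SHIFT;	/* 589 */
	power = (power * rt_scale) >> SCHED_LOAD_SHIFT;		/* 529 */

	printf("cpu_power_orig=%lu cpu_power=%lu\n", cpu_power_orig, power);
	return 0;
}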
@@ -2335,6 +2334,31 @@ static void update_group_power(struct sched_domain *sd, int cpu)
 	sdg->cpu_power = power;
 }
 
+/*
+ * Try and fix up capacity for tiny siblings, this is needed when
+ * things like SD_ASYM_PACKING need f_b_g to select another sibling
+ * which on its own isn't powerful enough.
+ *
+ * See update_sd_pick_busiest() and check_asym_packing().
+ */
+static inline int
+fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
+{
+	/*
+	 * Only siblings can have significantly less than SCHED_LOAD_SCALE
+	 */
+	if (sd->level != SD_LV_SIBLING)
+		return 0;
+
+	/*
+	 * If ~90% of the cpu_power is still there, we're good.
+	 */
+	if (group->cpu_power * 32 > group->cpu_power_orig * 29)
+		return 1;
+
+	return 0;
+}
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @sd: The sched_domain whose statistics are to be updated.
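Illustration (not part of the patch): the "~90%" in fix_small_capacity() is the 29/32 ratio (90.625%). A sibling whose cpu_power rounds down to zero capacity is still counted as one unit as long as most of its original power remains; a standalone sketch with hypothetical numbers:

#include <stdio.h>

#define SCHED_LOAD_SCALE	1024UL
#define DIV_ROUND_CLOSEST(x, d)	(((x) + ((d) / 2)) / (d))

int main(void)
{
	/* Hypothetical SMT-sibling numbers, purely for illustration. */
	unsigned long cpu_power_orig = 520;
	unsigned long cpu_power      = 505;	/* after freq/RT scaling */

	unsigned long capacity = DIV_ROUND_CLOSEST(cpu_power, SCHED_LOAD_SCALE);

	printf("rounded capacity = %lu\n", capacity);	/* 0: sibling looks useless */

	/* The "~90%" test: 29/32 = 90.625% of the original power. */
	if (cpu_power * 32 > cpu_power_orig * 29)	/* 16160 > 15080 */
		capacity = 1;

	printf("fixed-up capacity = %lu\n", capacity);	/* 1 */
	return 0;
}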
@@ -2400,14 +2424,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	 * domains. In the newly idle case, we will allow all the cpu's
 	 * to do the newly idle load balance.
 	 */
-	if (idle != CPU_NEWLY_IDLE && local_group &&
-	    balance_cpu != this_cpu) {
-		*balance = 0;
-		return;
+	if (idle != CPU_NEWLY_IDLE && local_group) {
+		if (balance_cpu != this_cpu) {
+			*balance = 0;
+			return;
+		}
+		update_group_power(sd, this_cpu);
 	}
 
-	update_group_power(sd, this_cpu);
-
 	/* Adjust by relative CPU power of the group */
 	sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
 
@@ -2428,6 +2452,51 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 
 	sgs->group_capacity =
 		DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+	if (!sgs->group_capacity)
+		sgs->group_capacity = fix_small_capacity(sd, group);
+}
+
+/**
+ * update_sd_pick_busiest - return 1 on busiest group
+ * @sd: sched_domain whose statistics are to be checked
+ * @sds: sched_domain statistics
+ * @sg: sched_group candidate to be checked for being the busiest
+ * @sgs: sched_group statistics
+ * @this_cpu: the current cpu
+ *
+ * Determine if @sg is a busier group than the previously selected
+ * busiest group.
+ */
+static bool update_sd_pick_busiest(struct sched_domain *sd,
+				   struct sd_lb_stats *sds,
+				   struct sched_group *sg,
+				   struct sg_lb_stats *sgs,
+				   int this_cpu)
+{
+	if (sgs->avg_load <= sds->max_load)
+		return false;
+
+	if (sgs->sum_nr_running > sgs->group_capacity)
+		return true;
+
+	if (sgs->group_imb)
+		return true;
+
+	/*
+	 * ASYM_PACKING needs to move all the work to the lowest
+	 * numbered CPUs in the group, therefore mark all groups
+	 * higher than ourself as busy.
+	 */
+	if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
+	    this_cpu < group_first_cpu(sg)) {
+		if (!sds->busiest)
+			return true;
+
+		if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
+			return true;
+	}
+
+	return false;
 }
 
 /**
@@ -2435,7 +2504,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
  * @sd: sched_domain whose statistics are to be updated.
  * @this_cpu: Cpu for which load balance is currently performed.
  * @idle: Idle status of this_cpu
- * @sd_idle: Idle status of the sched_domain containing group.
+ * @sd_idle: Idle status of the sched_domain containing sg.
  * @cpus: Set of cpus considered for load balancing.
  * @balance: Should we balance.
  * @sds: variable to hold the statistics for this sched_domain.
@@ -2446,7 +2515,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 			struct sd_lb_stats *sds)
 {
 	struct sched_domain *child = sd->child;
-	struct sched_group *group = sd->groups;
+	struct sched_group *sg = sd->groups;
 	struct sg_lb_stats sgs;
 	int load_idx, prefer_sibling = 0;
 
@@ -2459,21 +2528,20 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 	do {
 		int local_group;
 
-		local_group = cpumask_test_cpu(this_cpu,
-					       sched_group_cpus(group));
+		local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
 		memset(&sgs, 0, sizeof(sgs));
-		update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
+		update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
 				   local_group, cpus, balance, &sgs);
 
 		if (local_group && !(*balance))
 			return;
 
 		sds->total_load += sgs.group_load;
-		sds->total_pwr += group->cpu_power;
+		sds->total_pwr += sg->cpu_power;
 
 		/*
 		 * In case the child domain prefers tasks go to siblings
-		 * first, lower the group capacity to one so that we'll try
+		 * first, lower the sg capacity to one so that we'll try
 		 * and move all the excess tasks away.
 		 */
 		if (prefer_sibling)
@@ -2481,23 +2549,72 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 
 		if (local_group) {
 			sds->this_load = sgs.avg_load;
-			sds->this = group;
+			sds->this = sg;
 			sds->this_nr_running = sgs.sum_nr_running;
 			sds->this_load_per_task = sgs.sum_weighted_load;
-		} else if (sgs.avg_load > sds->max_load &&
-			   (sgs.sum_nr_running > sgs.group_capacity ||
-			    sgs.group_imb)) {
+		} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
 			sds->max_load = sgs.avg_load;
-			sds->busiest = group;
+			sds->busiest = sg;
 			sds->busiest_nr_running = sgs.sum_nr_running;
 			sds->busiest_group_capacity = sgs.group_capacity;
 			sds->busiest_load_per_task = sgs.sum_weighted_load;
 			sds->group_imb = sgs.group_imb;
 		}
 
-		update_sd_power_savings_stats(group, sds, local_group, &sgs);
-		group = group->next;
-	} while (group != sd->groups);
+		update_sd_power_savings_stats(sg, sds, local_group, &sgs);
+		sg = sg->next;
+	} while (sg != sd->groups);
+}
+
+int __weak arch_sd_sibling_asym_packing(void)
+{
+	return 0*SD_ASYM_PACKING;
+}
+
+/**
+ * check_asym_packing - Check to see if the group is packed into the
+ * sched doman.
+ *
+ * This is primarily intended to used at the sibling level. Some
+ * cores like POWER7 prefer to use lower numbered SMT threads. In the
+ * case of POWER7, it can move to lower SMT modes only when higher
+ * threads are idle. When in lower SMT modes, the threads will
+ * perform better since they share less core resources. Hence when we
+ * have idle threads, we want them to be the higher ones.
+ *
+ * This packing function is run on idle threads. It checks to see if
+ * the busiest CPU in this domain (core in the P7 case) has a higher
+ * CPU number than the packing function is being run on. Here we are
+ * assuming lower CPU number will be equivalent to lower a SMT thread
+ * number.
+ *
+ * Returns 1 when packing is required and a task should be moved to
+ * this CPU. The amount of the imbalance is returned in *imbalance.
+ *
+ * @sd: The sched_domain whose packing is to be checked.
+ * @sds: Statistics of the sched_domain which is to be packed
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: returns amount of imbalanced due to packing.
+ */
+static int check_asym_packing(struct sched_domain *sd,
+			      struct sd_lb_stats *sds,
+			      int this_cpu, unsigned long *imbalance)
+{
+	int busiest_cpu;
+
+	if (!(sd->flags & SD_ASYM_PACKING))
+		return 0;
+
+	if (!sds->busiest)
+		return 0;
+
+	busiest_cpu = group_first_cpu(sds->busiest);
+	if (this_cpu > busiest_cpu)
+		return 0;
+
+	*imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
+				       SCHED_LOAD_SCALE);
+	return 1;
 }
 
 /**
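Illustration (not part of the patch): the *imbalance computed by check_asym_packing() undoes the normalization done in update_sg_lb_stats(), where avg_load was scaled by SCHED_LOAD_SCALE / cpu_power; multiplying max_load back by cpu_power / SCHED_LOAD_SCALE recovers (approximately) the weighted load to pull. A worked example with hypothetical numbers:

#include <stdio.h>

#define SCHED_LOAD_SCALE	1024UL
#define DIV_ROUND_CLOSEST(x, d)	(((x) + ((d) / 2)) / (d))

int main(void)
{
	/* Hypothetical values for the busiest group. */
	unsigned long group_load = 884;		/* raw weighted load on the group */
	unsigned long cpu_power  = 589;		/* group's cpu_power */

	/* update_sg_lb_stats(): avg_load = group_load * SCHED_LOAD_SCALE / cpu_power */
	unsigned long max_load = (group_load * SCHED_LOAD_SCALE) / cpu_power;

	/* check_asym_packing(): scale back to a task-load-sized imbalance */
	unsigned long imbalance =
		DIV_ROUND_CLOSEST(max_load * cpu_power, SCHED_LOAD_SCALE);

	printf("max_load=%lu imbalance=%lu\n", max_load, imbalance);	/* 1536, 884 */
	return 0;
}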
@@ -2692,6 +2809,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (!(*balance))
 		goto ret;
 
+	if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
+	    check_asym_packing(sd, &sds, this_cpu, imbalance))
+		return sds.busiest;
+
 	if (!sds.busiest || sds.busiest_nr_running == 0)
 		goto out_balanced;
 
@@ -2726,8 +2847,9 @@ ret:
  * find_busiest_queue - find the busiest runqueue among the cpus in group.
  */
 static struct rq *
-find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
-		   unsigned long imbalance, const struct cpumask *cpus)
+find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
+		   enum cpu_idle_type idle, unsigned long imbalance,
+		   const struct cpumask *cpus)
 {
 	struct rq *busiest = NULL, *rq;
 	unsigned long max_load = 0;
@@ -2738,6 +2860,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
 		unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
 		unsigned long wl;
 
+		if (!capacity)
+			capacity = fix_small_capacity(sd, group);
+
 		if (!cpumask_test_cpu(i, cpus))
 			continue;
 
@@ -2777,9 +2902,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
 /* Working cpumask for load_balance and load_balance_newidle. */
 static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
 
-static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
+static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
+			       int busiest_cpu, int this_cpu)
 {
 	if (idle == CPU_NEWLY_IDLE) {
+
+		/*
+		 * ASYM_PACKING needs to force migrate tasks from busy but
+		 * higher numbered CPUs in order to pack all tasks in the
+		 * lowest numbered CPUs.
+		 */
+		if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
+			return 1;
+
 		/*
 		 * The only task running in a non-idle cpu can be moved to this
 		 * cpu in an attempt to completely freeup the other CPU
@@ -2854,7 +2989,7 @@ redo:
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group, idle, imbalance, cpus);
+	busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[idle]);
 		goto out_balanced;
@@ -2898,7 +3033,8 @@ redo:
 		schedstat_inc(sd, lb_failed[idle]);
 		sd->nr_balance_failed++;
 
-		if (need_active_balance(sd, sd_idle, idle)) {
+		if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
+					this_cpu)) {
 			raw_spin_lock_irqsave(&busiest->lock, flags);
 
 			/* don't kick the active_load_balance_cpu_stop,
@@ -3093,13 +3229,40 @@ out_unlock:
 }
 
 #ifdef CONFIG_NO_HZ
+
+static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
+
+static void trigger_sched_softirq(void *data)
+{
+	raise_softirq_irqoff(SCHED_SOFTIRQ);
+}
+
+static inline void init_sched_softirq_csd(struct call_single_data *csd)
+{
+	csd->func = trigger_sched_softirq;
+	csd->info = NULL;
+	csd->flags = 0;
+	csd->priv = 0;
+}
+
+/*
+ * idle load balancing details
+ * - One of the idle CPUs nominates itself as idle load_balancer, while
+ *   entering idle.
+ * - This idle load balancer CPU will also go into tickless mode when
+ *   it is idle, just like all other idle CPUs
+ * - When one of the busy CPUs notice that there may be an idle rebalancing
+ *   needed, they will kick the idle load balancer, which then does idle
+ *   load balancing for all the idle CPUs.
+ */
 static struct {
 	atomic_t load_balancer;
-	cpumask_var_t cpu_mask;
-	cpumask_var_t ilb_grp_nohz_mask;
-} nohz ____cacheline_aligned = {
-	.load_balancer = ATOMIC_INIT(-1),
-};
+	atomic_t first_pick_cpu;
+	atomic_t second_pick_cpu;
+	cpumask_var_t idle_cpus_mask;
+	cpumask_var_t grp_idle_mask;
+	unsigned long next_balance;	/* in jiffy units */
+} nohz ____cacheline_aligned;
 
 int get_nohz_load_balancer(void)
 {
@@ -3153,17 +3316,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
  */
 static inline int is_semi_idle_group(struct sched_group *ilb_group)
 {
-	cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+	cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
 		    sched_group_cpus(ilb_group));
 
 	/*
 	 * A sched_group is semi-idle when it has atleast one busy cpu
 	 * and atleast one idle cpu.
 	 */
-	if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+	if (cpumask_empty(nohz.grp_idle_mask))
 		return 0;
 
-	if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+	if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
 		return 0;
 
 	return 1;
@@ -3196,7 +3359,7 @@ static int find_new_ilb(int cpu)
 	 * Optimize for the case when we have no idle CPUs or only one
 	 * idle CPU. Don't walk the sched_domain hierarchy in such cases
 	 */
-	if (cpumask_weight(nohz.cpu_mask) < 2)
+	if (cpumask_weight(nohz.idle_cpus_mask) < 2)
 		goto out_done;
 
 	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
@@ -3204,7 +3367,7 @@ static int find_new_ilb(int cpu)
 
 		do {
 			if (is_semi_idle_group(ilb_group))
-				return cpumask_first(nohz.ilb_grp_nohz_mask);
+				return cpumask_first(nohz.grp_idle_mask);
 
 			ilb_group = ilb_group->next;
 
@@ -3212,98 +3375,116 @@ static int find_new_ilb(int cpu)
 	}
 
 out_done:
-	return cpumask_first(nohz.cpu_mask);
+	return nr_cpu_ids;
 }
 #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
 static inline int find_new_ilb(int call_cpu)
 {
-	return cpumask_first(nohz.cpu_mask);
+	return nr_cpu_ids;
 }
 #endif
 
 /*
+ * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
+ * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
+ * CPU (if there is one).
+ */
+static void nohz_balancer_kick(int cpu)
+{
+	int ilb_cpu;
+
+	nohz.next_balance++;
+
+	ilb_cpu = get_nohz_load_balancer();
+
+	if (ilb_cpu >= nr_cpu_ids) {
+		ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
+		if (ilb_cpu >= nr_cpu_ids)
+			return;
+	}
+
+	if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
+		struct call_single_data *cp;
+
+		cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
+		cp = &per_cpu(remote_sched_softirq_cb, cpu);
+		__smp_call_function_single(ilb_cpu, cp, 0);
+	}
+	return;
+}
+
+/*
  * This routine will try to nominate the ilb (idle load balancing)
  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
- * load balancing on behalf of all those cpus. If all the cpus in the system
- * go into this tickless mode, then there will be no ilb owner (as there is
- * no need for one) and all the cpus will sleep till the next wakeup event
- * arrives...
- *
- * For the ilb owner, tick is not stopped. And this tick will be used
- * for idle load balancing. ilb owner will still be part of
- * nohz.cpu_mask..
+ * load balancing on behalf of all those cpus.
  *
- * While stopping the tick, this cpu will become the ilb owner if there
- * is no other owner. And will be the owner till that cpu becomes busy
- * or if all cpus in the system stop their ticks at which point
- * there is no need for ilb owner.
+ * When the ilb owner becomes busy, we will not have new ilb owner until some
+ * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
+ * idle load balancing by kicking one of the idle CPUs.
  *
- * When the ilb owner becomes busy, it nominates another owner, during the
- * next busy scheduler_tick()
+ * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
+ * ilb owner CPU in future (when there is a need for idle load balancing on
+ * behalf of all idle CPUs).
  */
-int select_nohz_load_balancer(int stop_tick)
+void select_nohz_load_balancer(int stop_tick)
 {
 	int cpu = smp_processor_id();
 
 	if (stop_tick) {
-		cpu_rq(cpu)->in_nohz_recently = 1;
-
 		if (!cpu_active(cpu)) {
 			if (atomic_read(&nohz.load_balancer) != cpu)
-				return 0;
+				return;
 
 			/*
 			 * If we are going offline and still the leader,
 			 * give up!
 			 */
-			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+			if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+					   nr_cpu_ids) != cpu)
 				BUG();
 
-			return 0;
+			return;
 		}
 
-		cpumask_set_cpu(cpu, nohz.cpu_mask);
+		cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
 
-		/* time for ilb owner also to sleep */
-		if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
-			if (atomic_read(&nohz.load_balancer) == cpu)
-				atomic_set(&nohz.load_balancer, -1);
-			return 0;
-		}
+		if (atomic_read(&nohz.first_pick_cpu) == cpu)
+			atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
+		if (atomic_read(&nohz.second_pick_cpu) == cpu)
+			atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
 
-		if (atomic_read(&nohz.load_balancer) == -1) {
-			/* make me the ilb owner */
-			if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
-				return 1;
-		} else if (atomic_read(&nohz.load_balancer) == cpu) {
+		if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
 			int new_ilb;
 
-			if (!(sched_smt_power_savings ||
-						sched_mc_power_savings))
-				return 1;
+			/* make me the ilb owner */
+			if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
+					   cpu) != nr_cpu_ids)
+				return;
+
 			/*
 			 * Check to see if there is a more power-efficient
 			 * ilb.
 			 */
 			new_ilb = find_new_ilb(cpu);
 			if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
-				atomic_set(&nohz.load_balancer, -1);
+				atomic_set(&nohz.load_balancer, nr_cpu_ids);
 				resched_cpu(new_ilb);
-				return 0;
+				return;
 			}
-			return 1;
+			return;
 		}
 	} else {
-		if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
-			return 0;
+		if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+			return;
 
-		cpumask_clear_cpu(cpu, nohz.cpu_mask);
+		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
 
 		if (atomic_read(&nohz.load_balancer) == cpu)
-			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+			if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+					   nr_cpu_ids) != cpu)
 				BUG();
 	}
-	return 0;
+	return;
 }
 #endif
 
@@ -3385,11 +3566,102 @@ out:
 	rq->next_balance = next_balance;
 }
 
+#ifdef CONFIG_NO_HZ
 /*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
- * In CONFIG_NO_HZ case, the idle load balance owner will do the
+ * In CONFIG_NO_HZ case, the idle balance kickee will do the
  * rebalancing for all the cpus for whom scheduler ticks are stopped.
  */
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
+{
+	struct rq *this_rq = cpu_rq(this_cpu);
+	struct rq *rq;
+	int balance_cpu;
+
+	if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
+		return;
+
+	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+		if (balance_cpu == this_cpu)
+			continue;
+
+		/*
+		 * If this cpu gets work to do, stop the load balancing
+		 * work being done for other cpus. Next load
+		 * balancing owner will pick it up.
+		 */
+		if (need_resched()) {
+			this_rq->nohz_balance_kick = 0;
+			break;
+		}
+
+		raw_spin_lock_irq(&this_rq->lock);
+		update_rq_clock(this_rq);
+		update_cpu_load(this_rq);
+		raw_spin_unlock_irq(&this_rq->lock);
+
+		rebalance_domains(balance_cpu, CPU_IDLE);
+
+		rq = cpu_rq(balance_cpu);
+		if (time_after(this_rq->next_balance, rq->next_balance))
+			this_rq->next_balance = rq->next_balance;
+	}
+	nohz.next_balance = this_rq->next_balance;
+	this_rq->nohz_balance_kick = 0;
+}
+
+/*
+ * Current heuristic for kicking the idle load balancer
+ * - first_pick_cpu is the one of the busy CPUs. It will kick
+ *   idle load balancer when it has more than one process active. This
+ *   eliminates the need for idle load balancing altogether when we have
+ *   only one running process in the system (common case).
+ * - If there are more than one busy CPU, idle load balancer may have
+ *   to run for active_load_balance to happen (i.e., two busy CPUs are
+ *   SMT or core siblings and can run better if they move to different
+ *   physical CPUs). So, second_pick_cpu is the second of the busy CPUs
+ *   which will kick idle load balancer as soon as it has any load.
+ */
+static inline int nohz_kick_needed(struct rq *rq, int cpu)
+{
+	unsigned long now = jiffies;
+	int ret;
+	int first_pick_cpu, second_pick_cpu;
+
+	if (time_before(now, nohz.next_balance))
+		return 0;
+
+	if (rq->idle_at_tick)
+		return 0;
+
+	first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
+	second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
+
+	if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
+	    second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
+		return 0;
+
+	ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
+	if (ret == nr_cpu_ids || ret == cpu) {
+		atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
+		if (rq->nr_running > 1)
+			return 1;
+	} else {
+		ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
+		if (ret == nr_cpu_ids || ret == cpu) {
+			if (rq->nr_running)
+				return 1;
+		}
+	}
+	return 0;
+}
+#else
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
+#endif
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
+ */
 static void run_rebalance_domains(struct softirq_action *h)
 {
 	int this_cpu = smp_processor_id();
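Illustration (not part of the patch): nohz_kick_needed() treats first_pick_cpu and second_pick_cpu as compare-and-swap slots, with nr_cpu_ids meaning "free". A standalone C11 sketch of just the claiming logic (the time and idle_at_tick checks are omitted, and NR_CPU_IDS is a stand-in constant):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_CPU_IDS 8	/* stand-in for the kernel's nr_cpu_ids */

static atomic_int first_pick_cpu = NR_CPU_IDS;
static atomic_int second_pick_cpu = NR_CPU_IDS;

/* Returns true if this busy CPU should kick the idle load balancer. */
static bool kick_needed(int cpu, int nr_running)
{
	int expected = NR_CPU_IDS;

	if (atomic_compare_exchange_strong(&first_pick_cpu, &expected, cpu) ||
	    expected == cpu) {
		/* First busy CPU: release the second slot if we held it,
		 * and kick only when more than one task is runnable. */
		expected = cpu;
		atomic_compare_exchange_strong(&second_pick_cpu, &expected,
					       NR_CPU_IDS);
		return nr_running > 1;
	}

	expected = NR_CPU_IDS;
	if (atomic_compare_exchange_strong(&second_pick_cpu, &expected, cpu) ||
	    expected == cpu) {
		/* Second busy CPU: kick as soon as it has any load. */
		return nr_running > 0;
	}

	return false;
}

int main(void)
{
	printf("%d\n", kick_needed(0, 1));	/* first pick, single task: 0 */
	printf("%d\n", kick_needed(1, 1));	/* second pick, has load:   1 */
	printf("%d\n", kick_needed(0, 2));	/* first pick, >1 task:     1 */
	return 0;
}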
@@ -3399,37 +3671,12 @@ static void run_rebalance_domains(struct softirq_action *h)
 
 	rebalance_domains(this_cpu, idle);
 
-#ifdef CONFIG_NO_HZ
 	/*
-	 * If this cpu is the owner for idle load balancing, then do the
+	 * If this cpu has a pending nohz_balance_kick, then do the
 	 * balancing on behalf of the other idle cpus whose ticks are
 	 * stopped.
 	 */
-	if (this_rq->idle_at_tick &&
-	    atomic_read(&nohz.load_balancer) == this_cpu) {
-		struct rq *rq;
-		int balance_cpu;
-
-		for_each_cpu(balance_cpu, nohz.cpu_mask) {
-			if (balance_cpu == this_cpu)
-				continue;
-
-			/*
-			 * If this cpu gets work to do, stop the load balancing
-			 * work being done for other cpus. Next load
-			 * balancing owner will pick it up.
-			 */
-			if (need_resched())
-				break;
-
-			rebalance_domains(balance_cpu, CPU_IDLE);
-
-			rq = cpu_rq(balance_cpu);
-			if (time_after(this_rq->next_balance, rq->next_balance))
-				this_rq->next_balance = rq->next_balance;
-		}
-	}
-#endif
+	nohz_idle_balance(this_cpu, idle);
 }
 
 static inline int on_null_domain(int cpu)
@@ -3439,57 +3686,17 @@ static inline int on_null_domain(int cpu)
 
 /*
  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
- *
- * In case of CONFIG_NO_HZ, this is the place where we nominate a new
- * idle load balancing owner or decide to stop the periodic load balancing,
- * if the whole system is idle.
  */
 static inline void trigger_load_balance(struct rq *rq, int cpu)
 {
-#ifdef CONFIG_NO_HZ
-	/*
-	 * If we were in the nohz mode recently and busy at the current
-	 * scheduler tick, then check if we need to nominate new idle
-	 * load balancer.
-	 */
-	if (rq->in_nohz_recently && !rq->idle_at_tick) {
-		rq->in_nohz_recently = 0;
-
-		if (atomic_read(&nohz.load_balancer) == cpu) {
-			cpumask_clear_cpu(cpu, nohz.cpu_mask);
-			atomic_set(&nohz.load_balancer, -1);
-		}
-
-		if (atomic_read(&nohz.load_balancer) == -1) {
-			int ilb = find_new_ilb(cpu);
-
-			if (ilb < nr_cpu_ids)
-				resched_cpu(ilb);
-		}
-	}
-
-	/*
-	 * If this cpu is idle and doing idle load balancing for all the
-	 * cpus with ticks stopped, is it time for that to stop?
-	 */
-	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
-	    cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
-		resched_cpu(cpu);
-		return;
-	}
-
-	/*
-	 * If this cpu is idle and the idle load balancing is done by
-	 * someone else, then no need raise the SCHED_SOFTIRQ
-	 */
-	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
-	    cpumask_test_cpu(cpu, nohz.cpu_mask))
-		return;
-#endif
 	/* Don't need to rebalance while attached to NULL domain */
 	if (time_after_eq(jiffies, rq->next_balance) &&
 	    likely(!on_null_domain(cpu)))
 		raise_softirq(SCHED_SOFTIRQ);
+#ifdef CONFIG_NO_HZ
+	else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
+		nohz_balancer_kick(cpu);
+#endif
 }
 
 static void rq_online_fair(struct rq *rq)
@@ -3542,6 +3749,8 @@ static void task_fork_fair(struct task_struct *p)
 
 	raw_spin_lock_irqsave(&rq->lock, flags);
 
+	update_rq_clock(rq);
+
 	if (unlikely(task_cpu(p) != this_cpu))
 		__set_task_cpu(p, this_cpu);
 