Diffstat (limited to 'kernel/sched_fair.c')

-rw-r--r--   kernel/sched_fair.c   545
1 files changed, 377 insertions, 168 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a878b5332daa..db3f674ca49d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -54,13 +54,13 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
  * Minimal preemption granularity for CPU-bound tasks:
  * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
-unsigned int sysctl_sched_min_granularity = 2000000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL;
+unsigned int sysctl_sched_min_granularity = 750000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
 
 /*
  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
  */
-static unsigned int sched_nr_latency = 3;
+static unsigned int sched_nr_latency = 8;
 
 /*
  * After fork, child runs first. If set to 0 (default) then
@@ -1313,7 +1313,7 @@ static struct sched_group *
 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                   int this_cpu, int load_idx)
 {
-        struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+        struct sched_group *idlest = NULL, *group = sd->groups;
         unsigned long min_load = ULONG_MAX, this_load = 0;
         int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
@@ -1348,7 +1348,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 
                 if (local_group) {
                         this_load = avg_load;
-                        this = group;
                 } else if (avg_load < min_load) {
                         min_load = avg_load;
                         idlest = group;
@@ -2268,8 +2267,6 @@ unsigned long scale_rt_power(int cpu)
         struct rq *rq = cpu_rq(cpu);
         u64 total, available;
 
-        sched_avg_update(rq);
-
         total = sched_avg_period() + (rq->clock - rq->age_stamp);
         available = total - rq->rt_avg;
 
@@ -2287,13 +2284,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
         unsigned long power = SCHED_LOAD_SCALE;
         struct sched_group *sdg = sd->groups;
 
-        if (sched_feat(ARCH_POWER))
-                power *= arch_scale_freq_power(sd, cpu);
-        else
-                power *= default_scale_freq_power(sd, cpu);
-
-        power >>= SCHED_LOAD_SHIFT;
-
         if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
                 if (sched_feat(ARCH_POWER))
                         power *= arch_scale_smt_power(sd, cpu);
@@ -2303,6 +2293,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
                 power >>= SCHED_LOAD_SHIFT;
         }
 
+        sdg->cpu_power_orig = power;
+
+        if (sched_feat(ARCH_POWER))
+                power *= arch_scale_freq_power(sd, cpu);
+        else
+                power *= default_scale_freq_power(sd, cpu);
+
+        power >>= SCHED_LOAD_SHIFT;
+
         power *= scale_rt_power(cpu);
         power >>= SCHED_LOAD_SHIFT;
 
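Note on the hunk above: cpu_power_orig is recorded after the SMT scaling but before the frequency and rt-average scaling, so the fix_small_capacity() helper added further down can compare the fully scaled cpu_power against that baseline. The standalone sketch below only mirrors the fixed-point order of operations; SCHED_LOAD_SCALE/SCHED_LOAD_SHIFT match the kernel's 1024-based unit, but the three scale factors are invented for illustration.

/* Standalone sketch of the fixed-point power scaling order used above.
 * The sample scale factors are made up for illustration.
 */
#include <stdio.h>

#define SCHED_LOAD_SCALE  1024UL
#define SCHED_LOAD_SHIFT  10

int main(void)
{
        unsigned long power = SCHED_LOAD_SCALE; /* start at nominal capacity */
        unsigned long smt_scale  = 589;         /* e.g. one of two SMT siblings */
        unsigned long freq_scale = 1024;        /* running at full frequency */
        unsigned long rt_scale   = 922;         /* ~10% of time eaten by RT/irq */
        unsigned long power_orig;

        power = (power * smt_scale) >> SCHED_LOAD_SHIFT;
        power_orig = power;                     /* what cpu_power_orig snapshots */

        power = (power * freq_scale) >> SCHED_LOAD_SHIFT;
        power = (power * rt_scale) >> SCHED_LOAD_SHIFT;

        printf("cpu_power_orig=%lu cpu_power=%lu\n", power_orig, power);
        return 0;
}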
@@ -2335,6 +2334,31 @@ static void update_group_power(struct sched_domain *sd, int cpu)
         sdg->cpu_power = power;
 }
 
+/*
+ * Try and fix up capacity for tiny siblings, this is needed when
+ * things like SD_ASYM_PACKING need f_b_g to select another sibling
+ * which on its own isn't powerful enough.
+ *
+ * See update_sd_pick_busiest() and check_asym_packing().
+ */
+static inline int
+fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
+{
+        /*
+         * Only siblings can have significantly less than SCHED_LOAD_SCALE
+         */
+        if (sd->level != SD_LV_SIBLING)
+                return 0;
+
+        /*
+         * If ~90% of the cpu_power is still there, we're good.
+         */
+        if (group->cpu_power * 32 > group->cpu_power_orig * 29)
+                return 1;
+
+        return 0;
+}
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @sd: The sched_domain whose statistics are to be updated.
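The "~90%" comment above maps to an integer-only test: cpu_power * 32 > cpu_power_orig * 29 holds exactly when cpu_power exceeds 29/32 ≈ 90.6% of cpu_power_orig. A standalone check with made-up sample values:

/* Standalone check of the 29/32 (~90.6%) threshold used by
 * fix_small_capacity(); the sample values are illustrative only.
 */
#include <stdio.h>

static int mostly_intact(unsigned long power, unsigned long power_orig)
{
        /* same integer comparison as the patch: power > (29/32) * power_orig */
        return power * 32 > power_orig * 29;
}

int main(void)
{
        printf("589 vs 589 -> %d\n", mostly_intact(589, 589)); /* 1: untouched */
        printf("540 vs 589 -> %d\n", mostly_intact(540, 589)); /* 1: ~91.7%%   */
        printf("530 vs 589 -> %d\n", mostly_intact(530, 589)); /* 0: ~90.0%%   */
        return 0;
}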
@@ -2400,14 +2424,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
          * domains. In the newly idle case, we will allow all the cpu's
          * to do the newly idle load balance.
          */
-        if (idle != CPU_NEWLY_IDLE && local_group &&
-            balance_cpu != this_cpu) {
-                *balance = 0;
-                return;
+        if (idle != CPU_NEWLY_IDLE && local_group) {
+                if (balance_cpu != this_cpu) {
+                        *balance = 0;
+                        return;
+                }
+                update_group_power(sd, this_cpu);
         }
 
-        update_group_power(sd, this_cpu);
-
         /* Adjust by relative CPU power of the group */
         sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
 
@@ -2428,6 +2452,51 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 
         sgs->group_capacity =
                 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+        if (!sgs->group_capacity)
+                sgs->group_capacity = fix_small_capacity(sd, group);
+}
+
+/**
+ * update_sd_pick_busiest - return 1 on busiest group
+ * @sd: sched_domain whose statistics are to be checked
+ * @sds: sched_domain statistics
+ * @sg: sched_group candidate to be checked for being the busiest
+ * @sgs: sched_group statistics
+ * @this_cpu: the current cpu
+ *
+ * Determine if @sg is a busier group than the previously selected
+ * busiest group.
+ */
+static bool update_sd_pick_busiest(struct sched_domain *sd,
+                                   struct sd_lb_stats *sds,
+                                   struct sched_group *sg,
+                                   struct sg_lb_stats *sgs,
+                                   int this_cpu)
+{
+        if (sgs->avg_load <= sds->max_load)
+                return false;
+
+        if (sgs->sum_nr_running > sgs->group_capacity)
+                return true;
+
+        if (sgs->group_imb)
+                return true;
+
+        /*
+         * ASYM_PACKING needs to move all the work to the lowest
+         * numbered CPUs in the group, therefore mark all groups
+         * higher than ourself as busy.
+         */
+        if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
+            this_cpu < group_first_cpu(sg)) {
+                if (!sds->busiest)
+                        return true;
+
+                if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
+                        return true;
+        }
+
+        return false;
 }
 
 /**
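update_sd_pick_busiest() factors out the avg_load/over-capacity/imbalance test that a later hunk removes from update_sd_lb_stats(), and adds the ASYM_PACKING tie-break. A toy, userspace rendering of the basic decision order (the ASYM_PACKING branch is omitted, and the struct below is a simplified stand-in, not the kernel's sg_lb_stats):

/* Toy version of the busiest-group decision order; not kernel code. */
#include <stdbool.h>
#include <stdio.h>

struct toy_stats {
        unsigned long avg_load;
        unsigned long sum_nr_running;
        unsigned long group_capacity;
        int group_imb;
};

static bool toy_pick_busiest(unsigned long max_load_so_far,
                             const struct toy_stats *sgs)
{
        if (sgs->avg_load <= max_load_so_far)
                return false;                   /* not even the most loaded */
        if (sgs->sum_nr_running > sgs->group_capacity)
                return true;                    /* over capacity */
        if (sgs->group_imb)
                return true;                    /* internally imbalanced */
        return false;                           /* loaded but coping */
}

int main(void)
{
        struct toy_stats g = { .avg_load = 2048, .sum_nr_running = 3,
                               .group_capacity = 2, .group_imb = 0 };

        printf("busiest? %d\n", toy_pick_busiest(1024, &g));    /* 1 */
        return 0;
}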
@@ -2435,7 +2504,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
  * @sd: sched_domain whose statistics are to be updated.
  * @this_cpu: Cpu for which load balance is currently performed.
  * @idle: Idle status of this_cpu
- * @sd_idle: Idle status of the sched_domain containing group.
+ * @sd_idle: Idle status of the sched_domain containing sg.
  * @cpus: Set of cpus considered for load balancing.
  * @balance: Should we balance.
  * @sds: variable to hold the statistics for this sched_domain.
@@ -2446,7 +2515,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
                         struct sd_lb_stats *sds)
 {
         struct sched_domain *child = sd->child;
-        struct sched_group *group = sd->groups;
+        struct sched_group *sg = sd->groups;
         struct sg_lb_stats sgs;
         int load_idx, prefer_sibling = 0;
 
@@ -2459,21 +2528,20 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
         do {
                 int local_group;
 
-                local_group = cpumask_test_cpu(this_cpu,
-                                               sched_group_cpus(group));
+                local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
                 memset(&sgs, 0, sizeof(sgs));
-                update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
+                update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
                                    local_group, cpus, balance, &sgs);
 
                 if (local_group && !(*balance))
                         return;
 
                 sds->total_load += sgs.group_load;
-                sds->total_pwr += group->cpu_power;
+                sds->total_pwr += sg->cpu_power;
 
                 /*
                  * In case the child domain prefers tasks go to siblings
-                 * first, lower the group capacity to one so that we'll try
+                 * first, lower the sg capacity to one so that we'll try
                  * and move all the excess tasks away.
                  */
                 if (prefer_sibling)
@@ -2481,23 +2549,72 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 
                 if (local_group) {
                         sds->this_load = sgs.avg_load;
-                        sds->this = group;
+                        sds->this = sg;
                         sds->this_nr_running = sgs.sum_nr_running;
                         sds->this_load_per_task = sgs.sum_weighted_load;
-                } else if (sgs.avg_load > sds->max_load &&
-                           (sgs.sum_nr_running > sgs.group_capacity ||
-                            sgs.group_imb)) {
+                } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
                         sds->max_load = sgs.avg_load;
-                        sds->busiest = group;
+                        sds->busiest = sg;
                         sds->busiest_nr_running = sgs.sum_nr_running;
                         sds->busiest_group_capacity = sgs.group_capacity;
                         sds->busiest_load_per_task = sgs.sum_weighted_load;
                         sds->group_imb = sgs.group_imb;
                 }
 
-                update_sd_power_savings_stats(group, sds, local_group, &sgs);
-                group = group->next;
-        } while (group != sd->groups);
+                update_sd_power_savings_stats(sg, sds, local_group, &sgs);
+                sg = sg->next;
+        } while (sg != sd->groups);
+}
+
+int __weak arch_sd_sibling_asym_packing(void)
+{
+        return 0*SD_ASYM_PACKING;
+}
+
+/**
+ * check_asym_packing - Check to see if the group is packed into the
+ *                      sched domain.
+ *
+ * This is primarily intended to be used at the sibling level.  Some
+ * cores like POWER7 prefer to use lower numbered SMT threads.  In the
+ * case of POWER7, it can move to lower SMT modes only when higher
+ * threads are idle.  When in lower SMT modes, the threads will
+ * perform better since they share less core resources.  Hence when we
+ * have idle threads, we want them to be the higher ones.
+ *
+ * This packing function is run on idle threads.  It checks to see if
+ * the busiest CPU in this domain (core in the P7 case) has a higher
+ * CPU number than the packing function is being run on.  Here we are
+ * assuming a lower CPU number will be equivalent to a lower SMT thread
+ * number.
+ *
+ * Returns 1 when packing is required and a task should be moved to
+ * this CPU.  The amount of the imbalance is returned in *imbalance.
+ *
+ * @sd: The sched_domain whose packing is to be checked.
+ * @sds: Statistics of the sched_domain which is to be packed
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: returns amount of imbalance due to packing.
+ */
+static int check_asym_packing(struct sched_domain *sd,
+                              struct sd_lb_stats *sds,
+                              int this_cpu, unsigned long *imbalance)
+{
+        int busiest_cpu;
+
+        if (!(sd->flags & SD_ASYM_PACKING))
+                return 0;
+
+        if (!sds->busiest)
+                return 0;
+
+        busiest_cpu = group_first_cpu(sds->busiest);
+        if (this_cpu > busiest_cpu)
+                return 0;
+
+        *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
+                                       SCHED_LOAD_SCALE);
+        return 1;
 }
 
 /**
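check_asym_packing() hands back the whole busiest group's load as the imbalance, converting avg_load (which was scaled by SCHED_LOAD_SCALE / cpu_power earlier) back into plain load. A standalone version of that arithmetic with made-up inputs; DIV_ROUND_CLOSEST is re-declared here for unsigned operands only:

/* Standalone version of the imbalance computed by check_asym_packing(). */
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL
#define DIV_ROUND_CLOSEST(x, divisor) (((x) + ((divisor) / 2)) / (divisor))

int main(void)
{
        unsigned long max_load = 1536;          /* busiest group's avg_load */
        unsigned long cpu_power = 589;          /* busiest group's cpu_power */
        unsigned long imbalance;

        /* undo the "Adjust by relative CPU power" scaling done earlier */
        imbalance = DIV_ROUND_CLOSEST(max_load * cpu_power, SCHED_LOAD_SCALE);
        printf("imbalance = %lu\n", imbalance); /* prints 884 */
        return 0;
}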
@@ -2692,6 +2809,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
         if (!(*balance))
                 goto ret;
 
+        if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
+            check_asym_packing(sd, &sds, this_cpu, imbalance))
+                return sds.busiest;
+
         if (!sds.busiest || sds.busiest_nr_running == 0)
                 goto out_balanced;
 
@@ -2726,8 +2847,9 @@ ret:
  * find_busiest_queue - find the busiest runqueue among the cpus in group.
  */
 static struct rq *
-find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
-                   unsigned long imbalance, const struct cpumask *cpus)
+find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
+                   enum cpu_idle_type idle, unsigned long imbalance,
+                   const struct cpumask *cpus)
 {
         struct rq *busiest = NULL, *rq;
         unsigned long max_load = 0;
@@ -2738,6 +2860,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
                 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
                 unsigned long wl;
 
+                if (!capacity)
+                        capacity = fix_small_capacity(sd, group);
+
                 if (!cpumask_test_cpu(i, cpus))
                         continue;
 
@@ -2777,9 +2902,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
 /* Working cpumask for load_balance and load_balance_newidle. */
 static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
 
-static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
+static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
+                               int busiest_cpu, int this_cpu)
 {
         if (idle == CPU_NEWLY_IDLE) {
+
+                /*
+                 * ASYM_PACKING needs to force migrate tasks from busy but
+                 * higher numbered CPUs in order to pack all tasks in the
+                 * lowest numbered CPUs.
+                 */
+                if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
+                        return 1;
+
                 /*
                  * The only task running in a non-idle cpu can be moved to this
                  * cpu in an attempt to completely freeup the other CPU
@@ -2854,7 +2989,7 @@ redo:
                 goto out_balanced;
         }
 
-        busiest = find_busiest_queue(group, idle, imbalance, cpus);
+        busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
         if (!busiest) {
                 schedstat_inc(sd, lb_nobusyq[idle]);
                 goto out_balanced;
@@ -2898,7 +3033,8 @@ redo:
                 schedstat_inc(sd, lb_failed[idle]);
                 sd->nr_balance_failed++;
 
-                if (need_active_balance(sd, sd_idle, idle)) {
+                if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
+                                        this_cpu)) {
                         raw_spin_lock_irqsave(&busiest->lock, flags);
 
                         /* don't kick the active_load_balance_cpu_stop,
@@ -3093,13 +3229,40 @@ out_unlock:
 }
 
 #ifdef CONFIG_NO_HZ
+
+static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
+
+static void trigger_sched_softirq(void *data)
+{
+        raise_softirq_irqoff(SCHED_SOFTIRQ);
+}
+
+static inline void init_sched_softirq_csd(struct call_single_data *csd)
+{
+        csd->func = trigger_sched_softirq;
+        csd->info = NULL;
+        csd->flags = 0;
+        csd->priv = 0;
+}
+
+/*
+ * idle load balancing details
+ * - One of the idle CPUs nominates itself as idle load_balancer, while
+ *   entering idle.
+ * - This idle load balancer CPU will also go into tickless mode when
+ *   it is idle, just like all other idle CPUs
+ * - When one of the busy CPUs notices that there may be an idle rebalancing
+ *   needed, it will kick the idle load balancer, which then does idle
+ *   load balancing for all the idle CPUs.
+ */
 static struct {
         atomic_t load_balancer;
-        cpumask_var_t cpu_mask;
-        cpumask_var_t ilb_grp_nohz_mask;
-} nohz ____cacheline_aligned = {
-        .load_balancer = ATOMIC_INIT(-1),
-};
+        atomic_t first_pick_cpu;
+        atomic_t second_pick_cpu;
+        cpumask_var_t idle_cpus_mask;
+        cpumask_var_t grp_idle_mask;
+        unsigned long next_balance;     /* in jiffy units */
+} nohz ____cacheline_aligned;
 
 int get_nohz_load_balancer(void)
 {
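The nohz bookkeeping above drops the static ATOMIC_INIT(-1) initializer because the "no CPU" sentinel becomes nr_cpu_ids, a runtime value; the later hunks in this diff claim and release the load_balancer role with cmpxchg against that sentinel (any boot-time initialization happens outside this file and is not shown here). A minimal userspace model of that claim/release pattern, using C11 atomics; NR_CPU_IDS and the helper names are invented for the sketch:

/* Model of claiming/releasing the nohz load_balancer slot with
 * compare-and-swap, using nr_cpu_ids as the "nobody" sentinel.
 */
#include <stdatomic.h>
#include <stdio.h>

#define NR_CPU_IDS 8

static atomic_int load_balancer = NR_CPU_IDS;   /* nobody owns the role */

static int try_claim_ilb(int cpu)
{
        int expected = NR_CPU_IDS;

        /* mirrors: atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, cpu) */
        return atomic_compare_exchange_strong(&load_balancer, &expected, cpu);
}

static void release_ilb(int cpu)
{
        int expected = cpu;

        /* mirrors: atomic_cmpxchg(&nohz.load_balancer, cpu, nr_cpu_ids) */
        atomic_compare_exchange_strong(&load_balancer, &expected, NR_CPU_IDS);
}

int main(void)
{
        printf("cpu2 claims: %d\n", try_claim_ilb(2));  /* 1: slot was free */
        printf("cpu5 claims: %d\n", try_claim_ilb(5));  /* 0: cpu2 owns it  */
        release_ilb(2);
        printf("cpu5 claims: %d\n", try_claim_ilb(5));  /* 1: free again    */
        return 0;
}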
@@ -3153,17 +3316,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
  */
 static inline int is_semi_idle_group(struct sched_group *ilb_group)
 {
-        cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+        cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
                     sched_group_cpus(ilb_group));
 
         /*
          * A sched_group is semi-idle when it has atleast one busy cpu
          * and atleast one idle cpu.
          */
-        if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+        if (cpumask_empty(nohz.grp_idle_mask))
                 return 0;
 
-        if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+        if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
                 return 0;
 
         return 1;
@@ -3196,7 +3359,7 @@ static int find_new_ilb(int cpu)
          * Optimize for the case when we have no idle CPUs or only one
          * idle CPU. Don't walk the sched_domain hierarchy in such cases
          */
-        if (cpumask_weight(nohz.cpu_mask) < 2)
+        if (cpumask_weight(nohz.idle_cpus_mask) < 2)
                 goto out_done;
 
         for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
@@ -3204,7 +3367,7 @@ static int find_new_ilb(int cpu)
 
                 do {
                         if (is_semi_idle_group(ilb_group))
-                                return cpumask_first(nohz.ilb_grp_nohz_mask);
+                                return cpumask_first(nohz.grp_idle_mask);
 
                         ilb_group = ilb_group->next;
 
@@ -3212,98 +3375,116 @@ static int find_new_ilb(int cpu)
         }
 
 out_done:
-        return cpumask_first(nohz.cpu_mask);
+        return nr_cpu_ids;
 }
 #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
 static inline int find_new_ilb(int call_cpu)
 {
-        return cpumask_first(nohz.cpu_mask);
+        return nr_cpu_ids;
 }
 #endif
 
 /*
+ * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
+ * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
+ * CPU (if there is one).
+ */
+static void nohz_balancer_kick(int cpu)
+{
+        int ilb_cpu;
+
+        nohz.next_balance++;
+
+        ilb_cpu = get_nohz_load_balancer();
+
+        if (ilb_cpu >= nr_cpu_ids) {
+                ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
+                if (ilb_cpu >= nr_cpu_ids)
+                        return;
+        }
+
+        if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
+                struct call_single_data *cp;
+
+                cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
+                cp = &per_cpu(remote_sched_softirq_cb, cpu);
+                __smp_call_function_single(ilb_cpu, cp, 0);
+        }
+        return;
+}
+
+/*
  * This routine will try to nominate the ilb (idle load balancing)
  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
- * load balancing on behalf of all those cpus. If all the cpus in the system
- * go into this tickless mode, then there will be no ilb owner (as there is
- * no need for one) and all the cpus will sleep till the next wakeup event
- * arrives...
- *
- * For the ilb owner, tick is not stopped. And this tick will be used
- * for idle load balancing. ilb owner will still be part of
- * nohz.cpu_mask..
+ * load balancing on behalf of all those cpus.
  *
- * While stopping the tick, this cpu will become the ilb owner if there
- * is no other owner. And will be the owner till that cpu becomes busy
- * or if all cpus in the system stop their ticks at which point
- * there is no need for ilb owner.
+ * When the ilb owner becomes busy, we will not have a new ilb owner until some
+ * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
+ * idle load balancing by kicking one of the idle CPUs.
  *
- * When the ilb owner becomes busy, it nominates another owner, during the
- * next busy scheduler_tick()
+ * Ticks are stopped for the ilb owner as well, with a busy CPU kicking this
+ * ilb owner CPU in the future (when there is a need for idle load balancing on
+ * behalf of all idle CPUs).
  */
-int select_nohz_load_balancer(int stop_tick)
+void select_nohz_load_balancer(int stop_tick)
 {
         int cpu = smp_processor_id();
 
         if (stop_tick) {
-                cpu_rq(cpu)->in_nohz_recently = 1;
-
                 if (!cpu_active(cpu)) {
                         if (atomic_read(&nohz.load_balancer) != cpu)
-                                return 0;
+                                return;
 
                         /*
                          * If we are going offline and still the leader,
                          * give up!
                          */
-                        if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+                        if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+                                           nr_cpu_ids) != cpu)
                                 BUG();
 
-                        return 0;
+                        return;
                 }
 
-                cpumask_set_cpu(cpu, nohz.cpu_mask);
+                cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
 
-                /* time for ilb owner also to sleep */
-                if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
-                        if (atomic_read(&nohz.load_balancer) == cpu)
-                                atomic_set(&nohz.load_balancer, -1);
-                        return 0;
-                }
+                if (atomic_read(&nohz.first_pick_cpu) == cpu)
+                        atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
+                if (atomic_read(&nohz.second_pick_cpu) == cpu)
+                        atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
 
-                if (atomic_read(&nohz.load_balancer) == -1) {
-                        /* make me the ilb owner */
-                        if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
-                                return 1;
-                } else if (atomic_read(&nohz.load_balancer) == cpu) {
+                if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
                         int new_ilb;
 
-                        if (!(sched_smt_power_savings ||
-                                                sched_mc_power_savings))
-                                return 1;
+                        /* make me the ilb owner */
+                        if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
+                                           cpu) != nr_cpu_ids)
+                                return;
+
                         /*
                          * Check to see if there is a more power-efficient
                          * ilb.
                          */
                         new_ilb = find_new_ilb(cpu);
                         if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
-                                atomic_set(&nohz.load_balancer, -1);
+                                atomic_set(&nohz.load_balancer, nr_cpu_ids);
                                 resched_cpu(new_ilb);
-                                return 0;
+                                return;
                         }
-                        return 1;
+                        return;
                 }
         } else {
-                if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
-                        return 0;
+                if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+                        return;
 
-                cpumask_clear_cpu(cpu, nohz.cpu_mask);
+                cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
 
                 if (atomic_read(&nohz.load_balancer) == cpu)
-                        if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+                        if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+                                           nr_cpu_ids) != cpu)
                                 BUG();
         }
-        return 0;
+        return;
 }
 #endif
 
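nohz_balancer_kick() above prefers the registered load balancer as the kick target and falls back to the first idle CPU, giving up when neither exists. A userspace model of just that selection step; the mask handling is simplified to a plain unsigned long and all names are invented for the sketch:

/* Model of the ilb-target selection in nohz_balancer_kick(). */
#include <stdio.h>

#define NR_CPU_IDS 8

static int first_cpu(unsigned long mask)
{
        for (int cpu = 0; cpu < NR_CPU_IDS; cpu++)
                if (mask & (1UL << cpu))
                        return cpu;
        return NR_CPU_IDS;                      /* like cpumask_first() past the end */
}

static int pick_ilb_target(int load_balancer, unsigned long idle_mask)
{
        int ilb_cpu = load_balancer;

        if (ilb_cpu >= NR_CPU_IDS)              /* no registered ilb owner */
                ilb_cpu = first_cpu(idle_mask); /* fall back to any idle CPU */
        return ilb_cpu;                         /* NR_CPU_IDS means: nobody to kick */
}

int main(void)
{
        printf("%d\n", pick_ilb_target(3, 0x50));               /* 3 */
        printf("%d\n", pick_ilb_target(NR_CPU_IDS, 0x50));      /* 4 */
        printf("%d\n", pick_ilb_target(NR_CPU_IDS, 0x0));       /* 8 = nobody */
        return 0;
}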
@@ -3385,11 +3566,102 @@ out:
         rq->next_balance = next_balance;
 }
 
+#ifdef CONFIG_NO_HZ
 /*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
- * In CONFIG_NO_HZ case, the idle load balance owner will do the
+ * In CONFIG_NO_HZ case, the idle balance kickee will do the
  * rebalancing for all the cpus for whom scheduler ticks are stopped.
  */
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
+{
+        struct rq *this_rq = cpu_rq(this_cpu);
+        struct rq *rq;
+        int balance_cpu;
+
+        if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
+                return;
+
+        for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+                if (balance_cpu == this_cpu)
+                        continue;
+
+                /*
+                 * If this cpu gets work to do, stop the load balancing
+                 * work being done for other cpus. Next load
+                 * balancing owner will pick it up.
+                 */
+                if (need_resched()) {
+                        this_rq->nohz_balance_kick = 0;
+                        break;
+                }
+
+                raw_spin_lock_irq(&this_rq->lock);
+                update_rq_clock(this_rq);
+                update_cpu_load(this_rq);
+                raw_spin_unlock_irq(&this_rq->lock);
+
+                rebalance_domains(balance_cpu, CPU_IDLE);
+
+                rq = cpu_rq(balance_cpu);
+                if (time_after(this_rq->next_balance, rq->next_balance))
+                        this_rq->next_balance = rq->next_balance;
+        }
+        nohz.next_balance = this_rq->next_balance;
+        this_rq->nohz_balance_kick = 0;
+}
+
+/*
+ * Current heuristic for kicking the idle load balancer
+ * - first_pick_cpu is one of the busy CPUs. It will kick
+ *   idle load balancer when it has more than one process active. This
+ *   eliminates the need for idle load balancing altogether when we have
+ *   only one running process in the system (common case).
+ * - If there are more than one busy CPU, idle load balancer may have
+ *   to run for active_load_balance to happen (i.e., two busy CPUs are
+ *   SMT or core siblings and can run better if they move to different
+ *   physical CPUs). So, second_pick_cpu is the second of the busy CPUs
+ *   which will kick idle load balancer as soon as it has any load.
+ */
+static inline int nohz_kick_needed(struct rq *rq, int cpu)
+{
+        unsigned long now = jiffies;
+        int ret;
+        int first_pick_cpu, second_pick_cpu;
+
+        if (time_before(now, nohz.next_balance))
+                return 0;
+
+        if (rq->idle_at_tick)
+                return 0;
+
+        first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
+        second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
+
+        if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
+            second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
+                return 0;
+
+        ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
+        if (ret == nr_cpu_ids || ret == cpu) {
+                atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
+                if (rq->nr_running > 1)
+                        return 1;
+        } else {
+                ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
+                if (ret == nr_cpu_ids || ret == cpu) {
+                        if (rq->nr_running)
+                                return 1;
+                }
+        }
+        return 0;
+}
+#else
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
+#endif
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
+ */
 static void run_rebalance_domains(struct softirq_action *h)
 {
         int this_cpu = smp_processor_id();
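The first_pick/second_pick heuristic above means the first busy CPU to claim first_pick_cpu only kicks the idle balancer once it has more than one runnable task, while a second, different busy CPU kicks as soon as it has any load at all. A single-threaded userspace model of just the cmpxchg part (the time_before()/idle fast paths are omitted, and the helper and variable names are invented):

/* Model of the first_pick/second_pick kick heuristic in nohz_kick_needed(). */
#include <stdatomic.h>
#include <stdio.h>

#define NR_CPU_IDS 8

static atomic_int first_pick  = NR_CPU_IDS;
static atomic_int second_pick = NR_CPU_IDS;

static int kick_needed(int cpu, int nr_running)
{
        int ret = NR_CPU_IDS;

        /* try to become (or confirm we are) the first busy CPU */
        if (atomic_compare_exchange_strong(&first_pick, &ret, cpu) ||
            ret == cpu) {
                int self = cpu;

                /* only one busy CPU: drop any stale second pick held by us */
                atomic_compare_exchange_strong(&second_pick, &self, NR_CPU_IDS);
                return nr_running > 1;          /* kick only with >1 task queued */
        }

        /* otherwise try to become the second busy CPU */
        ret = NR_CPU_IDS;
        if (atomic_compare_exchange_strong(&second_pick, &ret, cpu) ||
            ret == cpu)
                return nr_running > 0;          /* any load at all kicks */

        return 0;
}

int main(void)
{
        printf("cpu0, 1 task : %d\n", kick_needed(0, 1));       /* 0 */
        printf("cpu0, 2 tasks: %d\n", kick_needed(0, 2));       /* 1 */
        printf("cpu3, 1 task : %d\n", kick_needed(3, 1));       /* 1: second pick */
        return 0;
}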
@@ -3399,37 +3671,12 @@ static void run_rebalance_domains(struct softirq_action *h)
 
         rebalance_domains(this_cpu, idle);
 
-#ifdef CONFIG_NO_HZ
         /*
-         * If this cpu is the owner for idle load balancing, then do the
+         * If this cpu has a pending nohz_balance_kick, then do the
          * balancing on behalf of the other idle cpus whose ticks are
          * stopped.
          */
-        if (this_rq->idle_at_tick &&
-            atomic_read(&nohz.load_balancer) == this_cpu) {
-                struct rq *rq;
-                int balance_cpu;
-
-                for_each_cpu(balance_cpu, nohz.cpu_mask) {
-                        if (balance_cpu == this_cpu)
-                                continue;
-
-                        /*
-                         * If this cpu gets work to do, stop the load balancing
-                         * work being done for other cpus. Next load
-                         * balancing owner will pick it up.
-                         */
-                        if (need_resched())
-                                break;
-
-                        rebalance_domains(balance_cpu, CPU_IDLE);
-
-                        rq = cpu_rq(balance_cpu);
-                        if (time_after(this_rq->next_balance, rq->next_balance))
-                                this_rq->next_balance = rq->next_balance;
-                }
-        }
-#endif
+        nohz_idle_balance(this_cpu, idle);
 }
 
 static inline int on_null_domain(int cpu)
@@ -3439,57 +3686,17 @@ static inline int on_null_domain(int cpu)
 
 /*
  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
- *
- * In case of CONFIG_NO_HZ, this is the place where we nominate a new
- * idle load balancing owner or decide to stop the periodic load balancing,
- * if the whole system is idle.
  */
 static inline void trigger_load_balance(struct rq *rq, int cpu)
 {
-#ifdef CONFIG_NO_HZ
-        /*
-         * If we were in the nohz mode recently and busy at the current
-         * scheduler tick, then check if we need to nominate new idle
-         * load balancer.
-         */
-        if (rq->in_nohz_recently && !rq->idle_at_tick) {
-                rq->in_nohz_recently = 0;
-
-                if (atomic_read(&nohz.load_balancer) == cpu) {
-                        cpumask_clear_cpu(cpu, nohz.cpu_mask);
-                        atomic_set(&nohz.load_balancer, -1);
-                }
-
-                if (atomic_read(&nohz.load_balancer) == -1) {
-                        int ilb = find_new_ilb(cpu);
-
-                        if (ilb < nr_cpu_ids)
-                                resched_cpu(ilb);
-                }
-        }
-
-        /*
-         * If this cpu is idle and doing idle load balancing for all the
-         * cpus with ticks stopped, is it time for that to stop?
-         */
-        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
-            cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
-                resched_cpu(cpu);
-                return;
-        }
-
-        /*
-         * If this cpu is idle and the idle load balancing is done by
-         * someone else, then no need raise the SCHED_SOFTIRQ
-         */
-        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
-            cpumask_test_cpu(cpu, nohz.cpu_mask))
-                return;
-#endif
         /* Don't need to rebalance while attached to NULL domain */
         if (time_after_eq(jiffies, rq->next_balance) &&
             likely(!on_null_domain(cpu)))
                 raise_softirq(SCHED_SOFTIRQ);
+#ifdef CONFIG_NO_HZ
+        else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
+                nohz_balancer_kick(cpu);
+#endif
 }
 
 static void rq_online_fair(struct rq *rq)
@@ -3542,6 +3749,8 @@ static void task_fork_fair(struct task_struct *p)
 
         raw_spin_lock_irqsave(&rq->lock, flags);
 
+        update_rq_clock(rq);
+
         if (unlikely(task_cpu(p) != this_cpu))
                 __set_task_cpu(p, this_cpu);
 