Diffstat (limited to 'kernel/sched_fair.c')
 kernel/sched_fair.c | 81
 1 file changed, 63 insertions(+), 18 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index db3f674ca49d..933f3d1b62ea 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -25,7 +25,7 @@
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
- * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * NOTE: this latency value is not the same as the concept of
  * 'timeslice length' - timeslices in CFS are of variable length
@@ -52,7 +52,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
 
 /*
  * Minimal preemption granularity for CPU-bound tasks:
- * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
 unsigned int sysctl_sched_min_granularity = 750000ULL;
 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
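
For context, the comments above describe each default as a base value times (1 + ilog(ncpus)). A small user-space sketch, assuming the logarithmic scaling factor is applied exactly as those comments state (not kernel code), of what the effective values work out to:

#include <stdio.h>

/* integer log2, mirroring what "ilog" in the comments is assumed to mean */
static unsigned int ilog2_u(unsigned int n)
{
	unsigned int r = 0;

	while (n >>= 1)
		r++;
	return r;
}

int main(void)
{
	const unsigned int ncpus[] = { 1, 2, 4, 8 };
	const unsigned long long base_latency = 6000000ULL;	/* 6 ms */
	const unsigned long long base_granularity = 750000ULL;	/* 0.75 ms */

	for (unsigned int i = 0; i < sizeof(ncpus) / sizeof(ncpus[0]); i++) {
		unsigned int factor = 1 + ilog2_u(ncpus[i]);

		printf("%u cpus: latency %llu ns, min granularity %llu ns\n",
		       ncpus[i], base_latency * factor,
		       base_granularity * factor);
	}
	return 0;
}

Under that assumption, an 8-CPU box gets a factor of 4: roughly 24 ms targeted latency and 3 ms minimum granularity.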
@@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 static void update_curr(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *curr = cfs_rq->curr;
-	u64 now = rq_of(cfs_rq)->clock;
+	u64 now = rq_of(cfs_rq)->clock_task;
 	unsigned long delta_exec;
 
 	if (unlikely(!curr))
@@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	/*
 	 * We are starting a new run period:
 	 */
-	se->exec_start = rq_of(cfs_rq)->clock;
+	se->exec_start = rq_of(cfs_rq)->clock_task;
 }
 
 /**************************************************
@@ -1764,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
 	set_task_cpu(p, this_cpu);
 	activate_task(this_rq, p, 0);
 	check_preempt_curr(this_rq, p, 0);
+
+	/* re-arm NEWIDLE balancing when moving tasks */
+	src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
+	this_rq->idle_stamp = 0;
 }
 
 /*
@@ -1798,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 	 * 2) too many balance attempts have failed.
 	 */
 
-	tsk_cache_hot = task_hot(p, rq->clock, sd);
+	tsk_cache_hot = task_hot(p, rq->clock_task, sd);
 	if (!tsk_cache_hot ||
 			sd->nr_balance_failed > sd->cache_nice_tries) {
 #ifdef CONFIG_SCHEDSTATS
@@ -2030,12 +2034,14 @@ struct sd_lb_stats {
 	unsigned long this_load;
 	unsigned long this_load_per_task;
 	unsigned long this_nr_running;
+	unsigned long this_has_capacity;
 
 	/* Statistics of the busiest group */
 	unsigned long max_load;
 	unsigned long busiest_load_per_task;
 	unsigned long busiest_nr_running;
 	unsigned long busiest_group_capacity;
+	unsigned long busiest_has_capacity;
 
 	int group_imb; /* Is there imbalance in this sd */
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2058,6 +2064,7 @@ struct sg_lb_stats {
 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
 	unsigned long group_capacity;
 	int group_imb; /* Is there an imbalance in the group ? */
+	int group_has_capacity; /* Is there extra capacity in the group? */
 };
 
 /**
@@ -2268,7 +2275,13 @@ unsigned long scale_rt_power(int cpu)
 	u64 total, available;
 
 	total = sched_avg_period() + (rq->clock - rq->age_stamp);
-	available = total - rq->rt_avg;
+
+	if (unlikely(total < rq->rt_avg)) {
+		/* Ensures that power won't end up being negative */
+		available = 0;
+	} else {
+		available = total - rq->rt_avg;
+	}
 
 	if (unlikely((s64)total < SCHED_LOAD_SCALE))
 		total = SCHED_LOAD_SCALE;
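
The hunk above guards an unsigned 64-bit subtraction. A standalone sketch (plain C with example values, not kernel code) of the wrap-around the guard avoids:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t total = 1000, rt_avg = 1500;	/* rt_avg briefly exceeds total */

	uint64_t unguarded = total - rt_avg;	/* wraps to a huge value */
	uint64_t guarded = (total < rt_avg) ? 0 : total - rt_avg;

	printf("unguarded: %llu\n", (unsigned long long)unguarded);
	printf("guarded:   %llu\n", (unsigned long long)guarded);
	return 0;
}

Without the guard, the wrapped "available" value would wildly inflate the computed CPU power instead of clamping it at zero.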
@@ -2378,7 +2391,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 			int local_group, const struct cpumask *cpus,
 			int *balance, struct sg_lb_stats *sgs)
 {
-	unsigned long load, max_cpu_load, min_cpu_load;
+	unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
 	int i;
 	unsigned int balance_cpu = -1, first_idle_cpu = 0;
 	unsigned long avg_load_per_task = 0;
@@ -2389,6 +2402,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	/* Tally up the load of all CPUs in the group */
 	max_cpu_load = 0;
 	min_cpu_load = ~0UL;
+	max_nr_running = 0;
 
 	for_each_cpu_and(i, sched_group_cpus(group), cpus) {
 		struct rq *rq = cpu_rq(i);
@@ -2406,8 +2420,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 			load = target_load(i, load_idx);
 		} else {
 			load = source_load(i, load_idx);
-			if (load > max_cpu_load)
+			if (load > max_cpu_load) {
 				max_cpu_load = load;
+				max_nr_running = rq->nr_running;
+			}
 			if (min_cpu_load > load)
 				min_cpu_load = load;
 		}
@@ -2447,13 +2463,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	if (sgs->sum_nr_running)
 		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 
-	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
 		sgs->group_imb = 1;
 
-	sgs->group_capacity =
-		DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
 	if (!sgs->group_capacity)
 		sgs->group_capacity = fix_small_capacity(sd, group);
+
+	if (sgs->group_capacity > sgs->sum_nr_running)
+		sgs->group_has_capacity = 1;
 }
 
 /**
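
A worked example (made-up numbers, user-space C, not the kernel's definitions) of the group_capacity / group_has_capacity calculation introduced above:

#include <stdio.h>

#define SCHED_LOAD_SCALE	1024UL
#define DIV_ROUND_CLOSEST(x, divisor)	(((x) + ((divisor) / 2)) / (divisor))

int main(void)
{
	unsigned long cpu_power = 2 * SCHED_LOAD_SCALE;	/* a two-CPU group */
	unsigned long capacity = DIV_ROUND_CLOSEST(cpu_power, SCHED_LOAD_SCALE);

	for (unsigned long nr_running = 1; nr_running <= 3; nr_running++)
		printf("capacity=%lu nr_running=%lu has_capacity=%d\n",
		       capacity, nr_running, capacity > nr_running);
	return 0;
}

With cpu_power worth two load-scale units, capacity rounds to 2, so the group reports spare capacity only while it runs a single task.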
@@ -2542,9 +2560,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 		/*
 		 * In case the child domain prefers tasks go to siblings
 		 * first, lower the sg capacity to one so that we'll try
-		 * and move all the excess tasks away.
+		 * and move all the excess tasks away. We lower the capacity
+		 * of a group only if the local group has the capacity to fit
+		 * these excess tasks, i.e. nr_running < group_capacity. The
+		 * extra check prevents the case where you always pull from the
+		 * heaviest group when it is already under-utilized (possible
+		 * with a large weight task outweighs the tasks on the system).
 		 */
-		if (prefer_sibling)
+		if (prefer_sibling && !local_group && sds->this_has_capacity)
 			sgs.group_capacity = min(sgs.group_capacity, 1UL);
 
 		if (local_group) {
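
The longer comment boils down to a three-way condition; a compact decision-table sketch (simplified and illustrative only, names are not the kernel's):

#include <stdbool.h>
#include <stdio.h>

static bool clamp_group_capacity(bool prefer_sibling, bool local_group,
				 bool this_has_capacity)
{
	/* only clamp a remote group, and only when we can absorb its tasks */
	return prefer_sibling && !local_group && this_has_capacity;
}

int main(void)
{
	printf("%d\n", clamp_group_capacity(true, false, true));	/* 1: clamp to one */
	printf("%d\n", clamp_group_capacity(true, false, false));	/* 0: local group is full */
	printf("%d\n", clamp_group_capacity(true, true, true));	/* 0: never clamp the local group */
	return 0;
}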
@@ -2552,12 +2575,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 			sds->this = sg;
 			sds->this_nr_running = sgs.sum_nr_running;
 			sds->this_load_per_task = sgs.sum_weighted_load;
+			sds->this_has_capacity = sgs.group_has_capacity;
 		} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
 			sds->max_load = sgs.avg_load;
 			sds->busiest = sg;
 			sds->busiest_nr_running = sgs.sum_nr_running;
 			sds->busiest_group_capacity = sgs.group_capacity;
 			sds->busiest_load_per_task = sgs.sum_weighted_load;
+			sds->busiest_has_capacity = sgs.group_has_capacity;
 			sds->group_imb = sgs.group_imb;
 		}
 
@@ -2754,6 +2779,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 		return fix_small_imbalance(sds, this_cpu, imbalance);
 
 }
+
 /******* find_busiest_group() helpers end here *********************/
 
 /**
@@ -2805,6 +2831,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	 * 4) This group is more busy than the avg busieness at this
 	 *    sched_domain.
 	 * 5) The imbalance is within the specified limit.
+	 *
+	 * Note: when doing newidle balance, if the local group has excess
+	 * capacity (i.e. nr_running < group_capacity) and the busiest group
+	 * does not have any capacity, we force a load balance to pull tasks
+	 * to the local group. In this case, we skip past checks 3, 4 and 5.
 	 */
 	if (!(*balance))
 		goto ret;
@@ -2816,6 +2847,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (!sds.busiest || sds.busiest_nr_running == 0)
 		goto out_balanced;
 
+	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
+	if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
+			!sds.busiest_has_capacity)
+		goto force_balance;
+
 	if (sds.this_load >= sds.max_load)
 		goto out_balanced;
 
@@ -2827,6 +2863,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
 		goto out_balanced;
 
+force_balance:
 	/* Looks like there is an imbalance. Compute it */
 	calculate_imbalance(&sds, this_cpu, imbalance);
 	return sds.busiest;
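
Taken together, the hunks above add an early exit for newly idle CPUs. A condensed sketch of that decision (field and type names are abbreviated stand-ins, not the kernel's definitions):

#include <stdbool.h>
#include <stdio.h>

enum cpu_idle_type { CPU_IDLE, CPU_NOT_IDLE, CPU_NEWLY_IDLE };

struct lb_stats {
	bool this_has_capacity;
	bool busiest_has_capacity;
};

/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
static bool force_newidle_balance(enum cpu_idle_type idle,
				  const struct lb_stats *sds)
{
	return idle == CPU_NEWLY_IDLE &&
	       sds->this_has_capacity &&
	       !sds->busiest_has_capacity;
}

int main(void)
{
	struct lb_stats sds = {
		.this_has_capacity = true,
		.busiest_has_capacity = false,
	};

	printf("%d\n", force_newidle_balance(CPU_NEWLY_IDLE, &sds));	/* 1: force the pull */
	printf("%d\n", force_newidle_balance(CPU_NOT_IDLE, &sds));	/* 0: usual load checks apply */
	return 0;
}

When the sketch returns true, the code above jumps straight to force_balance and computes the imbalance, skipping the load-threshold checks.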
@@ -3031,7 +3068,14 @@ redo:
 
 	if (!ld_moved) {
 		schedstat_inc(sd, lb_failed[idle]);
-		sd->nr_balance_failed++;
+		/*
+		 * Increment the failure counter only on periodic balance.
+		 * We do not want newidle balance, which can be very
+		 * frequent, pollute the failure counter causing
+		 * excessive cache_hot migrations and active balances.
+		 */
+		if (idle != CPU_NEWLY_IDLE)
+			sd->nr_balance_failed++;
 
 		if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
 					this_cpu)) {
@@ -3153,10 +3197,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 		interval = msecs_to_jiffies(sd->balance_interval);
 		if (time_after(next_balance, sd->last_balance + interval))
 			next_balance = sd->last_balance + interval;
-		if (pulled_task) {
-			this_rq->idle_stamp = 0;
+		if (pulled_task)
 			break;
-		}
 	}
 
 	raw_spin_lock(&this_rq->lock);
@@ -3751,8 +3793,11 @@ static void task_fork_fair(struct task_struct *p)
 
 	update_rq_clock(rq);
 
-	if (unlikely(task_cpu(p) != this_cpu))
+	if (unlikely(task_cpu(p) != this_cpu)) {
+		rcu_read_lock();
 		__set_task_cpu(p, this_cpu);
+		rcu_read_unlock();
+	}
 
 	update_curr(cfs_rq);
 
