Diffstat (limited to 'kernel/sched/fair.c')
 kernel/sched/fair.c | 437 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 271 insertions(+), 166 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7ce18f3c097a..ffeaa4105e48 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -670,6 +670,7 @@ static int select_idle_sibling(struct task_struct *p, int cpu);
 static unsigned long task_h_load(struct task_struct *p);
 
 static inline void __update_task_entity_contrib(struct sched_entity *se);
+static inline void __update_task_entity_utilization(struct sched_entity *se);
 
 /* Give new task start runnable values to heavy its load in infant time */
 void init_task_runnable_average(struct task_struct *p)
@@ -677,9 +678,10 @@ void init_task_runnable_average(struct task_struct *p)
 	u32 slice;
 
 	slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
-	p->se.avg.runnable_avg_sum = slice;
-	p->se.avg.runnable_avg_period = slice;
+	p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice;
+	p->se.avg.avg_period = slice;
 	__update_task_entity_contrib(&p->se);
+	__update_task_entity_utilization(&p->se);
 }
 #else
 void init_task_runnable_average(struct task_struct *p)
@@ -1196,9 +1198,11 @@ static void task_numa_assign(struct task_numa_env *env,
 static bool load_too_imbalanced(long src_load, long dst_load,
 				struct task_numa_env *env)
 {
-	long imb, old_imb;
-	long orig_src_load, orig_dst_load;
 	long src_capacity, dst_capacity;
+	long orig_src_load;
+	long load_a, load_b;
+	long moved_load;
+	long imb;
 
 	/*
 	 * The load is corrected for the CPU capacity available on each node.
@@ -1211,30 +1215,39 @@ static bool load_too_imbalanced(long src_load, long dst_load,
 	dst_capacity = env->dst_stats.compute_capacity;
 
 	/* We care about the slope of the imbalance, not the direction. */
-	if (dst_load < src_load)
-		swap(dst_load, src_load);
+	load_a = dst_load;
+	load_b = src_load;
+	if (load_a < load_b)
+		swap(load_a, load_b);
 
 	/* Is the difference below the threshold? */
-	imb = dst_load * src_capacity * 100 -
-	      src_load * dst_capacity * env->imbalance_pct;
+	imb = load_a * src_capacity * 100 -
+	      load_b * dst_capacity * env->imbalance_pct;
 	if (imb <= 0)
 		return false;
 
 	/*
 	 * The imbalance is above the allowed threshold.
-	 * Compare it with the old imbalance.
+	 * Allow a move that brings us closer to a balanced situation,
+	 * without moving things past the point of balance.
 	 */
 	orig_src_load = env->src_stats.load;
-	orig_dst_load = env->dst_stats.load;
 
-	if (orig_dst_load < orig_src_load)
-		swap(orig_dst_load, orig_src_load);
-
-	old_imb = orig_dst_load * src_capacity * 100 -
-		  orig_src_load * dst_capacity * env->imbalance_pct;
+	/*
+	 * In a task swap, there will be one load moving from src to dst,
+	 * and another moving back. This is the net sum of both moves.
+	 * A simple task move will always have a positive value.
+	 * Allow the move if it brings the system closer to a balanced
+	 * situation, without crossing over the balance point.
+	 */
+	moved_load = orig_src_load - src_load;
 
-	/* Would this change make things worse? */
-	return (imb > old_imb);
+	if (moved_load > 0)
+		/* Moving src -> dst. Did we overshoot balance? */
+		return src_load * dst_capacity < dst_load * src_capacity;
+	else
+		/* Moving dst -> src. Did we overshoot balance? */
+		return dst_load * src_capacity < src_load * dst_capacity;
 }
 
 /*
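The rewritten check above no longer compares against the old imbalance; it only refuses a move once the capacity-scaled loads cross the balance point. A minimal standalone sketch of that comparison, using made-up loads and capacities (the numbers are not from the patch):

#include <stdbool.h>

/* Hypothetical values: src node twice as powerful as dst. */
static bool overshoots_balance(void)
{
	long src_load = 600, dst_load = 500;		/* loads after the move */
	long src_capacity = 1024, dst_capacity = 512;

	/* Moving src -> dst overshoots if src ends up relatively less
	 * loaded than dst: 600/1024 < 500/512, cross-multiplied. */
	return src_load * dst_capacity < dst_load * src_capacity;
	/* 600 * 512 = 307200 < 500 * 1024 = 512000 -> true, move refused. */
}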
@@ -1609,9 +1622,11 @@ static void update_task_scan_period(struct task_struct *p,
 	/*
 	 * If there were no record hinting faults then either the task is
 	 * completely idle or all activity is areas that are not of interest
-	 * to automatic numa balancing. Scan slower
+	 * to automatic numa balancing. Related to that, if there were failed
+	 * migration then it implies we are migrating too quickly or the local
+	 * node is overloaded. In either case, scan slower
 	 */
-	if (local + shared == 0) {
+	if (local + shared == 0 || p->numa_faults_locality[2]) {
 		p->numa_scan_period = min(p->numa_scan_period_max,
 			p->numa_scan_period << 1);
 
@@ -1673,7 +1688,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
 		*period = now - p->last_task_numa_placement;
 	} else {
 		delta = p->se.avg.runnable_avg_sum;
-		*period = p->se.avg.runnable_avg_period;
+		*period = p->se.avg.avg_period;
 	}
 
 	p->last_sum_exec_runtime = runtime;
@@ -1763,6 +1778,8 @@ static int preferred_group_nid(struct task_struct *p, int nid)
 			}
 		}
 		/* Next round, evaluate the nodes within max_group. */
+		if (!max_faults)
+			break;
 		nodes = max_group;
 	}
 	return nid;
@@ -2080,6 +2097,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 
 	if (migrated)
 		p->numa_pages_migrated += pages;
+	if (flags & TNF_MIGRATE_FAIL)
+		p->numa_faults_locality[2] += pages;
 
 	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
 	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
@@ -2161,8 +2180,10 @@ void task_numa_work(struct callback_head *work)
 		vma = mm->mmap;
 	}
 	for (; vma; vma = vma->vm_next) {
-		if (!vma_migratable(vma) || !vma_policy_mof(vma))
+		if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
+			is_vm_hugetlb_page(vma)) {
 			continue;
+		}
 
 		/*
 		 * Shared library pages mapped by multiple processes are not
@@ -2497,13 +2518,15 @@ static u32 __compute_runnable_contrib(u64 n)
  *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
  */
-static __always_inline int __update_entity_runnable_avg(u64 now,
+static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
 							struct sched_avg *sa,
-							int runnable)
+							int runnable,
+							int running)
 {
 	u64 delta, periods;
 	u32 runnable_contrib;
 	int delta_w, decayed = 0;
+	unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
 
 	delta = now - sa->last_runnable_update;
 	/*
@@ -2525,7 +2548,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
 	sa->last_runnable_update = now;
 
 	/* delta_w is the amount already accumulated against our next period */
-	delta_w = sa->runnable_avg_period % 1024;
+	delta_w = sa->avg_period % 1024;
 	if (delta + delta_w >= 1024) {
 		/* period roll-over */
 		decayed = 1;
@@ -2538,7 +2561,10 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
 		delta_w = 1024 - delta_w;
 		if (runnable)
 			sa->runnable_avg_sum += delta_w;
-		sa->runnable_avg_period += delta_w;
+		if (running)
+			sa->running_avg_sum += delta_w * scale_freq
+				>> SCHED_CAPACITY_SHIFT;
+		sa->avg_period += delta_w;
 
 		delta -= delta_w;
 
@@ -2548,20 +2574,28 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
 
 		sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
 						  periods + 1);
-		sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
+		sa->running_avg_sum = decay_load(sa->running_avg_sum,
+						  periods + 1);
+		sa->avg_period = decay_load(sa->avg_period,
 						  periods + 1);
 
 		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
 		runnable_contrib = __compute_runnable_contrib(periods);
 		if (runnable)
 			sa->runnable_avg_sum += runnable_contrib;
-		sa->runnable_avg_period += runnable_contrib;
+		if (running)
+			sa->running_avg_sum += runnable_contrib * scale_freq
+				>> SCHED_CAPACITY_SHIFT;
+		sa->avg_period += runnable_contrib;
 	}
 
 	/* Remainder of delta accrued against u_0` */
 	if (runnable)
 		sa->runnable_avg_sum += delta;
-	sa->runnable_avg_period += delta;
+	if (running)
+		sa->running_avg_sum += delta * scale_freq
+			>> SCHED_CAPACITY_SHIFT;
+	sa->avg_period += delta;
 
 	return decayed;
 }
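With this change running_avg_sum accrues time scaled by arch_scale_freq_capacity(), while avg_period keeps wall-clock time. A sketch of a single accrual step against a struct sched_avg *sa as in the patch, with assumed numbers (scale_freq = 512, i.e. half the maximum frequency, and SCHED_CAPACITY_SHIFT = 10):

u32 delta = 1024;				/* one full 1024us period, illustrative */
unsigned long scale_freq = 512;			/* assumed arch-reported capacity */

sa->runnable_avg_sum += delta;			/* 1024: wall-clock runnable time */
sa->running_avg_sum += delta * scale_freq >> 10;	/* 512: frequency-scaled running time */
sa->avg_period += delta;			/* denominator stays wall-clock */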
@@ -2578,6 +2612,8 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
 		return 0;
 
 	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+	se->avg.utilization_avg_contrib =
+		decay_load(se->avg.utilization_avg_contrib, decays);
 
 	return decays;
 }
@@ -2613,7 +2649,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
 
 	/* The fraction of a cpu used by this cfs_rq */
 	contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
-			  sa->runnable_avg_period + 1);
+			  sa->avg_period + 1);
 	contrib -= cfs_rq->tg_runnable_contrib;
 
 	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
@@ -2666,7 +2702,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 {
-	__update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
+	__update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg,
+			runnable, runnable);
 	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
 }
 #else /* CONFIG_FAIR_GROUP_SCHED */
@@ -2684,7 +2721,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se)
 
 	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
 	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
-	contrib /= (se->avg.runnable_avg_period + 1);
+	contrib /= (se->avg.avg_period + 1);
 	se->avg.load_avg_contrib = scale_load(contrib);
 }
 
@@ -2703,6 +2740,30 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
 	return se->avg.load_avg_contrib - old_contrib;
 }
 
+
+static inline void __update_task_entity_utilization(struct sched_entity *se)
+{
+	u32 contrib;
+
+	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
+	contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE);
+	contrib /= (se->avg.avg_period + 1);
+	se->avg.utilization_avg_contrib = scale_load(contrib);
+}
+
+static long __update_entity_utilization_avg_contrib(struct sched_entity *se)
+{
+	long old_contrib = se->avg.utilization_avg_contrib;
+
+	if (entity_is_task(se))
+		__update_task_entity_utilization(se);
+	else
+		se->avg.utilization_avg_contrib =
+			group_cfs_rq(se)->utilization_load_avg;
+
+	return se->avg.utilization_avg_contrib - old_contrib;
+}
+
 static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
 						 long load_contrib)
 {
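Given those fields, the utilization contribution computed above is just the frequency-scaled running time divided by the elapsed period. A worked example with illustrative numbers, assuming SCHED_LOAD_SCALE is 1024:

u32 running_avg_sum = 23872, avg_period = 47742;	/* task running ~50% of the time */
u32 contrib = (u32)((u64)running_avg_sum * 1024 / (avg_period + 1));
/* contrib ~= 512, i.e. the task contributes about half a CPU of utilization */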
@@ -2719,7 +2780,8 @@ static inline void update_entity_load_avg(struct sched_entity *se,
 					  int update_cfs_rq)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
-	long contrib_delta;
+	long contrib_delta, utilization_delta;
+	int cpu = cpu_of(rq_of(cfs_rq));
 	u64 now;
 
 	/*
@@ -2731,18 +2793,22 @@ static inline void update_entity_load_avg(struct sched_entity *se,
 	else
 		now = cfs_rq_clock_task(group_cfs_rq(se));
 
-	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
+	if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq,
+					cfs_rq->curr == se))
 		return;
 
 	contrib_delta = __update_entity_load_avg_contrib(se);
+	utilization_delta = __update_entity_utilization_avg_contrib(se);
 
 	if (!update_cfs_rq)
 		return;
 
-	if (se->on_rq)
+	if (se->on_rq) {
 		cfs_rq->runnable_load_avg += contrib_delta;
-	else
+		cfs_rq->utilization_load_avg += utilization_delta;
+	} else {
 		subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+	}
 }
 
 /*
@@ -2817,6 +2883,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
 	}
 
 	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+	cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib;
 	/* we force update consideration on load-balancer moves */
 	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
 }
@@ -2835,6 +2902,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
 	update_cfs_rq_blocked_load(cfs_rq, !sleep);
 
 	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+	cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib;
 	if (sleep) {
 		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
 		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
@@ -3172,6 +3240,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		 */
 		update_stats_wait_end(cfs_rq, se);
 		__dequeue_entity(cfs_rq, se);
+		update_entity_load_avg(se, 1);
 	}
 
 	update_stats_curr_start(cfs_rq, se);
@@ -4298,6 +4367,11 @@ static unsigned long capacity_of(int cpu)
 	return cpu_rq(cpu)->cpu_capacity;
 }
 
+static unsigned long capacity_orig_of(int cpu)
+{
+	return cpu_rq(cpu)->cpu_capacity_orig;
+}
+
 static unsigned long cpu_avg_load_per_task(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -4711,6 +4785,33 @@ next:
 done:
 	return target;
 }
+/*
+ * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS
+ * tasks. The unit of the return value must be the one of capacity so we can
+ * compare the usage with the capacity of the CPU that is available for CFS
+ * task (ie cpu_capacity).
+ * cfs.utilization_load_avg is the sum of running time of runnable tasks on a
+ * CPU. It represents the amount of utilization of a CPU in the range
+ * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full
+ * capacity of the CPU because it's about the running time on this CPU.
+ * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE
+ * because of unfortunate rounding in avg_period and running_load_avg or just
+ * after migrating tasks until the average stabilizes with the new running
+ * time. So we need to check that the usage stays into the range
+ * [0..cpu_capacity_orig] and cap if necessary.
+ * Without capping the usage, a group could be seen as overloaded (CPU0 usage
+ * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity
+ */
+static int get_cpu_usage(int cpu)
+{
+	unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg;
+	unsigned long capacity = capacity_orig_of(cpu);
+
+	if (usage >= SCHED_LOAD_SCALE)
+		return capacity;
+
+	return (usage * capacity) >> SCHED_LOAD_SHIFT;
+}
 
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
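A quick worked example of the scaling and capping in get_cpu_usage(), with illustrative values and SCHED_LOAD_SHIFT assumed to be 10:

unsigned long usage = 512, capacity = 800;	/* capacity_orig_of(cpu), hypothetical */
unsigned long scaled = (usage * capacity) >> 10;	/* 400, i.e. half of the CPU */
/* usage = 1100 (> SCHED_LOAD_SCALE, e.g. right after a migration)
 * would instead be capped and return the full 800. */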
@@ -5837,12 +5938,12 @@ struct sg_lb_stats {
 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
 	unsigned long load_per_task;
 	unsigned long group_capacity;
+	unsigned long group_usage; /* Total usage of the group */
 	unsigned int sum_nr_running; /* Nr tasks running in the group */
-	unsigned int group_capacity_factor;
 	unsigned int idle_cpus;
 	unsigned int group_weight;
 	enum group_type group_type;
-	int group_has_free_capacity;
+	int group_no_capacity;
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int nr_numa_running;
 	unsigned int nr_preferred_running;
@@ -5913,16 +6014,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
 	return load_idx;
 }
 
-static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu)
-{
-	return SCHED_CAPACITY_SCALE;
-}
-
-unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
-{
-	return default_scale_capacity(sd, cpu);
-}
-
 static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
 {
 	if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
@@ -5939,7 +6030,7 @@ unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
 static unsigned long scale_rt_capacity(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	u64 total, available, age_stamp, avg;
+	u64 total, used, age_stamp, avg;
 	s64 delta;
 
 	/*
@@ -5955,19 +6046,12 @@ static unsigned long scale_rt_capacity(int cpu)
 
 	total = sched_avg_period() + delta;
 
-	if (unlikely(total < avg)) {
-		/* Ensures that capacity won't end up being negative */
-		available = 0;
-	} else {
-		available = total - avg;
-	}
-
-	if (unlikely((s64)total < SCHED_CAPACITY_SCALE))
-		total = SCHED_CAPACITY_SCALE;
+	used = div_u64(avg, total);
 
-	total >>= SCHED_CAPACITY_SHIFT;
+	if (likely(used < SCHED_CAPACITY_SCALE))
+		return SCHED_CAPACITY_SCALE - used;
 
-	return div_u64(available, total);
+	return 1;
 }
 
 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
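The simplified scale_rt_capacity() only works if rq->rt_avg is accumulated in capacity units rather than plain time (otherwise avg/total would almost always be zero); with that assumption, a sketch with illustrative numbers:

u64 total = 1000000;				/* averaging window, hypothetical */
u64 avg = 256000000;				/* RT/IRQ used ~25% of it at full capacity 1024 */
u64 used = div_u64(avg, total);			/* 256 */
unsigned long left = SCHED_CAPACITY_SCALE - used;	/* 1024 - 256 = 768 remains for CFS */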
@@ -5982,14 +6066,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 
 	capacity >>= SCHED_CAPACITY_SHIFT;
 
-	sdg->sgc->capacity_orig = capacity;
-
-	if (sched_feat(ARCH_CAPACITY))
-		capacity *= arch_scale_freq_capacity(sd, cpu);
-	else
-		capacity *= default_scale_capacity(sd, cpu);
-
-	capacity >>= SCHED_CAPACITY_SHIFT;
+	cpu_rq(cpu)->cpu_capacity_orig = capacity;
 
 	capacity *= scale_rt_capacity(cpu);
 	capacity >>= SCHED_CAPACITY_SHIFT;
@@ -6005,7 +6082,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 {
 	struct sched_domain *child = sd->child;
 	struct sched_group *group, *sdg = sd->groups;
-	unsigned long capacity, capacity_orig;
+	unsigned long capacity;
 	unsigned long interval;
 
 	interval = msecs_to_jiffies(sd->balance_interval);
@@ -6017,7 +6094,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 		return;
 	}
 
-	capacity_orig = capacity = 0;
+	capacity = 0;
 
 	if (child->flags & SD_OVERLAP) {
 		/*
@@ -6037,19 +6114,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 			 * Use capacity_of(), which is set irrespective of domains
 			 * in update_cpu_capacity().
 			 *
-			 * This avoids capacity/capacity_orig from being 0 and
+			 * This avoids capacity from being 0 and
 			 * causing divide-by-zero issues on boot.
-			 *
-			 * Runtime updates will correct capacity_orig.
 			 */
 			if (unlikely(!rq->sd)) {
-				capacity_orig += capacity_of(cpu);
 				capacity += capacity_of(cpu);
 				continue;
 			}
 
 			sgc = rq->sd->groups->sgc;
-			capacity_orig += sgc->capacity_orig;
 			capacity += sgc->capacity;
 		}
 	} else {
@@ -6060,39 +6133,24 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 
 		group = child->groups;
 		do {
-			capacity_orig += group->sgc->capacity_orig;
 			capacity += group->sgc->capacity;
 			group = group->next;
 		} while (group != child->groups);
 	}
 
-	sdg->sgc->capacity_orig = capacity_orig;
 	sdg->sgc->capacity = capacity;
 }
 
 /*
- * Try and fix up capacity for tiny siblings, this is needed when
- * things like SD_ASYM_PACKING need f_b_g to select another sibling
- * which on its own isn't powerful enough.
- *
- * See update_sd_pick_busiest() and check_asym_packing().
+ * Check whether the capacity of the rq has been noticeably reduced by side
+ * activity. The imbalance_pct is used for the threshold.
+ * Return true is the capacity is reduced
  */
 static inline int
-fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
+check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
 {
-	/*
-	 * Only siblings can have significantly less than SCHED_CAPACITY_SCALE
-	 */
-	if (!(sd->flags & SD_SHARE_CPUCAPACITY))
-		return 0;
-
-	/*
-	 * If ~90% of the cpu_capacity is still there, we're good.
-	 */
-	if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29)
-		return 1;
-
-	return 0;
+	return ((rq->cpu_capacity * sd->imbalance_pct) <
+				(rq->cpu_capacity_orig * 100));
 }
 
 /*
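check_cpu_capacity() fires once the capacity left for CFS drops below 100/imbalance_pct of the original. A sketch with an assumed imbalance_pct of 117 (the real value comes from the sched_domain):

unsigned long cpu_capacity = 800, cpu_capacity_orig = 1024;	/* hypothetical */
unsigned int imbalance_pct = 117;
int reduced = cpu_capacity * imbalance_pct < cpu_capacity_orig * 100;
/* 93600 < 102400 -> true; with cpu_capacity = 900 (105300) it would be false */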
@@ -6130,37 +6188,56 @@ static inline int sg_imbalanced(struct sched_group *group)
 }
 
 /*
- * Compute the group capacity factor.
- *
- * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by
- * first dividing out the smt factor and computing the actual number of cores
- * and limit unit capacity with that.
+ * group_has_capacity returns true if the group has spare capacity that could
+ * be used by some tasks.
+ * We consider that a group has spare capacity if the * number of task is
+ * smaller than the number of CPUs or if the usage is lower than the available
+ * capacity for CFS tasks.
+ * For the latter, we use a threshold to stabilize the state, to take into
+ * account the variance of the tasks' load and to return true if the available
+ * capacity in meaningful for the load balancer.
+ * As an example, an available capacity of 1% can appear but it doesn't make
+ * any benefit for the load balance.
 */
-static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group)
+static inline bool
+group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
 {
-	unsigned int capacity_factor, smt, cpus;
-	unsigned int capacity, capacity_orig;
+	if (sgs->sum_nr_running < sgs->group_weight)
+		return true;
 
-	capacity = group->sgc->capacity;
-	capacity_orig = group->sgc->capacity_orig;
-	cpus = group->group_weight;
+	if ((sgs->group_capacity * 100) >
+			(sgs->group_usage * env->sd->imbalance_pct))
+		return true;
+
+	return false;
+}
 
-	/* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */
-	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);
-	capacity_factor = cpus / smt; /* cores */
+/*
+ * group_is_overloaded returns true if the group has more tasks than it can
+ * handle.
+ * group_is_overloaded is not equals to !group_has_capacity because a group
+ * with the exact right number of tasks, has no more spare capacity but is not
+ * overloaded so both group_has_capacity and group_is_overloaded return
+ * false.
+ */
+static inline bool
+group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
+{
+	if (sgs->sum_nr_running <= sgs->group_weight)
+		return false;
 
-	capacity_factor = min_t(unsigned,
-		capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE));
-	if (!capacity_factor)
-		capacity_factor = fix_small_capacity(env->sd, group);
+	if ((sgs->group_capacity * 100) <
+			(sgs->group_usage * env->sd->imbalance_pct))
+		return true;
 
-	return capacity_factor;
+	return false;
 }
 
-static enum group_type
-group_classify(struct sched_group *group, struct sg_lb_stats *sgs)
+static enum group_type group_classify(struct lb_env *env,
+		struct sched_group *group,
+		struct sg_lb_stats *sgs)
 {
-	if (sgs->sum_nr_running > sgs->group_capacity_factor)
+	if (sgs->group_no_capacity)
 		return group_overloaded;
 
 	if (sg_imbalanced(group))
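Both helpers apply the same imbalance_pct threshold from opposite sides, so a group with exactly the right number of tasks reports neither spare capacity nor overload. Illustrative numbers for a two-CPU group, with a hypothetical imbalance_pct of 117:

unsigned long group_capacity = 2048, group_usage = 1900;
unsigned int imbalance_pct = 117;
int has_spare  = group_capacity * 100 > group_usage * imbalance_pct;	/* 204800 > 222300: no */
int overloaded = group_capacity * 100 < group_usage * imbalance_pct;	/* yes, provided
					sum_nr_running > group_weight as well */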
@@ -6198,6 +6275,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 			load = source_load(i, load_idx);
 
 		sgs->group_load += load;
+		sgs->group_usage += get_cpu_usage(i);
 		sgs->sum_nr_running += rq->cfs.h_nr_running;
 
 		if (rq->nr_running > 1)
@@ -6220,11 +6298,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 
 	sgs->group_weight = group->group_weight;
-	sgs->group_capacity_factor = sg_capacity_factor(env, group);
-	sgs->group_type = group_classify(group, sgs);
 
-	if (sgs->group_capacity_factor > sgs->sum_nr_running)
-		sgs->group_has_free_capacity = 1;
+	sgs->group_no_capacity = group_is_overloaded(env, sgs);
+	sgs->group_type = group_classify(env, group, sgs);
 }
 
 /**
@@ -6346,18 +6422,19 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 
 		/*
 		 * In case the child domain prefers tasks go to siblings
-		 * first, lower the sg capacity factor to one so that we'll try
+		 * first, lower the sg capacity so that we'll try
 		 * and move all the excess tasks away. We lower the capacity
 		 * of a group only if the local group has the capacity to fit
-		 * these excess tasks, i.e. nr_running < group_capacity_factor. The
-		 * extra check prevents the case where you always pull from the
-		 * heaviest group when it is already under-utilized (possible
-		 * with a large weight task outweighs the tasks on the system).
+		 * these excess tasks. The extra check prevents the case where
+		 * you always pull from the heaviest group when it is already
+		 * under-utilized (possible with a large weight task outweighs
+		 * the tasks on the system).
 		 */
 		if (prefer_sibling && sds->local &&
-		    sds->local_stat.group_has_free_capacity) {
-			sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
-			sgs->group_type = group_classify(sg, sgs);
+		    group_has_capacity(env, &sds->local_stat) &&
+		    (sgs->sum_nr_running > 1)) {
+			sgs->group_no_capacity = 1;
+			sgs->group_type = group_overloaded;
 		}
 
 		if (update_sd_pick_busiest(env, sds, sg, sgs)) {
@@ -6537,11 +6614,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	 */
 	if (busiest->group_type == group_overloaded &&
 	    local->group_type == group_overloaded) {
-		load_above_capacity =
-			(busiest->sum_nr_running - busiest->group_capacity_factor);
-
-		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE);
-		load_above_capacity /= busiest->group_capacity;
+		load_above_capacity = busiest->sum_nr_running *
+					SCHED_LOAD_SCALE;
+		if (load_above_capacity > busiest->group_capacity)
+			load_above_capacity -= busiest->group_capacity;
+		else
+			load_above_capacity = ~0UL;
 	}
 
 	/*
@@ -6604,6 +6682,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	local = &sds.local_stat;
 	busiest = &sds.busiest_stat;
 
+	/* ASYM feature bypasses nice load balance check */
 	if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
 	    check_asym_packing(env, &sds))
 		return sds.busiest;
@@ -6624,8 +6703,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 		goto force_balance;
 
 	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
-	if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity &&
-	    !busiest->group_has_free_capacity)
+	if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
+	    busiest->group_no_capacity)
 		goto force_balance;
 
 	/*
@@ -6684,7 +6763,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 	int i;
 
 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
-		unsigned long capacity, capacity_factor, wl;
+		unsigned long capacity, wl;
 		enum fbq_type rt;
 
 		rq = cpu_rq(i);
@@ -6713,9 +6792,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 			continue;
 
 		capacity = capacity_of(i);
-		capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);
-		if (!capacity_factor)
-			capacity_factor = fix_small_capacity(env->sd, group);
 
 		wl = weighted_cpuload(i);
 
@@ -6723,7 +6799,9 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 		 * When comparing with imbalance, use weighted_cpuload()
 		 * which is not scaled with the cpu capacity.
 		 */
-		if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance)
+
+		if (rq->nr_running == 1 && wl > env->imbalance &&
+		    !check_cpu_capacity(rq, env->sd))
 			continue;
 
 		/*
@@ -6771,6 +6849,19 @@ static int need_active_balance(struct lb_env *env)
 			return 1;
 	}
 
+	/*
+	 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
+	 * It's worth migrating the task if the src_cpu's capacity is reduced
+	 * because of other sched_class or IRQs if more capacity stays
+	 * available on dst_cpu.
+	 */
+	if ((env->idle != CPU_NOT_IDLE) &&
+	    (env->src_rq->cfs.h_nr_running == 1)) {
+		if ((check_cpu_capacity(env->src_rq, sd)) &&
+		    (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
+			return 1;
+	}
+
 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
 }
 
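The new condition only triggers an active migration of a lone CFS task when the idle destination keeps meaningfully more capacity than the throttled source. A sketch with assumed capacities and a hypothetical imbalance_pct:

unsigned long src_cap = 700, dst_cap = 1024;	/* capacity_of(src)/capacity_of(dst) */
unsigned int imbalance_pct = 117;
int migrate = src_cap * imbalance_pct < dst_cap * 100;	/* 81900 < 102400 -> worth moving */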
@@ -6870,6 +6961,9 @@ redo:
 
 	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
 
+	env.src_cpu = busiest->cpu;
+	env.src_rq = busiest;
+
 	ld_moved = 0;
 	if (busiest->nr_running > 1) {
 		/*
@@ -6879,8 +6973,6 @@ redo:
 		 * correctly treated as an imbalance.
 		 */
 		env.flags |= LBF_ALL_PINNED;
-		env.src_cpu = busiest->cpu;
-		env.src_rq = busiest;
 		env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
 
 more_balance:
@@ -7580,22 +7672,25 @@ end:
 
 /*
  * Current heuristic for kicking the idle load balancer in the presence
- * of an idle cpu is the system.
+ * of an idle cpu in the system.
  *   - This rq has more than one task.
- *   - At any scheduler domain level, this cpu's scheduler group has multiple
- *     busy cpu's exceeding the group's capacity.
+ *   - This rq has at least one CFS task and the capacity of the CPU is
+ *     significantly reduced because of RT tasks or IRQs.
+ *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
+ *     multiple busy cpu.
  *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
  *     domain span are idle.
  */
-static inline int nohz_kick_needed(struct rq *rq)
+static inline bool nohz_kick_needed(struct rq *rq)
 {
 	unsigned long now = jiffies;
 	struct sched_domain *sd;
 	struct sched_group_capacity *sgc;
 	int nr_busy, cpu = rq->cpu;
+	bool kick = false;
 
 	if (unlikely(rq->idle_balance))
-		return 0;
+		return false;
 
 	/*
 	 * We may be recently in ticked or tickless idle mode. At the first
@@ -7609,38 +7704,46 @@ static inline int nohz_kick_needed(struct rq *rq)
 	 * balancing.
 	 */
 	if (likely(!atomic_read(&nohz.nr_cpus)))
-		return 0;
+		return false;
 
 	if (time_before(now, nohz.next_balance))
-		return 0;
+		return false;
 
 	if (rq->nr_running >= 2)
-		goto need_kick;
+		return true;
 
 	rcu_read_lock();
 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
-
 	if (sd) {
 		sgc = sd->groups->sgc;
 		nr_busy = atomic_read(&sgc->nr_busy_cpus);
 
-		if (nr_busy > 1)
-			goto need_kick_unlock;
+		if (nr_busy > 1) {
+			kick = true;
+			goto unlock;
+		}
+
 	}
 
-	sd = rcu_dereference(per_cpu(sd_asym, cpu));
+	sd = rcu_dereference(rq->sd);
+	if (sd) {
+		if ((rq->cfs.h_nr_running >= 1) &&
+				check_cpu_capacity(rq, sd)) {
+			kick = true;
+			goto unlock;
+		}
+	}
 
+	sd = rcu_dereference(per_cpu(sd_asym, cpu));
 	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
-				  sched_domain_span(sd)) < cpu))
-		goto need_kick_unlock;
-
-	rcu_read_unlock();
-	return 0;
+				  sched_domain_span(sd)) < cpu)) {
+		kick = true;
+		goto unlock;
+	}
 
-need_kick_unlock:
+unlock:
 	rcu_read_unlock();
-need_kick:
-	return 1;
+	return kick;
 }
 #else
 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
@@ -7656,14 +7759,16 @@ static void run_rebalance_domains(struct softirq_action *h)
 	enum cpu_idle_type idle = this_rq->idle_balance ?
 						CPU_IDLE : CPU_NOT_IDLE;
 
-	rebalance_domains(this_rq, idle);
-
 	/*
 	 * If this cpu has a pending nohz_balance_kick, then do the
 	 * balancing on behalf of the other idle cpus whose ticks are
-	 * stopped.
+	 * stopped. Do nohz_idle_balance *before* rebalance_domains to
+	 * give the idle cpus a chance to load balance. Else we may
+	 * load balance only within the local sched_domain hierarchy
+	 * and abort nohz_idle_balance altogether if we pull some load.
 	 */
 	nohz_idle_balance(this_rq, idle);
+	rebalance_domains(this_rq, idle);
 }
 
 /*
