aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--kernel/sched/fair.c775
1 files changed, 505 insertions, 270 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 039de34f1521..502e95a6e927 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -114,6 +114,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
114unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; 114unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
115#endif 115#endif
116 116
117/*
118 * The margin used when comparing utilization with CPU capacity:
119 * util * 1024 < capacity * margin
120 */
121unsigned int capacity_margin = 1280; /* ~20% */
122
117static inline void update_load_add(struct load_weight *lw, unsigned long inc) 123static inline void update_load_add(struct load_weight *lw, unsigned long inc)
118{ 124{
119 lw->weight += inc; 125 lw->weight += inc;
@@ -256,9 +262,7 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
256 262
257static inline struct task_struct *task_of(struct sched_entity *se) 263static inline struct task_struct *task_of(struct sched_entity *se)
258{ 264{
259#ifdef CONFIG_SCHED_DEBUG 265 SCHED_WARN_ON(!entity_is_task(se));
260 WARN_ON_ONCE(!entity_is_task(se));
261#endif
262 return container_of(se, struct task_struct, se); 266 return container_of(se, struct task_struct, se);
263} 267}
264 268
@@ -456,17 +460,23 @@ static inline int entity_before(struct sched_entity *a,
456 460
457static void update_min_vruntime(struct cfs_rq *cfs_rq) 461static void update_min_vruntime(struct cfs_rq *cfs_rq)
458{ 462{
463 struct sched_entity *curr = cfs_rq->curr;
464
459 u64 vruntime = cfs_rq->min_vruntime; 465 u64 vruntime = cfs_rq->min_vruntime;
460 466
461 if (cfs_rq->curr) 467 if (curr) {
462 vruntime = cfs_rq->curr->vruntime; 468 if (curr->on_rq)
469 vruntime = curr->vruntime;
470 else
471 curr = NULL;
472 }
463 473
464 if (cfs_rq->rb_leftmost) { 474 if (cfs_rq->rb_leftmost) {
465 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, 475 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
466 struct sched_entity, 476 struct sched_entity,
467 run_node); 477 run_node);
468 478
469 if (!cfs_rq->curr) 479 if (!curr)
470 vruntime = se->vruntime; 480 vruntime = se->vruntime;
471 else 481 else
472 vruntime = min_vruntime(vruntime, se->vruntime); 482 vruntime = min_vruntime(vruntime, se->vruntime);
@@ -656,7 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
656} 666}
657 667
658#ifdef CONFIG_SMP 668#ifdef CONFIG_SMP
659static int select_idle_sibling(struct task_struct *p, int cpu); 669static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
660static unsigned long task_h_load(struct task_struct *p); 670static unsigned long task_h_load(struct task_struct *p);
661 671
662/* 672/*
@@ -726,7 +736,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
726 struct sched_avg *sa = &se->avg; 736 struct sched_avg *sa = &se->avg;
727 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; 737 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
728 u64 now = cfs_rq_clock_task(cfs_rq); 738 u64 now = cfs_rq_clock_task(cfs_rq);
729 int tg_update;
730 739
731 if (cap > 0) { 740 if (cap > 0) {
732 if (cfs_rq->avg.util_avg != 0) { 741 if (cfs_rq->avg.util_avg != 0) {
@@ -759,10 +768,9 @@ void post_init_entity_util_avg(struct sched_entity *se)
759 } 768 }
760 } 769 }
761 770
762 tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); 771 update_cfs_rq_load_avg(now, cfs_rq, false);
763 attach_entity_load_avg(cfs_rq, se); 772 attach_entity_load_avg(cfs_rq, se);
764 if (tg_update) 773 update_tg_load_avg(cfs_rq, false);
765 update_tg_load_avg(cfs_rq, false);
766} 774}
767 775
768#else /* !CONFIG_SMP */ 776#else /* !CONFIG_SMP */
@@ -799,7 +807,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
799 max(delta_exec, curr->statistics.exec_max)); 807 max(delta_exec, curr->statistics.exec_max));
800 808
801 curr->sum_exec_runtime += delta_exec; 809 curr->sum_exec_runtime += delta_exec;
802 schedstat_add(cfs_rq, exec_clock, delta_exec); 810 schedstat_add(cfs_rq->exec_clock, delta_exec);
803 811
804 curr->vruntime += calc_delta_fair(delta_exec, curr); 812 curr->vruntime += calc_delta_fair(delta_exec, curr);
805 update_min_vruntime(cfs_rq); 813 update_min_vruntime(cfs_rq);
@@ -820,26 +828,34 @@ static void update_curr_fair(struct rq *rq)
820 update_curr(cfs_rq_of(&rq->curr->se)); 828 update_curr(cfs_rq_of(&rq->curr->se));
821} 829}
822 830
823#ifdef CONFIG_SCHEDSTATS
824static inline void 831static inline void
825update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 832update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
826{ 833{
827 u64 wait_start = rq_clock(rq_of(cfs_rq)); 834 u64 wait_start, prev_wait_start;
835
836 if (!schedstat_enabled())
837 return;
838
839 wait_start = rq_clock(rq_of(cfs_rq));
840 prev_wait_start = schedstat_val(se->statistics.wait_start);
828 841
829 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && 842 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
830 likely(wait_start > se->statistics.wait_start)) 843 likely(wait_start > prev_wait_start))
831 wait_start -= se->statistics.wait_start; 844 wait_start -= prev_wait_start;
832 845
833 se->statistics.wait_start = wait_start; 846 schedstat_set(se->statistics.wait_start, wait_start);
834} 847}
835 848
836static void 849static inline void
837update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) 850update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
838{ 851{
839 struct task_struct *p; 852 struct task_struct *p;
840 u64 delta; 853 u64 delta;
841 854
842 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; 855 if (!schedstat_enabled())
856 return;
857
858 delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
843 859
844 if (entity_is_task(se)) { 860 if (entity_is_task(se)) {
845 p = task_of(se); 861 p = task_of(se);
@@ -849,35 +865,114 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
849 * time stamp can be adjusted to accumulate wait time 865 * time stamp can be adjusted to accumulate wait time
850 * prior to migration. 866 * prior to migration.
851 */ 867 */
852 se->statistics.wait_start = delta; 868 schedstat_set(se->statistics.wait_start, delta);
853 return; 869 return;
854 } 870 }
855 trace_sched_stat_wait(p, delta); 871 trace_sched_stat_wait(p, delta);
856 } 872 }
857 873
858 se->statistics.wait_max = max(se->statistics.wait_max, delta); 874 schedstat_set(se->statistics.wait_max,
859 se->statistics.wait_count++; 875 max(schedstat_val(se->statistics.wait_max), delta));
860 se->statistics.wait_sum += delta; 876 schedstat_inc(se->statistics.wait_count);
861 se->statistics.wait_start = 0; 877 schedstat_add(se->statistics.wait_sum, delta);
878 schedstat_set(se->statistics.wait_start, 0);
879}
880
881static inline void
882update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
883{
884 struct task_struct *tsk = NULL;
885 u64 sleep_start, block_start;
886
887 if (!schedstat_enabled())
888 return;
889
890 sleep_start = schedstat_val(se->statistics.sleep_start);
891 block_start = schedstat_val(se->statistics.block_start);
892
893 if (entity_is_task(se))
894 tsk = task_of(se);
895
896 if (sleep_start) {
897 u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
898
899 if ((s64)delta < 0)
900 delta = 0;
901
902 if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
903 schedstat_set(se->statistics.sleep_max, delta);
904
905 schedstat_set(se->statistics.sleep_start, 0);
906 schedstat_add(se->statistics.sum_sleep_runtime, delta);
907
908 if (tsk) {
909 account_scheduler_latency(tsk, delta >> 10, 1);
910 trace_sched_stat_sleep(tsk, delta);
911 }
912 }
913 if (block_start) {
914 u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
915
916 if ((s64)delta < 0)
917 delta = 0;
918
919 if (unlikely(delta > schedstat_val(se->statistics.block_max)))
920 schedstat_set(se->statistics.block_max, delta);
921
922 schedstat_set(se->statistics.block_start, 0);
923 schedstat_add(se->statistics.sum_sleep_runtime, delta);
924
925 if (tsk) {
926 if (tsk->in_iowait) {
927 schedstat_add(se->statistics.iowait_sum, delta);
928 schedstat_inc(se->statistics.iowait_count);
929 trace_sched_stat_iowait(tsk, delta);
930 }
931
932 trace_sched_stat_blocked(tsk, delta);
933
934 /*
935 * Blocking time is in units of nanosecs, so shift by
936 * 20 to get a milliseconds-range estimation of the
937 * amount of time that the task spent sleeping:
938 */
939 if (unlikely(prof_on == SLEEP_PROFILING)) {
940 profile_hits(SLEEP_PROFILING,
941 (void *)get_wchan(tsk),
942 delta >> 20);
943 }
944 account_scheduler_latency(tsk, delta >> 10, 0);
945 }
946 }
862} 947}
863 948
864/* 949/*
865 * Task is being enqueued - update stats: 950 * Task is being enqueued - update stats:
866 */ 951 */
867static inline void 952static inline void
868update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 953update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
869{ 954{
955 if (!schedstat_enabled())
956 return;
957
870 /* 958 /*
871 * Are we enqueueing a waiting task? (for current tasks 959 * Are we enqueueing a waiting task? (for current tasks
872 * a dequeue/enqueue event is a NOP) 960 * a dequeue/enqueue event is a NOP)
873 */ 961 */
874 if (se != cfs_rq->curr) 962 if (se != cfs_rq->curr)
875 update_stats_wait_start(cfs_rq, se); 963 update_stats_wait_start(cfs_rq, se);
964
965 if (flags & ENQUEUE_WAKEUP)
966 update_stats_enqueue_sleeper(cfs_rq, se);
876} 967}
877 968
878static inline void 969static inline void
879update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 970update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
880{ 971{
972
973 if (!schedstat_enabled())
974 return;
975
881 /* 976 /*
882 * Mark the end of the wait period if dequeueing a 977 * Mark the end of the wait period if dequeueing a
883 * waiting task: 978 * waiting task:
@@ -885,40 +980,18 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
885 if (se != cfs_rq->curr) 980 if (se != cfs_rq->curr)
886 update_stats_wait_end(cfs_rq, se); 981 update_stats_wait_end(cfs_rq, se);
887 982
888 if (flags & DEQUEUE_SLEEP) { 983 if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
889 if (entity_is_task(se)) { 984 struct task_struct *tsk = task_of(se);
890 struct task_struct *tsk = task_of(se);
891 985
892 if (tsk->state & TASK_INTERRUPTIBLE) 986 if (tsk->state & TASK_INTERRUPTIBLE)
893 se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); 987 schedstat_set(se->statistics.sleep_start,
894 if (tsk->state & TASK_UNINTERRUPTIBLE) 988 rq_clock(rq_of(cfs_rq)));
895 se->statistics.block_start = rq_clock(rq_of(cfs_rq)); 989 if (tsk->state & TASK_UNINTERRUPTIBLE)
896 } 990 schedstat_set(se->statistics.block_start,
991 rq_clock(rq_of(cfs_rq)));
897 } 992 }
898
899}
900#else
901static inline void
902update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
903{
904}
905
906static inline void
907update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
908{
909} 993}
910 994
911static inline void
912update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
913{
914}
915
916static inline void
917update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
918{
919}
920#endif
921
922/* 995/*
923 * We are picking a new current task - update its stats: 996 * We are picking a new current task - update its stats:
924 */ 997 */
@@ -1513,8 +1586,16 @@ balance:
1513 * One idle CPU per node is evaluated for a task numa move. 1586 * One idle CPU per node is evaluated for a task numa move.
1514 * Call select_idle_sibling to maybe find a better one. 1587 * Call select_idle_sibling to maybe find a better one.
1515 */ 1588 */
1516 if (!cur) 1589 if (!cur) {
1517 env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); 1590 /*
1591 * select_idle_siblings() uses an per-cpu cpumask that
1592 * can be used from IRQ context.
1593 */
1594 local_irq_disable();
1595 env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1596 env->dst_cpu);
1597 local_irq_enable();
1598 }
1518 1599
1519assign: 1600assign:
1520 task_numa_assign(env, cur, imp); 1601 task_numa_assign(env, cur, imp);
@@ -2292,7 +2373,7 @@ void task_numa_work(struct callback_head *work)
2292 unsigned long nr_pte_updates = 0; 2373 unsigned long nr_pte_updates = 0;
2293 long pages, virtpages; 2374 long pages, virtpages;
2294 2375
2295 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); 2376 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
2296 2377
2297 work->next = work; /* protect against double add */ 2378 work->next = work; /* protect against double add */
2298 /* 2379 /*
@@ -2803,9 +2884,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2803} 2884}
2804 2885
2805#ifdef CONFIG_FAIR_GROUP_SCHED 2886#ifdef CONFIG_FAIR_GROUP_SCHED
2806/* 2887/**
2807 * Updating tg's load_avg is necessary before update_cfs_share (which is done) 2888 * update_tg_load_avg - update the tg's load avg
2808 * and effective_load (which is not done because it is too costly). 2889 * @cfs_rq: the cfs_rq whose avg changed
2890 * @force: update regardless of how small the difference
2891 *
2892 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
2893 * However, because tg->load_avg is a global value there are performance
2894 * considerations.
2895 *
2896 * In order to avoid having to look at the other cfs_rq's, we use a
2897 * differential update where we store the last value we propagated. This in
2898 * turn allows skipping updates if the differential is 'small'.
2899 *
2900 * Updating tg's load_avg is necessary before update_cfs_share() (which is
2901 * done) and effective_load() (which is not done because it is too costly).
2809 */ 2902 */
2810static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) 2903static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
2811{ 2904{
@@ -2875,12 +2968,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
2875 2968
2876static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) 2969static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
2877{ 2970{
2878 struct rq *rq = rq_of(cfs_rq); 2971 if (&this_rq()->cfs == cfs_rq) {
2879 int cpu = cpu_of(rq);
2880
2881 if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
2882 unsigned long max = rq->cpu_capacity_orig;
2883
2884 /* 2972 /*
2885 * There are a few boundary cases this might miss but it should 2973 * There are a few boundary cases this might miss but it should
2886 * get called often enough that that should (hopefully) not be 2974 * get called often enough that that should (hopefully) not be
@@ -2897,8 +2985,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
2897 * 2985 *
2898 * See cpu_util(). 2986 * See cpu_util().
2899 */ 2987 */
2900 cpufreq_update_util(rq_clock(rq), 2988 cpufreq_update_util(rq_of(cfs_rq), 0);
2901 min(cfs_rq->avg.util_avg, max), max);
2902 } 2989 }
2903} 2990}
2904 2991
@@ -2931,10 +3018,10 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
2931 * 3018 *
2932 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. 3019 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
2933 * 3020 *
2934 * Returns true if the load decayed or we removed utilization. It is expected 3021 * Returns true if the load decayed or we removed load.
2935 * that one calls update_tg_load_avg() on this condition, but after you've 3022 *
2936 * modified the cfs_rq avg (attach/detach), such that we propagate the new 3023 * Since both these conditions indicate a changed cfs_rq->avg.load we should
2937 * avg up. 3024 * call update_tg_load_avg() when this function returns true.
2938 */ 3025 */
2939static inline int 3026static inline int
2940update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) 3027update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
@@ -3159,10 +3246,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
3159 3246
3160static inline void update_load_avg(struct sched_entity *se, int not_used) 3247static inline void update_load_avg(struct sched_entity *se, int not_used)
3161{ 3248{
3162 struct cfs_rq *cfs_rq = cfs_rq_of(se); 3249 cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
3163 struct rq *rq = rq_of(cfs_rq);
3164
3165 cpufreq_trigger_update(rq_clock(rq));
3166} 3250}
3167 3251
3168static inline void 3252static inline void
@@ -3183,68 +3267,6 @@ static inline int idle_balance(struct rq *rq)
3183 3267
3184#endif /* CONFIG_SMP */ 3268#endif /* CONFIG_SMP */
3185 3269
3186static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
3187{
3188#ifdef CONFIG_SCHEDSTATS
3189 struct task_struct *tsk = NULL;
3190
3191 if (entity_is_task(se))
3192 tsk = task_of(se);
3193
3194 if (se->statistics.sleep_start) {
3195 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
3196
3197 if ((s64)delta < 0)
3198 delta = 0;
3199
3200 if (unlikely(delta > se->statistics.sleep_max))
3201 se->statistics.sleep_max = delta;
3202
3203 se->statistics.sleep_start = 0;
3204 se->statistics.sum_sleep_runtime += delta;
3205
3206 if (tsk) {
3207 account_scheduler_latency(tsk, delta >> 10, 1);
3208 trace_sched_stat_sleep(tsk, delta);
3209 }
3210 }
3211 if (se->statistics.block_start) {
3212 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
3213
3214 if ((s64)delta < 0)
3215 delta = 0;
3216
3217 if (unlikely(delta > se->statistics.block_max))
3218 se->statistics.block_max = delta;
3219
3220 se->statistics.block_start = 0;
3221 se->statistics.sum_sleep_runtime += delta;
3222
3223 if (tsk) {
3224 if (tsk->in_iowait) {
3225 se->statistics.iowait_sum += delta;
3226 se->statistics.iowait_count++;
3227 trace_sched_stat_iowait(tsk, delta);
3228 }
3229
3230 trace_sched_stat_blocked(tsk, delta);
3231
3232 /*
3233 * Blocking time is in units of nanosecs, so shift by
3234 * 20 to get a milliseconds-range estimation of the
3235 * amount of time that the task spent sleeping:
3236 */
3237 if (unlikely(prof_on == SLEEP_PROFILING)) {
3238 profile_hits(SLEEP_PROFILING,
3239 (void *)get_wchan(tsk),
3240 delta >> 20);
3241 }
3242 account_scheduler_latency(tsk, delta >> 10, 0);
3243 }
3244 }
3245#endif
3246}
3247
3248static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) 3270static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3249{ 3271{
3250#ifdef CONFIG_SCHED_DEBUG 3272#ifdef CONFIG_SCHED_DEBUG
@@ -3254,7 +3276,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3254 d = -d; 3276 d = -d;
3255 3277
3256 if (d > 3*sysctl_sched_latency) 3278 if (d > 3*sysctl_sched_latency)
3257 schedstat_inc(cfs_rq, nr_spread_over); 3279 schedstat_inc(cfs_rq->nr_spread_over);
3258#endif 3280#endif
3259} 3281}
3260 3282
@@ -3371,17 +3393,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3371 account_entity_enqueue(cfs_rq, se); 3393 account_entity_enqueue(cfs_rq, se);
3372 update_cfs_shares(cfs_rq); 3394 update_cfs_shares(cfs_rq);
3373 3395
3374 if (flags & ENQUEUE_WAKEUP) { 3396 if (flags & ENQUEUE_WAKEUP)
3375 place_entity(cfs_rq, se, 0); 3397 place_entity(cfs_rq, se, 0);
3376 if (schedstat_enabled())
3377 enqueue_sleeper(cfs_rq, se);
3378 }
3379 3398
3380 check_schedstat_required(); 3399 check_schedstat_required();
3381 if (schedstat_enabled()) { 3400 update_stats_enqueue(cfs_rq, se, flags);
3382 update_stats_enqueue(cfs_rq, se); 3401 check_spread(cfs_rq, se);
3383 check_spread(cfs_rq, se);
3384 }
3385 if (!curr) 3402 if (!curr)
3386 __enqueue_entity(cfs_rq, se); 3403 __enqueue_entity(cfs_rq, se);
3387 se->on_rq = 1; 3404 se->on_rq = 1;
@@ -3448,8 +3465,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3448 update_curr(cfs_rq); 3465 update_curr(cfs_rq);
3449 dequeue_entity_load_avg(cfs_rq, se); 3466 dequeue_entity_load_avg(cfs_rq, se);
3450 3467
3451 if (schedstat_enabled()) 3468 update_stats_dequeue(cfs_rq, se, flags);
3452 update_stats_dequeue(cfs_rq, se, flags);
3453 3469
3454 clear_buddies(cfs_rq, se); 3470 clear_buddies(cfs_rq, se);
3455 3471
@@ -3459,9 +3475,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3459 account_entity_dequeue(cfs_rq, se); 3475 account_entity_dequeue(cfs_rq, se);
3460 3476
3461 /* 3477 /*
3462 * Normalize the entity after updating the min_vruntime because the 3478 * Normalize after update_curr(); which will also have moved
3463 * update can refer to the ->curr item and we need to reflect this 3479 * min_vruntime if @se is the one holding it back. But before doing
3464 * movement in our normalized position. 3480 * update_min_vruntime() again, which will discount @se's position and
3481 * can move min_vruntime forward still more.
3465 */ 3482 */
3466 if (!(flags & DEQUEUE_SLEEP)) 3483 if (!(flags & DEQUEUE_SLEEP))
3467 se->vruntime -= cfs_rq->min_vruntime; 3484 se->vruntime -= cfs_rq->min_vruntime;
@@ -3469,8 +3486,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3469 /* return excess runtime on last dequeue */ 3486 /* return excess runtime on last dequeue */
3470 return_cfs_rq_runtime(cfs_rq); 3487 return_cfs_rq_runtime(cfs_rq);
3471 3488
3472 update_min_vruntime(cfs_rq);
3473 update_cfs_shares(cfs_rq); 3489 update_cfs_shares(cfs_rq);
3490
3491 /*
3492 * Now advance min_vruntime if @se was the entity holding it back,
3493 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
3494 * put back on, and if we advance min_vruntime, we'll be placed back
3495 * further than we started -- ie. we'll be penalized.
3496 */
3497 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
3498 update_min_vruntime(cfs_rq);
3474} 3499}
3475 3500
3476/* 3501/*
@@ -3523,25 +3548,25 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
3523 * a CPU. So account for the time it spent waiting on the 3548 * a CPU. So account for the time it spent waiting on the
3524 * runqueue. 3549 * runqueue.
3525 */ 3550 */
3526 if (schedstat_enabled()) 3551 update_stats_wait_end(cfs_rq, se);
3527 update_stats_wait_end(cfs_rq, se);
3528 __dequeue_entity(cfs_rq, se); 3552 __dequeue_entity(cfs_rq, se);
3529 update_load_avg(se, 1); 3553 update_load_avg(se, 1);
3530 } 3554 }
3531 3555
3532 update_stats_curr_start(cfs_rq, se); 3556 update_stats_curr_start(cfs_rq, se);
3533 cfs_rq->curr = se; 3557 cfs_rq->curr = se;
3534#ifdef CONFIG_SCHEDSTATS 3558
3535 /* 3559 /*
3536 * Track our maximum slice length, if the CPU's load is at 3560 * Track our maximum slice length, if the CPU's load is at
3537 * least twice that of our own weight (i.e. dont track it 3561 * least twice that of our own weight (i.e. dont track it
3538 * when there are only lesser-weight tasks around): 3562 * when there are only lesser-weight tasks around):
3539 */ 3563 */
3540 if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { 3564 if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
3541 se->statistics.slice_max = max(se->statistics.slice_max, 3565 schedstat_set(se->statistics.slice_max,
3542 se->sum_exec_runtime - se->prev_sum_exec_runtime); 3566 max((u64)schedstat_val(se->statistics.slice_max),
3567 se->sum_exec_runtime - se->prev_sum_exec_runtime));
3543 } 3568 }
3544#endif 3569
3545 se->prev_sum_exec_runtime = se->sum_exec_runtime; 3570 se->prev_sum_exec_runtime = se->sum_exec_runtime;
3546} 3571}
3547 3572
@@ -3620,13 +3645,10 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
3620 /* throttle cfs_rqs exceeding runtime */ 3645 /* throttle cfs_rqs exceeding runtime */
3621 check_cfs_rq_runtime(cfs_rq); 3646 check_cfs_rq_runtime(cfs_rq);
3622 3647
3623 if (schedstat_enabled()) { 3648 check_spread(cfs_rq, prev);
3624 check_spread(cfs_rq, prev);
3625 if (prev->on_rq)
3626 update_stats_wait_start(cfs_rq, prev);
3627 }
3628 3649
3629 if (prev->on_rq) { 3650 if (prev->on_rq) {
3651 update_stats_wait_start(cfs_rq, prev);
3630 /* Put 'current' back into the tree. */ 3652 /* Put 'current' back into the tree. */
3631 __enqueue_entity(cfs_rq, prev); 3653 __enqueue_entity(cfs_rq, prev);
3632 /* in !on_rq case, update occurred at dequeue */ 3654 /* in !on_rq case, update occurred at dequeue */
@@ -4456,9 +4478,9 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
4456 struct sched_entity *se = &p->se; 4478 struct sched_entity *se = &p->se;
4457 struct cfs_rq *cfs_rq = cfs_rq_of(se); 4479 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4458 4480
4459 WARN_ON(task_rq(p) != rq); 4481 SCHED_WARN_ON(task_rq(p) != rq);
4460 4482
4461 if (cfs_rq->nr_running > 1) { 4483 if (rq->cfs.h_nr_running > 1) {
4462 u64 slice = sched_slice(cfs_rq, se); 4484 u64 slice = sched_slice(cfs_rq, se);
4463 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; 4485 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
4464 s64 delta = slice - ran; 4486 s64 delta = slice - ran;
@@ -4509,6 +4531,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4509 struct cfs_rq *cfs_rq; 4531 struct cfs_rq *cfs_rq;
4510 struct sched_entity *se = &p->se; 4532 struct sched_entity *se = &p->se;
4511 4533
4534 /*
4535 * If in_iowait is set, the code below may not trigger any cpufreq
4536 * utilization updates, so do it here explicitly with the IOWAIT flag
4537 * passed.
4538 */
4539 if (p->in_iowait)
4540 cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
4541
4512 for_each_sched_entity(se) { 4542 for_each_sched_entity(se) {
4513 if (se->on_rq) 4543 if (se->on_rq)
4514 break; 4544 break;
@@ -4605,6 +4635,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4605} 4635}
4606 4636
4607#ifdef CONFIG_SMP 4637#ifdef CONFIG_SMP
4638
4639/* Working cpumask for: load_balance, load_balance_newidle. */
4640DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
4641DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
4642
4608#ifdef CONFIG_NO_HZ_COMMON 4643#ifdef CONFIG_NO_HZ_COMMON
4609/* 4644/*
4610 * per rq 'load' arrray crap; XXX kill this. 4645 * per rq 'load' arrray crap; XXX kill this.
@@ -5006,9 +5041,9 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
5006 * wl = S * s'_i; see (2) 5041 * wl = S * s'_i; see (2)
5007 */ 5042 */
5008 if (W > 0 && w < W) 5043 if (W > 0 && w < W)
5009 wl = (w * (long)tg->shares) / W; 5044 wl = (w * (long)scale_load_down(tg->shares)) / W;
5010 else 5045 else
5011 wl = tg->shares; 5046 wl = scale_load_down(tg->shares);
5012 5047
5013 /* 5048 /*
5014 * Per the above, wl is the new se->load.weight value; since 5049 * Per the above, wl is the new se->load.weight value; since
@@ -5091,18 +5126,18 @@ static int wake_wide(struct task_struct *p)
5091 return 1; 5126 return 1;
5092} 5127}
5093 5128
5094static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 5129static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5130 int prev_cpu, int sync)
5095{ 5131{
5096 s64 this_load, load; 5132 s64 this_load, load;
5097 s64 this_eff_load, prev_eff_load; 5133 s64 this_eff_load, prev_eff_load;
5098 int idx, this_cpu, prev_cpu; 5134 int idx, this_cpu;
5099 struct task_group *tg; 5135 struct task_group *tg;
5100 unsigned long weight; 5136 unsigned long weight;
5101 int balanced; 5137 int balanced;
5102 5138
5103 idx = sd->wake_idx; 5139 idx = sd->wake_idx;
5104 this_cpu = smp_processor_id(); 5140 this_cpu = smp_processor_id();
5105 prev_cpu = task_cpu(p);
5106 load = source_load(prev_cpu, idx); 5141 load = source_load(prev_cpu, idx);
5107 this_load = target_load(this_cpu, idx); 5142 this_load = target_load(this_cpu, idx);
5108 5143
@@ -5146,13 +5181,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
5146 5181
5147 balanced = this_eff_load <= prev_eff_load; 5182 balanced = this_eff_load <= prev_eff_load;
5148 5183
5149 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); 5184 schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
5150 5185
5151 if (!balanced) 5186 if (!balanced)
5152 return 0; 5187 return 0;
5153 5188
5154 schedstat_inc(sd, ttwu_move_affine); 5189 schedstat_inc(sd->ttwu_move_affine);
5155 schedstat_inc(p, se.statistics.nr_wakeups_affine); 5190 schedstat_inc(p->se.statistics.nr_wakeups_affine);
5156 5191
5157 return 1; 5192 return 1;
5158} 5193}
@@ -5228,6 +5263,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5228 int shallowest_idle_cpu = -1; 5263 int shallowest_idle_cpu = -1;
5229 int i; 5264 int i;
5230 5265
5266 /* Check if we have any choice: */
5267 if (group->group_weight == 1)
5268 return cpumask_first(sched_group_cpus(group));
5269
5231 /* Traverse only the allowed CPUs */ 5270 /* Traverse only the allowed CPUs */
5232 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { 5271 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
5233 if (idle_cpu(i)) { 5272 if (idle_cpu(i)) {
@@ -5265,64 +5304,237 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5265} 5304}
5266 5305
5267/* 5306/*
5268 * Try and locate an idle CPU in the sched_domain. 5307 * Implement a for_each_cpu() variant that starts the scan at a given cpu
5308 * (@start), and wraps around.
5309 *
5310 * This is used to scan for idle CPUs; such that not all CPUs looking for an
5311 * idle CPU find the same CPU. The down-side is that tasks tend to cycle
5312 * through the LLC domain.
5313 *
5314 * Especially tbench is found sensitive to this.
5315 */
5316
5317static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
5318{
5319 int next;
5320
5321again:
5322 next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
5323
5324 if (*wrapped) {
5325 if (next >= start)
5326 return nr_cpumask_bits;
5327 } else {
5328 if (next >= nr_cpumask_bits) {
5329 *wrapped = 1;
5330 n = -1;
5331 goto again;
5332 }
5333 }
5334
5335 return next;
5336}
5337
/*
 * for_each_cpu_wrap - iterate over the set bits of @mask beginning at @start
 * and wrapping around past the end, stopping before @start is reached again.
 *
 * @cpu:   int lvalue receiving the current bit number
 * @mask:  cpumask pointer to walk
 * @start: bit number at which the scan begins
 * @wrap:  int lvalue used as scratch state by cpumask_next_wrap()
 */
#define for_each_cpu_wrap(cpu, mask, start, wrap)				\
	for ((wrap) = 0, (cpu) = (start) - 1;					\
	     ((cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap))) <	\
	     nr_cpumask_bits;							\
	     )
5342
5343#ifdef CONFIG_SCHED_SMT
5344
5345static inline void set_idle_cores(int cpu, int val)
5346{
5347 struct sched_domain_shared *sds;
5348
5349 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5350 if (sds)
5351 WRITE_ONCE(sds->has_idle_cores, val);
5352}
5353
5354static inline bool test_idle_cores(int cpu, bool def)
5355{
5356 struct sched_domain_shared *sds;
5357
5358 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5359 if (sds)
5360 return READ_ONCE(sds->has_idle_cores);
5361
5362 return def;
5363}
5364
5365/*
5366 * Scans the local SMT mask to see if the entire core is idle, and records this
5367 * information in sd_llc_shared->has_idle_cores.
5368 *
5369 * Since SMT siblings share all cache levels, inspecting this limited remote
5370 * state should be fairly cheap.
5371 */
5372void __update_idle_core(struct rq *rq)
5373{
5374 int core = cpu_of(rq);
5375 int cpu;
5376
5377 rcu_read_lock();
5378 if (test_idle_cores(core, true))
5379 goto unlock;
5380
5381 for_each_cpu(cpu, cpu_smt_mask(core)) {
5382 if (cpu == core)
5383 continue;
5384
5385 if (!idle_cpu(cpu))
5386 goto unlock;
5387 }
5388
5389 set_idle_cores(core, 1);
5390unlock:
5391 rcu_read_unlock();
5392}
5393
5394/*
5395 * Scan the entire LLC domain for idle cores; this dynamically switches off if
5396 * there are no idle cores left in the system; tracked through
5397 * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
5398 */
5399static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
5400{
5401 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
5402 int core, cpu, wrap;
5403
5404 if (!static_branch_likely(&sched_smt_present))
5405 return -1;
5406
5407 if (!test_idle_cores(target, false))
5408 return -1;
5409
5410 cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
5411
5412 for_each_cpu_wrap(core, cpus, target, wrap) {
5413 bool idle = true;
5414
5415 for_each_cpu(cpu, cpu_smt_mask(core)) {
5416 cpumask_clear_cpu(cpu, cpus);
5417 if (!idle_cpu(cpu))
5418 idle = false;
5419 }
5420
5421 if (idle)
5422 return core;
5423 }
5424
5425 /*
5426 * Failed to find an idle core; stop looking for one.
5427 */
5428 set_idle_cores(target, 0);
5429
5430 return -1;
5431}
5432
5433/*
5434 * Scan the local SMT mask for idle CPUs.
5269 */ 5435 */
5270static int select_idle_sibling(struct task_struct *p, int target) 5436static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
5437{
5438 int cpu;
5439
5440 if (!static_branch_likely(&sched_smt_present))
5441 return -1;
5442
5443 for_each_cpu(cpu, cpu_smt_mask(target)) {
5444 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
5445 continue;
5446 if (idle_cpu(cpu))
5447 return cpu;
5448 }
5449
5450 return -1;
5451}
5452
5453#else /* CONFIG_SCHED_SMT */
5454
/* Without CONFIG_SCHED_SMT there are no cores to scan: always report -1. */
static inline int
select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
{
	return -1;
}
5459
/* Without CONFIG_SCHED_SMT there are no siblings to scan: always report -1. */
static inline int
select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
	return -1;
}
5464
5465#endif /* CONFIG_SCHED_SMT */
5466
5467/*
5468 * Scan the LLC domain for idle CPUs; this is dynamically regulated by
5469 * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
5470 * average idle time for this rq (as found in rq->avg_idle).
5471 */
5472static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
5473{
5474 struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
5475 u64 avg_idle = this_rq()->avg_idle;
5476 u64 avg_cost = this_sd->avg_scan_cost;
5477 u64 time, cost;
5478 s64 delta;
5479 int cpu, wrap;
5480
5481 /*
5482 * Due to large variance we need a large fuzz factor; hackbench in
5483 * particularly is sensitive here.
5484 */
5485 if ((avg_idle / 512) < avg_cost)
5486 return -1;
5487
5488 time = local_clock();
5489
5490 for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
5491 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
5492 continue;
5493 if (idle_cpu(cpu))
5494 break;
5495 }
5496
5497 time = local_clock() - time;
5498 cost = this_sd->avg_scan_cost;
5499 delta = (s64)(time - cost) / 8;
5500 this_sd->avg_scan_cost += delta;
5501
5502 return cpu;
5503}
5504
5505/*
5506 * Try and locate an idle core/thread in the LLC cache domain.
5507 */
5508static int select_idle_sibling(struct task_struct *p, int prev, int target)
5271{ 5509{
5272 struct sched_domain *sd; 5510 struct sched_domain *sd;
5273 struct sched_group *sg; 5511 int i;
5274 int i = task_cpu(p);
5275 5512
5276 if (idle_cpu(target)) 5513 if (idle_cpu(target))
5277 return target; 5514 return target;
5278 5515
5279 /* 5516 /*
5280 * If the prevous cpu is cache affine and idle, don't be stupid. 5517 * If the previous cpu is cache affine and idle, don't be stupid.
5281 */ 5518 */
5282 if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) 5519 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
5283 return i; 5520 return prev;
5284 5521
5285 /*
5286 * Otherwise, iterate the domains and find an eligible idle cpu.
5287 *
5288 * A completely idle sched group at higher domains is more
5289 * desirable than an idle group at a lower level, because lower
5290 * domains have smaller groups and usually share hardware
5291 * resources which causes tasks to contend on them, e.g. x86
5292 * hyperthread siblings in the lowest domain (SMT) can contend
5293 * on the shared cpu pipeline.
5294 *
5295 * However, while we prefer idle groups at higher domains
5296 * finding an idle cpu at the lowest domain is still better than
5297 * returning 'target', which we've already established, isn't
5298 * idle.
5299 */
5300 sd = rcu_dereference(per_cpu(sd_llc, target)); 5522 sd = rcu_dereference(per_cpu(sd_llc, target));
5301 for_each_lower_domain(sd) { 5523 if (!sd)
5302 sg = sd->groups; 5524 return target;
5303 do { 5525
5304 if (!cpumask_intersects(sched_group_cpus(sg), 5526 i = select_idle_core(p, sd, target);
5305 tsk_cpus_allowed(p))) 5527 if ((unsigned)i < nr_cpumask_bits)
5306 goto next; 5528 return i;
5307 5529
5308 /* Ensure the entire group is idle */ 5530 i = select_idle_cpu(p, sd, target);
5309 for_each_cpu(i, sched_group_cpus(sg)) { 5531 if ((unsigned)i < nr_cpumask_bits)
5310 if (i == target || !idle_cpu(i)) 5532 return i;
5311 goto next; 5533
5312 } 5534 i = select_idle_smt(p, sd, target);
5535 if ((unsigned)i < nr_cpumask_bits)
5536 return i;
5313 5537
5314 /*
5315 * It doesn't matter which cpu we pick, the
5316 * whole group is idle.
5317 */
5318 target = cpumask_first_and(sched_group_cpus(sg),
5319 tsk_cpus_allowed(p));
5320 goto done;
5321next:
5322 sg = sg->next;
5323 } while (sg != sd->groups);
5324 }
5325done:
5326 return target; 5538 return target;
5327} 5539}
5328 5540
@@ -5360,6 +5572,32 @@ static int cpu_util(int cpu)
5360 return (util >= capacity) ? capacity : util; 5572 return (util >= capacity) ? capacity : util;
5361} 5573}
5362 5574
5575static inline int task_util(struct task_struct *p)
5576{
5577 return p->se.avg.util_avg;
5578}
5579
5580/*
5581 * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
5582 * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
5583 *
5584 * In that case WAKE_AFFINE doesn't make sense and we'll let
5585 * BALANCE_WAKE sort things out.
5586 */
5587static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
5588{
5589 long min_cap, max_cap;
5590
5591 min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
5592 max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
5593
5594 /* Minimum capacity is close to max, no need to abort wake_affine */
5595 if (max_cap - min_cap < max_cap >> 3)
5596 return 0;
5597
5598 return min_cap * 1024 < task_util(p) * capacity_margin;
5599}
5600
5363/* 5601/*
5364 * select_task_rq_fair: Select target runqueue for the waking task in domains 5602 * select_task_rq_fair: Select target runqueue for the waking task in domains
5365 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, 5603 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -5383,7 +5621,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
5383 5621
5384 if (sd_flag & SD_BALANCE_WAKE) { 5622 if (sd_flag & SD_BALANCE_WAKE) {
5385 record_wakee(p); 5623 record_wakee(p);
5386 want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); 5624 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
5625 && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
5387 } 5626 }
5388 5627
5389 rcu_read_lock(); 5628 rcu_read_lock();
@@ -5409,13 +5648,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
5409 5648
5410 if (affine_sd) { 5649 if (affine_sd) {
5411 sd = NULL; /* Prefer wake_affine over balance flags */ 5650 sd = NULL; /* Prefer wake_affine over balance flags */
5412 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) 5651 if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
5413 new_cpu = cpu; 5652 new_cpu = cpu;
5414 } 5653 }
5415 5654
5416 if (!sd) { 5655 if (!sd) {
5417 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ 5656 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
5418 new_cpu = select_idle_sibling(p, new_cpu); 5657 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
5419 5658
5420 } else while (sd) { 5659 } else while (sd) {
5421 struct sched_group *group; 5660 struct sched_group *group;
@@ -5939,7 +6178,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
5939 * 6178 *
5940 * The adjacency matrix of the resulting graph is given by: 6179 * The adjacency matrix of the resulting graph is given by:
5941 * 6180 *
5942 * log_2 n 6181 * log_2 n
5943 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) 6182 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
5944 * k = 0 6183 * k = 0
5945 * 6184 *
@@ -5985,7 +6224,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
5985 * 6224 *
5986 * [XXX write more on how we solve this.. _after_ merging pjt's patches that 6225 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
5987 * rewrite all of this once again.] 6226 * rewrite all of this once again.]
5988 */ 6227 */
5989 6228
5990static unsigned long __read_mostly max_load_balance_interval = HZ/10; 6229static unsigned long __read_mostly max_load_balance_interval = HZ/10;
5991 6230
@@ -6133,7 +6372,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
6133 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { 6372 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
6134 int cpu; 6373 int cpu;
6135 6374
6136 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 6375 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
6137 6376
6138 env->flags |= LBF_SOME_PINNED; 6377 env->flags |= LBF_SOME_PINNED;
6139 6378
@@ -6164,7 +6403,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
6164 env->flags &= ~LBF_ALL_PINNED; 6403 env->flags &= ~LBF_ALL_PINNED;
6165 6404
6166 if (task_running(env->src_rq, p)) { 6405 if (task_running(env->src_rq, p)) {
6167 schedstat_inc(p, se.statistics.nr_failed_migrations_running); 6406 schedstat_inc(p->se.statistics.nr_failed_migrations_running);
6168 return 0; 6407 return 0;
6169 } 6408 }
6170 6409
@@ -6181,13 +6420,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
6181 if (tsk_cache_hot <= 0 || 6420 if (tsk_cache_hot <= 0 ||
6182 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 6421 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
6183 if (tsk_cache_hot == 1) { 6422 if (tsk_cache_hot == 1) {
6184 schedstat_inc(env->sd, lb_hot_gained[env->idle]); 6423 schedstat_inc(env->sd->lb_hot_gained[env->idle]);
6185 schedstat_inc(p, se.statistics.nr_forced_migrations); 6424 schedstat_inc(p->se.statistics.nr_forced_migrations);
6186 } 6425 }
6187 return 1; 6426 return 1;
6188 } 6427 }
6189 6428
6190 schedstat_inc(p, se.statistics.nr_failed_migrations_hot); 6429 schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
6191 return 0; 6430 return 0;
6192} 6431}
6193 6432
@@ -6227,7 +6466,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
6227 * so we can safely collect stats here rather than 6466 * so we can safely collect stats here rather than
6228 * inside detach_tasks(). 6467 * inside detach_tasks().
6229 */ 6468 */
6230 schedstat_inc(env->sd, lb_gained[env->idle]); 6469 schedstat_inc(env->sd->lb_gained[env->idle]);
6231 return p; 6470 return p;
6232 } 6471 }
6233 return NULL; 6472 return NULL;
@@ -6319,7 +6558,7 @@ next:
6319 * so we can safely collect detach_one_task() stats here rather 6558 * so we can safely collect detach_one_task() stats here rather
6320 * than inside detach_one_task(). 6559 * than inside detach_one_task().
6321 */ 6560 */
6322 schedstat_add(env->sd, lb_gained[env->idle], detached); 6561 schedstat_add(env->sd->lb_gained[env->idle], detached);
6323 6562
6324 return detached; 6563 return detached;
6325} 6564}
@@ -6647,7 +6886,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
6647 /* 6886 /*
6648 * !SD_OVERLAP domains can assume that child groups 6887 * !SD_OVERLAP domains can assume that child groups
6649 * span the current group. 6888 * span the current group.
6650 */ 6889 */
6651 6890
6652 group = child->groups; 6891 group = child->groups;
6653 do { 6892 do {
@@ -7147,7 +7386,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
7147 load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; 7386 load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
7148 if (load_above_capacity > busiest->group_capacity) { 7387 if (load_above_capacity > busiest->group_capacity) {
7149 load_above_capacity -= busiest->group_capacity; 7388 load_above_capacity -= busiest->group_capacity;
7150 load_above_capacity *= NICE_0_LOAD; 7389 load_above_capacity *= scale_load_down(NICE_0_LOAD);
7151 load_above_capacity /= busiest->group_capacity; 7390 load_above_capacity /= busiest->group_capacity;
7152 } else 7391 } else
7153 load_above_capacity = ~0UL; 7392 load_above_capacity = ~0UL;
@@ -7354,9 +7593,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
7354 */ 7593 */
7355#define MAX_PINNED_INTERVAL 512 7594#define MAX_PINNED_INTERVAL 512
7356 7595
7357/* Working cpumask for load_balance and load_balance_newidle. */
7358DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
7359
7360static int need_active_balance(struct lb_env *env) 7596static int need_active_balance(struct lb_env *env)
7361{ 7597{
7362 struct sched_domain *sd = env->sd; 7598 struct sched_domain *sd = env->sd;
@@ -7460,7 +7696,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
7460 7696
7461 cpumask_copy(cpus, cpu_active_mask); 7697 cpumask_copy(cpus, cpu_active_mask);
7462 7698
7463 schedstat_inc(sd, lb_count[idle]); 7699 schedstat_inc(sd->lb_count[idle]);
7464 7700
7465redo: 7701redo:
7466 if (!should_we_balance(&env)) { 7702 if (!should_we_balance(&env)) {
@@ -7470,19 +7706,19 @@ redo:
7470 7706
7471 group = find_busiest_group(&env); 7707 group = find_busiest_group(&env);
7472 if (!group) { 7708 if (!group) {
7473 schedstat_inc(sd, lb_nobusyg[idle]); 7709 schedstat_inc(sd->lb_nobusyg[idle]);
7474 goto out_balanced; 7710 goto out_balanced;
7475 } 7711 }
7476 7712
7477 busiest = find_busiest_queue(&env, group); 7713 busiest = find_busiest_queue(&env, group);
7478 if (!busiest) { 7714 if (!busiest) {
7479 schedstat_inc(sd, lb_nobusyq[idle]); 7715 schedstat_inc(sd->lb_nobusyq[idle]);
7480 goto out_balanced; 7716 goto out_balanced;
7481 } 7717 }
7482 7718
7483 BUG_ON(busiest == env.dst_rq); 7719 BUG_ON(busiest == env.dst_rq);
7484 7720
7485 schedstat_add(sd, lb_imbalance[idle], env.imbalance); 7721 schedstat_add(sd->lb_imbalance[idle], env.imbalance);
7486 7722
7487 env.src_cpu = busiest->cpu; 7723 env.src_cpu = busiest->cpu;
7488 env.src_rq = busiest; 7724 env.src_rq = busiest;
@@ -7589,7 +7825,7 @@ more_balance:
7589 } 7825 }
7590 7826
7591 if (!ld_moved) { 7827 if (!ld_moved) {
7592 schedstat_inc(sd, lb_failed[idle]); 7828 schedstat_inc(sd->lb_failed[idle]);
7593 /* 7829 /*
7594 * Increment the failure counter only on periodic balance. 7830 * Increment the failure counter only on periodic balance.
7595 * We do not want newidle balance, which can be very 7831 * We do not want newidle balance, which can be very
@@ -7672,7 +7908,7 @@ out_all_pinned:
7672 * we can't migrate them. Let the imbalance flag set so parent level 7908 * we can't migrate them. Let the imbalance flag set so parent level
7673 * can try to migrate them. 7909 * can try to migrate them.
7674 */ 7910 */
7675 schedstat_inc(sd, lb_balanced[idle]); 7911 schedstat_inc(sd->lb_balanced[idle]);
7676 7912
7677 sd->nr_balance_failed = 0; 7913 sd->nr_balance_failed = 0;
7678 7914
@@ -7704,11 +7940,12 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
7704} 7940}
7705 7941
7706static inline void 7942static inline void
7707update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) 7943update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
7708{ 7944{
7709 unsigned long interval, next; 7945 unsigned long interval, next;
7710 7946
7711 interval = get_sd_balance_interval(sd, cpu_busy); 7947 /* used by idle balance, so cpu_busy = 0 */
7948 interval = get_sd_balance_interval(sd, 0);
7712 next = sd->last_balance + interval; 7949 next = sd->last_balance + interval;
7713 7950
7714 if (time_after(*next_balance, next)) 7951 if (time_after(*next_balance, next))
@@ -7738,7 +7975,7 @@ static int idle_balance(struct rq *this_rq)
7738 rcu_read_lock(); 7975 rcu_read_lock();
7739 sd = rcu_dereference_check_sched_domain(this_rq->sd); 7976 sd = rcu_dereference_check_sched_domain(this_rq->sd);
7740 if (sd) 7977 if (sd)
7741 update_next_balance(sd, 0, &next_balance); 7978 update_next_balance(sd, &next_balance);
7742 rcu_read_unlock(); 7979 rcu_read_unlock();
7743 7980
7744 goto out; 7981 goto out;
@@ -7756,7 +7993,7 @@ static int idle_balance(struct rq *this_rq)
7756 continue; 7993 continue;
7757 7994
7758 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { 7995 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
7759 update_next_balance(sd, 0, &next_balance); 7996 update_next_balance(sd, &next_balance);
7760 break; 7997 break;
7761 } 7998 }
7762 7999
@@ -7774,7 +8011,7 @@ static int idle_balance(struct rq *this_rq)
7774 curr_cost += domain_cost; 8011 curr_cost += domain_cost;
7775 } 8012 }
7776 8013
7777 update_next_balance(sd, 0, &next_balance); 8014 update_next_balance(sd, &next_balance);
7778 8015
7779 /* 8016 /*
7780 * Stop searching for tasks to pull if there are 8017 * Stop searching for tasks to pull if there are
@@ -7864,15 +8101,15 @@ static int active_load_balance_cpu_stop(void *data)
7864 .idle = CPU_IDLE, 8101 .idle = CPU_IDLE,
7865 }; 8102 };
7866 8103
7867 schedstat_inc(sd, alb_count); 8104 schedstat_inc(sd->alb_count);
7868 8105
7869 p = detach_one_task(&env); 8106 p = detach_one_task(&env);
7870 if (p) { 8107 if (p) {
7871 schedstat_inc(sd, alb_pushed); 8108 schedstat_inc(sd->alb_pushed);
7872 /* Active balancing done, reset the failure counter. */ 8109 /* Active balancing done, reset the failure counter. */
7873 sd->nr_balance_failed = 0; 8110 sd->nr_balance_failed = 0;
7874 } else { 8111 } else {
7875 schedstat_inc(sd, alb_failed); 8112 schedstat_inc(sd->alb_failed);
7876 } 8113 }
7877 } 8114 }
7878 rcu_read_unlock(); 8115 rcu_read_unlock();
@@ -7964,13 +8201,13 @@ static inline void set_cpu_sd_state_busy(void)
7964 int cpu = smp_processor_id(); 8201 int cpu = smp_processor_id();
7965 8202
7966 rcu_read_lock(); 8203 rcu_read_lock();
7967 sd = rcu_dereference(per_cpu(sd_busy, cpu)); 8204 sd = rcu_dereference(per_cpu(sd_llc, cpu));
7968 8205
7969 if (!sd || !sd->nohz_idle) 8206 if (!sd || !sd->nohz_idle)
7970 goto unlock; 8207 goto unlock;
7971 sd->nohz_idle = 0; 8208 sd->nohz_idle = 0;
7972 8209
7973 atomic_inc(&sd->groups->sgc->nr_busy_cpus); 8210 atomic_inc(&sd->shared->nr_busy_cpus);
7974unlock: 8211unlock:
7975 rcu_read_unlock(); 8212 rcu_read_unlock();
7976} 8213}
@@ -7981,13 +8218,13 @@ void set_cpu_sd_state_idle(void)
7981 int cpu = smp_processor_id(); 8218 int cpu = smp_processor_id();
7982 8219
7983 rcu_read_lock(); 8220 rcu_read_lock();
7984 sd = rcu_dereference(per_cpu(sd_busy, cpu)); 8221 sd = rcu_dereference(per_cpu(sd_llc, cpu));
7985 8222
7986 if (!sd || sd->nohz_idle) 8223 if (!sd || sd->nohz_idle)
7987 goto unlock; 8224 goto unlock;
7988 sd->nohz_idle = 1; 8225 sd->nohz_idle = 1;
7989 8226
7990 atomic_dec(&sd->groups->sgc->nr_busy_cpus); 8227 atomic_dec(&sd->shared->nr_busy_cpus);
7991unlock: 8228unlock:
7992 rcu_read_unlock(); 8229 rcu_read_unlock();
7993} 8230}
@@ -8214,8 +8451,8 @@ end:
8214static inline bool nohz_kick_needed(struct rq *rq) 8451static inline bool nohz_kick_needed(struct rq *rq)
8215{ 8452{
8216 unsigned long now = jiffies; 8453 unsigned long now = jiffies;
8454 struct sched_domain_shared *sds;
8217 struct sched_domain *sd; 8455 struct sched_domain *sd;
8218 struct sched_group_capacity *sgc;
8219 int nr_busy, cpu = rq->cpu; 8456 int nr_busy, cpu = rq->cpu;
8220 bool kick = false; 8457 bool kick = false;
8221 8458
@@ -8243,11 +8480,13 @@ static inline bool nohz_kick_needed(struct rq *rq)
8243 return true; 8480 return true;
8244 8481
8245 rcu_read_lock(); 8482 rcu_read_lock();
8246 sd = rcu_dereference(per_cpu(sd_busy, cpu)); 8483 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
8247 if (sd) { 8484 if (sds) {
8248 sgc = sd->groups->sgc; 8485 /*
8249 nr_busy = atomic_read(&sgc->nr_busy_cpus); 8486 * XXX: write a coherent comment on why we do this.
8250 8487 * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
8488 */
8489 nr_busy = atomic_read(&sds->nr_busy_cpus);
8251 if (nr_busy > 1) { 8490 if (nr_busy > 1) {
8252 kick = true; 8491 kick = true;
8253 goto unlock; 8492 goto unlock;
@@ -8441,7 +8680,6 @@ static void detach_task_cfs_rq(struct task_struct *p)
8441 struct sched_entity *se = &p->se; 8680 struct sched_entity *se = &p->se;
8442 struct cfs_rq *cfs_rq = cfs_rq_of(se); 8681 struct cfs_rq *cfs_rq = cfs_rq_of(se);
8443 u64 now = cfs_rq_clock_task(cfs_rq); 8682 u64 now = cfs_rq_clock_task(cfs_rq);
8444 int tg_update;
8445 8683
8446 if (!vruntime_normalized(p)) { 8684 if (!vruntime_normalized(p)) {
8447 /* 8685 /*
@@ -8453,10 +8691,9 @@ static void detach_task_cfs_rq(struct task_struct *p)
8453 } 8691 }
8454 8692
8455 /* Catch up with the cfs_rq and remove our load when we leave */ 8693 /* Catch up with the cfs_rq and remove our load when we leave */
8456 tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); 8694 update_cfs_rq_load_avg(now, cfs_rq, false);
8457 detach_entity_load_avg(cfs_rq, se); 8695 detach_entity_load_avg(cfs_rq, se);
8458 if (tg_update) 8696 update_tg_load_avg(cfs_rq, false);
8459 update_tg_load_avg(cfs_rq, false);
8460} 8697}
8461 8698
8462static void attach_task_cfs_rq(struct task_struct *p) 8699static void attach_task_cfs_rq(struct task_struct *p)
@@ -8464,7 +8701,6 @@ static void attach_task_cfs_rq(struct task_struct *p)
8464 struct sched_entity *se = &p->se; 8701 struct sched_entity *se = &p->se;
8465 struct cfs_rq *cfs_rq = cfs_rq_of(se); 8702 struct cfs_rq *cfs_rq = cfs_rq_of(se);
8466 u64 now = cfs_rq_clock_task(cfs_rq); 8703 u64 now = cfs_rq_clock_task(cfs_rq);
8467 int tg_update;
8468 8704
8469#ifdef CONFIG_FAIR_GROUP_SCHED 8705#ifdef CONFIG_FAIR_GROUP_SCHED
8470 /* 8706 /*
@@ -8475,10 +8711,9 @@ static void attach_task_cfs_rq(struct task_struct *p)
8475#endif 8711#endif
8476 8712
8477 /* Synchronize task with its cfs_rq */ 8713 /* Synchronize task with its cfs_rq */
8478 tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); 8714 update_cfs_rq_load_avg(now, cfs_rq, false);
8479 attach_entity_load_avg(cfs_rq, se); 8715 attach_entity_load_avg(cfs_rq, se);
8480 if (tg_update) 8716 update_tg_load_avg(cfs_rq, false);
8481 update_tg_load_avg(cfs_rq, false);
8482 8717
8483 if (!vruntime_normalized(p)) 8718 if (!vruntime_normalized(p))
8484 se->vruntime += cfs_rq->min_vruntime; 8719 se->vruntime += cfs_rq->min_vruntime;