Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 775 |
1 file changed, 505 insertions, 270 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 039de34f1521..502e95a6e927 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -114,6 +114,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; | |||
114 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; | 114 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; |
115 | #endif | 115 | #endif |
116 | 116 | ||
117 | /* | ||
118 | * The margin used when comparing utilization with CPU capacity: | ||
119 | * util * 1024 < capacity * margin | ||
120 | */ | ||
121 | unsigned int capacity_margin = 1280; /* ~20% */ | ||
122 | |||
117 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | 123 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) |
118 | { | 124 | { |
119 | lw->weight += inc; | 125 | lw->weight += inc; |
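To make the new margin concrete: with capacity_margin = 1280, a utilization value fits a CPU only while it stays below 1024/1280, i.e. about 80% of that CPU's capacity, leaving roughly 20% headroom. A minimal sketch of the comparison (the helper name is illustrative and not part of the patch; wake_cap() further down open-codes the same test):

/* Illustrative only: true if @util leaves ~20% headroom on capacity @cap. */
static inline bool fits_capacity_margin(unsigned long util, unsigned long cap)
{
        return util * 1024 < cap * capacity_margin;     /* util < ~0.8 * cap */
}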
@@ -256,9 +262,7 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | |||
256 | 262 | ||
257 | static inline struct task_struct *task_of(struct sched_entity *se) | 263 | static inline struct task_struct *task_of(struct sched_entity *se) |
258 | { | 264 | { |
259 | #ifdef CONFIG_SCHED_DEBUG | 265 | SCHED_WARN_ON(!entity_is_task(se)); |
260 | WARN_ON_ONCE(!entity_is_task(se)); | ||
261 | #endif | ||
262 | return container_of(se, struct task_struct, se); | 266 | return container_of(se, struct task_struct, se); |
263 | } | 267 | } |
264 | 268 | ||
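SCHED_WARN_ON() comes from kernel/sched/sched.h, added elsewhere in this series and not shown in this diff; as an assumption, it folds the old #ifdef CONFIG_SCHED_DEBUG pattern into the macro itself, roughly:

/* Assumed definition, for context only: */
#ifdef CONFIG_SCHED_DEBUG
#define SCHED_WARN_ON(x)        WARN_ONCE(x, #x)
#else
#define SCHED_WARN_ON(x)        ((void)(x))
#endif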
@@ -456,17 +460,23 @@ static inline int entity_before(struct sched_entity *a, | |||
456 | 460 | ||
457 | static void update_min_vruntime(struct cfs_rq *cfs_rq) | 461 | static void update_min_vruntime(struct cfs_rq *cfs_rq) |
458 | { | 462 | { |
463 | struct sched_entity *curr = cfs_rq->curr; | ||
464 | |||
459 | u64 vruntime = cfs_rq->min_vruntime; | 465 | u64 vruntime = cfs_rq->min_vruntime; |
460 | 466 | ||
461 | if (cfs_rq->curr) | 467 | if (curr) { |
462 | vruntime = cfs_rq->curr->vruntime; | 468 | if (curr->on_rq) |
469 | vruntime = curr->vruntime; | ||
470 | else | ||
471 | curr = NULL; | ||
472 | } | ||
463 | 473 | ||
464 | if (cfs_rq->rb_leftmost) { | 474 | if (cfs_rq->rb_leftmost) { |
465 | struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, | 475 | struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, |
466 | struct sched_entity, | 476 | struct sched_entity, |
467 | run_node); | 477 | run_node); |
468 | 478 | ||
469 | if (!cfs_rq->curr) | 479 | if (!curr) |
470 | vruntime = se->vruntime; | 480 | vruntime = se->vruntime; |
471 | else | 481 | else |
472 | vruntime = min_vruntime(vruntime, se->vruntime); | 482 | vruntime = min_vruntime(vruntime, se->vruntime); |
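For context, the hunk ends before the function's closing lines; in the full file update_min_vruntime() finishes by clamping the value so it never moves backwards, roughly:

        /* Tail of update_min_vruntime(), shown for context only:
         * never let min_vruntime go backwards. */
        cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);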
@@ -656,7 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
656 | } | 666 | } |
657 | 667 | ||
658 | #ifdef CONFIG_SMP | 668 | #ifdef CONFIG_SMP |
659 | static int select_idle_sibling(struct task_struct *p, int cpu); | 669 | static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); |
660 | static unsigned long task_h_load(struct task_struct *p); | 670 | static unsigned long task_h_load(struct task_struct *p); |
661 | 671 | ||
662 | /* | 672 | /* |
@@ -726,7 +736,6 @@ void post_init_entity_util_avg(struct sched_entity *se) | |||
726 | struct sched_avg *sa = &se->avg; | 736 | struct sched_avg *sa = &se->avg; |
727 | long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; | 737 | long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; |
728 | u64 now = cfs_rq_clock_task(cfs_rq); | 738 | u64 now = cfs_rq_clock_task(cfs_rq); |
729 | int tg_update; | ||
730 | 739 | ||
731 | if (cap > 0) { | 740 | if (cap > 0) { |
732 | if (cfs_rq->avg.util_avg != 0) { | 741 | if (cfs_rq->avg.util_avg != 0) { |
@@ -759,10 +768,9 @@ void post_init_entity_util_avg(struct sched_entity *se) | |||
759 | } | 768 | } |
760 | } | 769 | } |
761 | 770 | ||
762 | tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); | 771 | update_cfs_rq_load_avg(now, cfs_rq, false); |
763 | attach_entity_load_avg(cfs_rq, se); | 772 | attach_entity_load_avg(cfs_rq, se); |
764 | if (tg_update) | 773 | update_tg_load_avg(cfs_rq, false); |
765 | update_tg_load_avg(cfs_rq, false); | ||
766 | } | 774 | } |
767 | 775 | ||
768 | #else /* !CONFIG_SMP */ | 776 | #else /* !CONFIG_SMP */ |
@@ -799,7 +807,7 @@ static void update_curr(struct cfs_rq *cfs_rq) | |||
799 | max(delta_exec, curr->statistics.exec_max)); | 807 | max(delta_exec, curr->statistics.exec_max)); |
800 | 808 | ||
801 | curr->sum_exec_runtime += delta_exec; | 809 | curr->sum_exec_runtime += delta_exec; |
802 | schedstat_add(cfs_rq, exec_clock, delta_exec); | 810 | schedstat_add(cfs_rq->exec_clock, delta_exec); |
803 | 811 | ||
804 | curr->vruntime += calc_delta_fair(delta_exec, curr); | 812 | curr->vruntime += calc_delta_fair(delta_exec, curr); |
805 | update_min_vruntime(cfs_rq); | 813 | update_min_vruntime(cfs_rq); |
@@ -820,26 +828,34 @@ static void update_curr_fair(struct rq *rq) | |||
820 | update_curr(cfs_rq_of(&rq->curr->se)); | 828 | update_curr(cfs_rq_of(&rq->curr->se)); |
821 | } | 829 | } |
822 | 830 | ||
823 | #ifdef CONFIG_SCHEDSTATS | ||
824 | static inline void | 831 | static inline void |
825 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | 832 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) |
826 | { | 833 | { |
827 | u64 wait_start = rq_clock(rq_of(cfs_rq)); | 834 | u64 wait_start, prev_wait_start; |
835 | |||
836 | if (!schedstat_enabled()) | ||
837 | return; | ||
838 | |||
839 | wait_start = rq_clock(rq_of(cfs_rq)); | ||
840 | prev_wait_start = schedstat_val(se->statistics.wait_start); | ||
828 | 841 | ||
829 | if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && | 842 | if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && |
830 | likely(wait_start > se->statistics.wait_start)) | 843 | likely(wait_start > prev_wait_start)) |
831 | wait_start -= se->statistics.wait_start; | 844 | wait_start -= prev_wait_start; |
832 | 845 | ||
833 | se->statistics.wait_start = wait_start; | 846 | schedstat_set(se->statistics.wait_start, wait_start); |
834 | } | 847 | } |
835 | 848 | ||
836 | static void | 849 | static inline void |
837 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | 850 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) |
838 | { | 851 | { |
839 | struct task_struct *p; | 852 | struct task_struct *p; |
840 | u64 delta; | 853 | u64 delta; |
841 | 854 | ||
842 | delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; | 855 | if (!schedstat_enabled()) |
856 | return; | ||
857 | |||
858 | delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); | ||
843 | 859 | ||
844 | if (entity_is_task(se)) { | 860 | if (entity_is_task(se)) { |
845 | p = task_of(se); | 861 | p = task_of(se); |
@@ -849,35 +865,114 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
849 | * time stamp can be adjusted to accumulate wait time | 865 | * time stamp can be adjusted to accumulate wait time |
850 | * prior to migration. | 866 | * prior to migration. |
851 | */ | 867 | */ |
852 | se->statistics.wait_start = delta; | 868 | schedstat_set(se->statistics.wait_start, delta); |
853 | return; | 869 | return; |
854 | } | 870 | } |
855 | trace_sched_stat_wait(p, delta); | 871 | trace_sched_stat_wait(p, delta); |
856 | } | 872 | } |
857 | 873 | ||
858 | se->statistics.wait_max = max(se->statistics.wait_max, delta); | 874 | schedstat_set(se->statistics.wait_max, |
859 | se->statistics.wait_count++; | 875 | max(schedstat_val(se->statistics.wait_max), delta)); |
860 | se->statistics.wait_sum += delta; | 876 | schedstat_inc(se->statistics.wait_count); |
861 | se->statistics.wait_start = 0; | 877 | schedstat_add(se->statistics.wait_sum, delta); |
878 | schedstat_set(se->statistics.wait_start, 0); | ||
879 | } | ||
880 | |||
881 | static inline void | ||
882 | update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
883 | { | ||
884 | struct task_struct *tsk = NULL; | ||
885 | u64 sleep_start, block_start; | ||
886 | |||
887 | if (!schedstat_enabled()) | ||
888 | return; | ||
889 | |||
890 | sleep_start = schedstat_val(se->statistics.sleep_start); | ||
891 | block_start = schedstat_val(se->statistics.block_start); | ||
892 | |||
893 | if (entity_is_task(se)) | ||
894 | tsk = task_of(se); | ||
895 | |||
896 | if (sleep_start) { | ||
897 | u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; | ||
898 | |||
899 | if ((s64)delta < 0) | ||
900 | delta = 0; | ||
901 | |||
902 | if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) | ||
903 | schedstat_set(se->statistics.sleep_max, delta); | ||
904 | |||
905 | schedstat_set(se->statistics.sleep_start, 0); | ||
906 | schedstat_add(se->statistics.sum_sleep_runtime, delta); | ||
907 | |||
908 | if (tsk) { | ||
909 | account_scheduler_latency(tsk, delta >> 10, 1); | ||
910 | trace_sched_stat_sleep(tsk, delta); | ||
911 | } | ||
912 | } | ||
913 | if (block_start) { | ||
914 | u64 delta = rq_clock(rq_of(cfs_rq)) - block_start; | ||
915 | |||
916 | if ((s64)delta < 0) | ||
917 | delta = 0; | ||
918 | |||
919 | if (unlikely(delta > schedstat_val(se->statistics.block_max))) | ||
920 | schedstat_set(se->statistics.block_max, delta); | ||
921 | |||
922 | schedstat_set(se->statistics.block_start, 0); | ||
923 | schedstat_add(se->statistics.sum_sleep_runtime, delta); | ||
924 | |||
925 | if (tsk) { | ||
926 | if (tsk->in_iowait) { | ||
927 | schedstat_add(se->statistics.iowait_sum, delta); | ||
928 | schedstat_inc(se->statistics.iowait_count); | ||
929 | trace_sched_stat_iowait(tsk, delta); | ||
930 | } | ||
931 | |||
932 | trace_sched_stat_blocked(tsk, delta); | ||
933 | |||
934 | /* | ||
935 | * Blocking time is in units of nanosecs, so shift by | ||
936 | * 20 to get a milliseconds-range estimation of the | ||
937 | * amount of time that the task spent sleeping: | ||
938 | */ | ||
939 | if (unlikely(prof_on == SLEEP_PROFILING)) { | ||
940 | profile_hits(SLEEP_PROFILING, | ||
941 | (void *)get_wchan(tsk), | ||
942 | delta >> 20); | ||
943 | } | ||
944 | account_scheduler_latency(tsk, delta >> 10, 0); | ||
945 | } | ||
946 | } | ||
862 | } | 947 | } |
863 | 948 | ||
864 | /* | 949 | /* |
865 | * Task is being enqueued - update stats: | 950 | * Task is being enqueued - update stats: |
866 | */ | 951 | */ |
867 | static inline void | 952 | static inline void |
868 | update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 953 | update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
869 | { | 954 | { |
955 | if (!schedstat_enabled()) | ||
956 | return; | ||
957 | |||
870 | /* | 958 | /* |
871 | * Are we enqueueing a waiting task? (for current tasks | 959 | * Are we enqueueing a waiting task? (for current tasks |
872 | * a dequeue/enqueue event is a NOP) | 960 | * a dequeue/enqueue event is a NOP) |
873 | */ | 961 | */ |
874 | if (se != cfs_rq->curr) | 962 | if (se != cfs_rq->curr) |
875 | update_stats_wait_start(cfs_rq, se); | 963 | update_stats_wait_start(cfs_rq, se); |
964 | |||
965 | if (flags & ENQUEUE_WAKEUP) | ||
966 | update_stats_enqueue_sleeper(cfs_rq, se); | ||
876 | } | 967 | } |
877 | 968 | ||
878 | static inline void | 969 | static inline void |
879 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | 970 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
880 | { | 971 | { |
972 | |||
973 | if (!schedstat_enabled()) | ||
974 | return; | ||
975 | |||
881 | /* | 976 | /* |
882 | * Mark the end of the wait period if dequeueing a | 977 | * Mark the end of the wait period if dequeueing a |
883 | * waiting task: | 978 | * waiting task: |
@@ -885,40 +980,18 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
885 | if (se != cfs_rq->curr) | 980 | if (se != cfs_rq->curr) |
886 | update_stats_wait_end(cfs_rq, se); | 981 | update_stats_wait_end(cfs_rq, se); |
887 | 982 | ||
888 | if (flags & DEQUEUE_SLEEP) { | 983 | if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { |
889 | if (entity_is_task(se)) { | 984 | struct task_struct *tsk = task_of(se); |
890 | struct task_struct *tsk = task_of(se); | ||
891 | 985 | ||
892 | if (tsk->state & TASK_INTERRUPTIBLE) | 986 | if (tsk->state & TASK_INTERRUPTIBLE) |
893 | se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); | 987 | schedstat_set(se->statistics.sleep_start, |
894 | if (tsk->state & TASK_UNINTERRUPTIBLE) | 988 | rq_clock(rq_of(cfs_rq))); |
895 | se->statistics.block_start = rq_clock(rq_of(cfs_rq)); | 989 | if (tsk->state & TASK_UNINTERRUPTIBLE) |
896 | } | 990 | schedstat_set(se->statistics.block_start, |
991 | rq_clock(rq_of(cfs_rq))); | ||
897 | } | 992 | } |
898 | |||
899 | } | ||
900 | #else | ||
901 | static inline void | ||
902 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
903 | { | ||
904 | } | ||
905 | |||
906 | static inline void | ||
907 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
908 | { | ||
909 | } | 993 | } |
910 | 994 | ||
911 | static inline void | ||
912 | update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
913 | { | ||
914 | } | ||
915 | |||
916 | static inline void | ||
917 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | ||
918 | { | ||
919 | } | ||
920 | #endif | ||
921 | |||
922 | /* | 995 | /* |
923 | * We are picking a new current task - update its stats: | 996 | * We are picking a new current task - update its stats: |
924 | */ | 997 | */ |
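The conversions in the hunks above lean on the reworked single-expression schedstat helpers, which are defined outside this file. As an assumption of their semantics (the exact definitions live in the stats header touched earlier in this series), each takes an l-value expression, touches it only when schedstats are enabled, and compiles to a no-op otherwise, roughly:

/* Assumed helper semantics, for reference only: */
#ifdef CONFIG_SCHEDSTATS
#define schedstat_enabled()     static_branch_unlikely(&sched_schedstats)
#define schedstat_inc(var)      do { if (schedstat_enabled()) { var++; } } while (0)
#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0)
#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
#define schedstat_val(var)      (var)
#else
#define schedstat_enabled()     0
#define schedstat_inc(var)      do { } while (0)
#define schedstat_add(var, amt) do { } while (0)
#define schedstat_set(var, val) do { } while (0)
#define schedstat_val(var)      0
#endif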
@@ -1513,8 +1586,16 @@ balance: | |||
1513 | * One idle CPU per node is evaluated for a task numa move. | 1586 | * One idle CPU per node is evaluated for a task numa move. |
1514 | * Call select_idle_sibling to maybe find a better one. | 1587 | * Call select_idle_sibling to maybe find a better one. |
1515 | */ | 1588 | */ |
1516 | if (!cur) | 1589 | if (!cur) { |
1517 | env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); | 1590 | /* |
1591 | * select_idle_siblings() uses a per-cpu cpumask that | ||

1592 | * can be used from IRQ context. | ||
1593 | */ | ||
1594 | local_irq_disable(); | ||
1595 | env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, | ||
1596 | env->dst_cpu); | ||
1597 | local_irq_enable(); | ||
1598 | } | ||
1518 | 1599 | ||
1519 | assign: | 1600 | assign: |
1520 | task_numa_assign(env, cur, imp); | 1601 | task_numa_assign(env, cur, imp); |
@@ -2292,7 +2373,7 @@ void task_numa_work(struct callback_head *work) | |||
2292 | unsigned long nr_pte_updates = 0; | 2373 | unsigned long nr_pte_updates = 0; |
2293 | long pages, virtpages; | 2374 | long pages, virtpages; |
2294 | 2375 | ||
2295 | WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); | 2376 | SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); |
2296 | 2377 | ||
2297 | work->next = work; /* protect against double add */ | 2378 | work->next = work; /* protect against double add */ |
2298 | /* | 2379 | /* |
@@ -2803,9 +2884,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, | |||
2803 | } | 2884 | } |
2804 | 2885 | ||
2805 | #ifdef CONFIG_FAIR_GROUP_SCHED | 2886 | #ifdef CONFIG_FAIR_GROUP_SCHED |
2806 | /* | 2887 | /** |
2807 | * Updating tg's load_avg is necessary before update_cfs_share (which is done) | 2888 | * update_tg_load_avg - update the tg's load avg |
2808 | * and effective_load (which is not done because it is too costly). | 2889 | * @cfs_rq: the cfs_rq whose avg changed |
2890 | * @force: update regardless of how small the difference | ||
2891 | * | ||
2892 | * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. | ||
2893 | * However, because tg->load_avg is a global value there are performance | ||
2894 | * considerations. | ||
2895 | * | ||
2896 | * In order to avoid having to look at the other cfs_rq's, we use a | ||
2897 | * differential update where we store the last value we propagated. This in | ||
2898 | * turn allows skipping updates if the differential is 'small'. | ||
2899 | * | ||
2900 | * Updating tg's load_avg is necessary before update_cfs_share() (which is | ||
2901 | * done) and effective_load() (which is not done because it is too costly). | ||
2809 | */ | 2902 | */ |
2810 | static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) | 2903 | static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) |
2811 | { | 2904 | { |
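The comment above describes a differential update; as a sketch of what the body behind this brace does (the 1/64 threshold is an assumption of what 'small' means in practice):

        /* Sketch only: propagate the delta since the last value we pushed,
         * and skip the global atomic while the change stays below ~1.5%. */
        long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;

        if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
                atomic_long_add(delta, &cfs_rq->tg->load_avg);
                cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
        }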
@@ -2875,12 +2968,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} | |||
2875 | 2968 | ||
2876 | static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | 2969 | static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) |
2877 | { | 2970 | { |
2878 | struct rq *rq = rq_of(cfs_rq); | 2971 | if (&this_rq()->cfs == cfs_rq) { |
2879 | int cpu = cpu_of(rq); | ||
2880 | |||
2881 | if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { | ||
2882 | unsigned long max = rq->cpu_capacity_orig; | ||
2883 | |||
2884 | /* | 2972 | /* |
2885 | * There are a few boundary cases this might miss but it should | 2973 | * There are a few boundary cases this might miss but it should |
2886 | * get called often enough that that should (hopefully) not be | 2974 | * get called often enough that that should (hopefully) not be |
@@ -2897,8 +2985,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | |||
2897 | * | 2985 | * |
2898 | * See cpu_util(). | 2986 | * See cpu_util(). |
2899 | */ | 2987 | */ |
2900 | cpufreq_update_util(rq_clock(rq), | 2988 | cpufreq_update_util(rq_of(cfs_rq), 0); |
2901 | min(cfs_rq->avg.util_avg, max), max); | ||
2902 | } | 2989 | } |
2903 | } | 2990 | } |
2904 | 2991 | ||
@@ -2931,10 +3018,10 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | |||
2931 | * | 3018 | * |
2932 | * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. | 3019 | * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. |
2933 | * | 3020 | * |
2934 | * Returns true if the load decayed or we removed utilization. It is expected | 3021 | * Returns true if the load decayed or we removed load. |
2935 | * that one calls update_tg_load_avg() on this condition, but after you've | 3022 | * |
2936 | * modified the cfs_rq avg (attach/detach), such that we propagate the new | 3023 | * Since both these conditions indicate a changed cfs_rq->avg.load we should |
2937 | * avg up. | 3024 | * call update_tg_load_avg() when this function returns true. |
2938 | */ | 3025 | */ |
2939 | static inline int | 3026 | static inline int |
2940 | update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) | 3027 | update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) |
@@ -3159,10 +3246,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) | |||
3159 | 3246 | ||
3160 | static inline void update_load_avg(struct sched_entity *se, int not_used) | 3247 | static inline void update_load_avg(struct sched_entity *se, int not_used) |
3161 | { | 3248 | { |
3162 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 3249 | cpufreq_update_util(rq_of(cfs_rq_of(se)), 0); |
3163 | struct rq *rq = rq_of(cfs_rq); | ||
3164 | |||
3165 | cpufreq_trigger_update(rq_clock(rq)); | ||
3166 | } | 3250 | } |
3167 | 3251 | ||
3168 | static inline void | 3252 | static inline void |
@@ -3183,68 +3267,6 @@ static inline int idle_balance(struct rq *rq) | |||
3183 | 3267 | ||
3184 | #endif /* CONFIG_SMP */ | 3268 | #endif /* CONFIG_SMP */ |
3185 | 3269 | ||
3186 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
3187 | { | ||
3188 | #ifdef CONFIG_SCHEDSTATS | ||
3189 | struct task_struct *tsk = NULL; | ||
3190 | |||
3191 | if (entity_is_task(se)) | ||
3192 | tsk = task_of(se); | ||
3193 | |||
3194 | if (se->statistics.sleep_start) { | ||
3195 | u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start; | ||
3196 | |||
3197 | if ((s64)delta < 0) | ||
3198 | delta = 0; | ||
3199 | |||
3200 | if (unlikely(delta > se->statistics.sleep_max)) | ||
3201 | se->statistics.sleep_max = delta; | ||
3202 | |||
3203 | se->statistics.sleep_start = 0; | ||
3204 | se->statistics.sum_sleep_runtime += delta; | ||
3205 | |||
3206 | if (tsk) { | ||
3207 | account_scheduler_latency(tsk, delta >> 10, 1); | ||
3208 | trace_sched_stat_sleep(tsk, delta); | ||
3209 | } | ||
3210 | } | ||
3211 | if (se->statistics.block_start) { | ||
3212 | u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start; | ||
3213 | |||
3214 | if ((s64)delta < 0) | ||
3215 | delta = 0; | ||
3216 | |||
3217 | if (unlikely(delta > se->statistics.block_max)) | ||
3218 | se->statistics.block_max = delta; | ||
3219 | |||
3220 | se->statistics.block_start = 0; | ||
3221 | se->statistics.sum_sleep_runtime += delta; | ||
3222 | |||
3223 | if (tsk) { | ||
3224 | if (tsk->in_iowait) { | ||
3225 | se->statistics.iowait_sum += delta; | ||
3226 | se->statistics.iowait_count++; | ||
3227 | trace_sched_stat_iowait(tsk, delta); | ||
3228 | } | ||
3229 | |||
3230 | trace_sched_stat_blocked(tsk, delta); | ||
3231 | |||
3232 | /* | ||
3233 | * Blocking time is in units of nanosecs, so shift by | ||
3234 | * 20 to get a milliseconds-range estimation of the | ||
3235 | * amount of time that the task spent sleeping: | ||
3236 | */ | ||
3237 | if (unlikely(prof_on == SLEEP_PROFILING)) { | ||
3238 | profile_hits(SLEEP_PROFILING, | ||
3239 | (void *)get_wchan(tsk), | ||
3240 | delta >> 20); | ||
3241 | } | ||
3242 | account_scheduler_latency(tsk, delta >> 10, 0); | ||
3243 | } | ||
3244 | } | ||
3245 | #endif | ||
3246 | } | ||
3247 | |||
3248 | static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) | 3270 | static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) |
3249 | { | 3271 | { |
3250 | #ifdef CONFIG_SCHED_DEBUG | 3272 | #ifdef CONFIG_SCHED_DEBUG |
@@ -3254,7 +3276,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
3254 | d = -d; | 3276 | d = -d; |
3255 | 3277 | ||
3256 | if (d > 3*sysctl_sched_latency) | 3278 | if (d > 3*sysctl_sched_latency) |
3257 | schedstat_inc(cfs_rq, nr_spread_over); | 3279 | schedstat_inc(cfs_rq->nr_spread_over); |
3258 | #endif | 3280 | #endif |
3259 | } | 3281 | } |
3260 | 3282 | ||
@@ -3371,17 +3393,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3371 | account_entity_enqueue(cfs_rq, se); | 3393 | account_entity_enqueue(cfs_rq, se); |
3372 | update_cfs_shares(cfs_rq); | 3394 | update_cfs_shares(cfs_rq); |
3373 | 3395 | ||
3374 | if (flags & ENQUEUE_WAKEUP) { | 3396 | if (flags & ENQUEUE_WAKEUP) |
3375 | place_entity(cfs_rq, se, 0); | 3397 | place_entity(cfs_rq, se, 0); |
3376 | if (schedstat_enabled()) | ||
3377 | enqueue_sleeper(cfs_rq, se); | ||
3378 | } | ||
3379 | 3398 | ||
3380 | check_schedstat_required(); | 3399 | check_schedstat_required(); |
3381 | if (schedstat_enabled()) { | 3400 | update_stats_enqueue(cfs_rq, se, flags); |
3382 | update_stats_enqueue(cfs_rq, se); | 3401 | check_spread(cfs_rq, se); |
3383 | check_spread(cfs_rq, se); | ||
3384 | } | ||
3385 | if (!curr) | 3402 | if (!curr) |
3386 | __enqueue_entity(cfs_rq, se); | 3403 | __enqueue_entity(cfs_rq, se); |
3387 | se->on_rq = 1; | 3404 | se->on_rq = 1; |
@@ -3448,8 +3465,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3448 | update_curr(cfs_rq); | 3465 | update_curr(cfs_rq); |
3449 | dequeue_entity_load_avg(cfs_rq, se); | 3466 | dequeue_entity_load_avg(cfs_rq, se); |
3450 | 3467 | ||
3451 | if (schedstat_enabled()) | 3468 | update_stats_dequeue(cfs_rq, se, flags); |
3452 | update_stats_dequeue(cfs_rq, se, flags); | ||
3453 | 3469 | ||
3454 | clear_buddies(cfs_rq, se); | 3470 | clear_buddies(cfs_rq, se); |
3455 | 3471 | ||
@@ -3459,9 +3475,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3459 | account_entity_dequeue(cfs_rq, se); | 3475 | account_entity_dequeue(cfs_rq, se); |
3460 | 3476 | ||
3461 | /* | 3477 | /* |
3462 | * Normalize the entity after updating the min_vruntime because the | 3478 | * Normalize after update_curr(); which will also have moved |
3463 | * update can refer to the ->curr item and we need to reflect this | 3479 | * min_vruntime if @se is the one holding it back. But before doing |
3464 | * movement in our normalized position. | 3480 | * update_min_vruntime() again, which will discount @se's position and |
3481 | * can move min_vruntime forward still more. | ||
3465 | */ | 3482 | */ |
3466 | if (!(flags & DEQUEUE_SLEEP)) | 3483 | if (!(flags & DEQUEUE_SLEEP)) |
3467 | se->vruntime -= cfs_rq->min_vruntime; | 3484 | se->vruntime -= cfs_rq->min_vruntime; |
@@ -3469,8 +3486,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3469 | /* return excess runtime on last dequeue */ | 3486 | /* return excess runtime on last dequeue */ |
3470 | return_cfs_rq_runtime(cfs_rq); | 3487 | return_cfs_rq_runtime(cfs_rq); |
3471 | 3488 | ||
3472 | update_min_vruntime(cfs_rq); | ||
3473 | update_cfs_shares(cfs_rq); | 3489 | update_cfs_shares(cfs_rq); |
3490 | |||
3491 | /* | ||
3492 | * Now advance min_vruntime if @se was the entity holding it back, | ||
3493 | * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be | ||
3494 | * put back on, and if we advance min_vruntime, we'll be placed back | ||
3495 | * further than we started -- ie. we'll be penalized. | ||
3496 | */ | ||
3497 | if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) | ||
3498 | update_min_vruntime(cfs_rq); | ||
3474 | } | 3499 | } |
3475 | 3500 | ||
3476 | /* | 3501 | /* |
@@ -3523,25 +3548,25 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
3523 | * a CPU. So account for the time it spent waiting on the | 3548 | * a CPU. So account for the time it spent waiting on the |
3524 | * runqueue. | 3549 | * runqueue. |
3525 | */ | 3550 | */ |
3526 | if (schedstat_enabled()) | 3551 | update_stats_wait_end(cfs_rq, se); |
3527 | update_stats_wait_end(cfs_rq, se); | ||
3528 | __dequeue_entity(cfs_rq, se); | 3552 | __dequeue_entity(cfs_rq, se); |
3529 | update_load_avg(se, 1); | 3553 | update_load_avg(se, 1); |
3530 | } | 3554 | } |
3531 | 3555 | ||
3532 | update_stats_curr_start(cfs_rq, se); | 3556 | update_stats_curr_start(cfs_rq, se); |
3533 | cfs_rq->curr = se; | 3557 | cfs_rq->curr = se; |
3534 | #ifdef CONFIG_SCHEDSTATS | 3558 | |
3535 | /* | 3559 | /* |
3536 | * Track our maximum slice length, if the CPU's load is at | 3560 | * Track our maximum slice length, if the CPU's load is at |
3537 | * least twice that of our own weight (i.e. dont track it | 3561 | * least twice that of our own weight (i.e. dont track it |
3538 | * when there are only lesser-weight tasks around): | 3562 | * when there are only lesser-weight tasks around): |
3539 | */ | 3563 | */ |
3540 | if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { | 3564 | if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { |
3541 | se->statistics.slice_max = max(se->statistics.slice_max, | 3565 | schedstat_set(se->statistics.slice_max, |
3542 | se->sum_exec_runtime - se->prev_sum_exec_runtime); | 3566 | max((u64)schedstat_val(se->statistics.slice_max), |
3567 | se->sum_exec_runtime - se->prev_sum_exec_runtime)); | ||
3543 | } | 3568 | } |
3544 | #endif | 3569 | |
3545 | se->prev_sum_exec_runtime = se->sum_exec_runtime; | 3570 | se->prev_sum_exec_runtime = se->sum_exec_runtime; |
3546 | } | 3571 | } |
3547 | 3572 | ||
@@ -3620,13 +3645,10 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
3620 | /* throttle cfs_rqs exceeding runtime */ | 3645 | /* throttle cfs_rqs exceeding runtime */ |
3621 | check_cfs_rq_runtime(cfs_rq); | 3646 | check_cfs_rq_runtime(cfs_rq); |
3622 | 3647 | ||
3623 | if (schedstat_enabled()) { | 3648 | check_spread(cfs_rq, prev); |
3624 | check_spread(cfs_rq, prev); | ||
3625 | if (prev->on_rq) | ||
3626 | update_stats_wait_start(cfs_rq, prev); | ||
3627 | } | ||
3628 | 3649 | ||
3629 | if (prev->on_rq) { | 3650 | if (prev->on_rq) { |
3651 | update_stats_wait_start(cfs_rq, prev); | ||
3630 | /* Put 'current' back into the tree. */ | 3652 | /* Put 'current' back into the tree. */ |
3631 | __enqueue_entity(cfs_rq, prev); | 3653 | __enqueue_entity(cfs_rq, prev); |
3632 | /* in !on_rq case, update occurred at dequeue */ | 3654 | /* in !on_rq case, update occurred at dequeue */ |
@@ -4456,9 +4478,9 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | |||
4456 | struct sched_entity *se = &p->se; | 4478 | struct sched_entity *se = &p->se; |
4457 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 4479 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
4458 | 4480 | ||
4459 | WARN_ON(task_rq(p) != rq); | 4481 | SCHED_WARN_ON(task_rq(p) != rq); |
4460 | 4482 | ||
4461 | if (cfs_rq->nr_running > 1) { | 4483 | if (rq->cfs.h_nr_running > 1) { |
4462 | u64 slice = sched_slice(cfs_rq, se); | 4484 | u64 slice = sched_slice(cfs_rq, se); |
4463 | u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; | 4485 | u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; |
4464 | s64 delta = slice - ran; | 4486 | s64 delta = slice - ran; |
@@ -4509,6 +4531,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
4509 | struct cfs_rq *cfs_rq; | 4531 | struct cfs_rq *cfs_rq; |
4510 | struct sched_entity *se = &p->se; | 4532 | struct sched_entity *se = &p->se; |
4511 | 4533 | ||
4534 | /* | ||
4535 | * If in_iowait is set, the code below may not trigger any cpufreq | ||
4536 | * utilization updates, so do it here explicitly with the IOWAIT flag | ||
4537 | * passed. | ||
4538 | */ | ||
4539 | if (p->in_iowait) | ||
4540 | cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); | ||
4541 | |||
4512 | for_each_sched_entity(se) { | 4542 | for_each_sched_entity(se) { |
4513 | if (se->on_rq) | 4543 | if (se->on_rq) |
4514 | break; | 4544 | break; |
@@ -4605,6 +4635,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
4605 | } | 4635 | } |
4606 | 4636 | ||
4607 | #ifdef CONFIG_SMP | 4637 | #ifdef CONFIG_SMP |
4638 | |||
4639 | /* Working cpumask for: load_balance, load_balance_newidle. */ | ||
4640 | DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); | ||
4641 | DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); | ||
4642 | |||
4608 | #ifdef CONFIG_NO_HZ_COMMON | 4643 | #ifdef CONFIG_NO_HZ_COMMON |
4609 | /* | 4644 | /* |
4610 | * per rq 'load' arrray crap; XXX kill this. | 4645 | * per rq 'load' arrray crap; XXX kill this. |
@@ -5006,9 +5041,9 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
5006 | * wl = S * s'_i; see (2) | 5041 | * wl = S * s'_i; see (2) |
5007 | */ | 5042 | */ |
5008 | if (W > 0 && w < W) | 5043 | if (W > 0 && w < W) |
5009 | wl = (w * (long)tg->shares) / W; | 5044 | wl = (w * (long)scale_load_down(tg->shares)) / W; |
5010 | else | 5045 | else |
5011 | wl = tg->shares; | 5046 | wl = scale_load_down(tg->shares); |
5012 | 5047 | ||
5013 | /* | 5048 | /* |
5014 | * Per the above, wl is the new se->load.weight value; since | 5049 | * Per the above, wl is the new se->load.weight value; since |
@@ -5091,18 +5126,18 @@ static int wake_wide(struct task_struct *p) | |||
5091 | return 1; | 5126 | return 1; |
5092 | } | 5127 | } |
5093 | 5128 | ||
5094 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | 5129 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, |
5130 | int prev_cpu, int sync) | ||
5095 | { | 5131 | { |
5096 | s64 this_load, load; | 5132 | s64 this_load, load; |
5097 | s64 this_eff_load, prev_eff_load; | 5133 | s64 this_eff_load, prev_eff_load; |
5098 | int idx, this_cpu, prev_cpu; | 5134 | int idx, this_cpu; |
5099 | struct task_group *tg; | 5135 | struct task_group *tg; |
5100 | unsigned long weight; | 5136 | unsigned long weight; |
5101 | int balanced; | 5137 | int balanced; |
5102 | 5138 | ||
5103 | idx = sd->wake_idx; | 5139 | idx = sd->wake_idx; |
5104 | this_cpu = smp_processor_id(); | 5140 | this_cpu = smp_processor_id(); |
5105 | prev_cpu = task_cpu(p); | ||
5106 | load = source_load(prev_cpu, idx); | 5141 | load = source_load(prev_cpu, idx); |
5107 | this_load = target_load(this_cpu, idx); | 5142 | this_load = target_load(this_cpu, idx); |
5108 | 5143 | ||
@@ -5146,13 +5181,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
5146 | 5181 | ||
5147 | balanced = this_eff_load <= prev_eff_load; | 5182 | balanced = this_eff_load <= prev_eff_load; |
5148 | 5183 | ||
5149 | schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); | 5184 | schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); |
5150 | 5185 | ||
5151 | if (!balanced) | 5186 | if (!balanced) |
5152 | return 0; | 5187 | return 0; |
5153 | 5188 | ||
5154 | schedstat_inc(sd, ttwu_move_affine); | 5189 | schedstat_inc(sd->ttwu_move_affine); |
5155 | schedstat_inc(p, se.statistics.nr_wakeups_affine); | 5190 | schedstat_inc(p->se.statistics.nr_wakeups_affine); |
5156 | 5191 | ||
5157 | return 1; | 5192 | return 1; |
5158 | } | 5193 | } |
@@ -5228,6 +5263,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
5228 | int shallowest_idle_cpu = -1; | 5263 | int shallowest_idle_cpu = -1; |
5229 | int i; | 5264 | int i; |
5230 | 5265 | ||
5266 | /* Check if we have any choice: */ | ||
5267 | if (group->group_weight == 1) | ||
5268 | return cpumask_first(sched_group_cpus(group)); | ||
5269 | |||
5231 | /* Traverse only the allowed CPUs */ | 5270 | /* Traverse only the allowed CPUs */ |
5232 | for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { | 5271 | for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { |
5233 | if (idle_cpu(i)) { | 5272 | if (idle_cpu(i)) { |
@@ -5265,64 +5304,237 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
5265 | } | 5304 | } |
5266 | 5305 | ||
5267 | /* | 5306 | /* |
5268 | * Try and locate an idle CPU in the sched_domain. | 5307 | * Implement a for_each_cpu() variant that starts the scan at a given cpu |
5308 | * (@start), and wraps around. | ||
5309 | * | ||
5310 | * This is used to scan for idle CPUs; such that not all CPUs looking for an | ||
5311 | * idle CPU find the same CPU. The down-side is that tasks tend to cycle | ||
5312 | * through the LLC domain. | ||
5313 | * | ||
5314 | * Especially tbench is found sensitive to this. | ||
5315 | */ | ||
5316 | |||
5317 | static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped) | ||
5318 | { | ||
5319 | int next; | ||
5320 | |||
5321 | again: | ||
5322 | next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1); | ||
5323 | |||
5324 | if (*wrapped) { | ||
5325 | if (next >= start) | ||
5326 | return nr_cpumask_bits; | ||
5327 | } else { | ||
5328 | if (next >= nr_cpumask_bits) { | ||
5329 | *wrapped = 1; | ||
5330 | n = -1; | ||
5331 | goto again; | ||
5332 | } | ||
5333 | } | ||
5334 | |||
5335 | return next; | ||
5336 | } | ||
5337 | |||
5338 | #define for_each_cpu_wrap(cpu, mask, start, wrap) \ | ||
5339 | for ((wrap) = 0, (cpu) = (start)-1; \ | ||
5340 | (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \ | ||
5341 | (cpu) < nr_cpumask_bits; ) | ||
5342 | |||
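A usage sketch, not part of the patch: with an 8-CPU mask and start = 5, the loop below visits CPUs in the order 5, 6, 7, 0, 1, 2, 3, 4 and terminates before revisiting 5.

/* Illustrative helper showing the wrap-around scan order: */
static int example_first_idle_cpu(const struct cpumask *mask, int start)
{
        int cpu, wrap;

        for_each_cpu_wrap(cpu, mask, start, wrap) {
                if (idle_cpu(cpu))
                        return cpu;     /* first idle CPU in wrap order */
        }
        return -1;
}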
5343 | #ifdef CONFIG_SCHED_SMT | ||
5344 | |||
5345 | static inline void set_idle_cores(int cpu, int val) | ||
5346 | { | ||
5347 | struct sched_domain_shared *sds; | ||
5348 | |||
5349 | sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); | ||
5350 | if (sds) | ||
5351 | WRITE_ONCE(sds->has_idle_cores, val); | ||
5352 | } | ||
5353 | |||
5354 | static inline bool test_idle_cores(int cpu, bool def) | ||
5355 | { | ||
5356 | struct sched_domain_shared *sds; | ||
5357 | |||
5358 | sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); | ||
5359 | if (sds) | ||
5360 | return READ_ONCE(sds->has_idle_cores); | ||
5361 | |||
5362 | return def; | ||
5363 | } | ||
5364 | |||
5365 | /* | ||
5366 | * Scans the local SMT mask to see if the entire core is idle, and records this | ||
5367 | * information in sd_llc_shared->has_idle_cores. | ||
5368 | * | ||
5369 | * Since SMT siblings share all cache levels, inspecting this limited remote | ||
5370 | * state should be fairly cheap. | ||
5371 | */ | ||
5372 | void __update_idle_core(struct rq *rq) | ||
5373 | { | ||
5374 | int core = cpu_of(rq); | ||
5375 | int cpu; | ||
5376 | |||
5377 | rcu_read_lock(); | ||
5378 | if (test_idle_cores(core, true)) | ||
5379 | goto unlock; | ||
5380 | |||
5381 | for_each_cpu(cpu, cpu_smt_mask(core)) { | ||
5382 | if (cpu == core) | ||
5383 | continue; | ||
5384 | |||
5385 | if (!idle_cpu(cpu)) | ||
5386 | goto unlock; | ||
5387 | } | ||
5388 | |||
5389 | set_idle_cores(core, 1); | ||
5390 | unlock: | ||
5391 | rcu_read_unlock(); | ||
5392 | } | ||
5393 | |||
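__update_idle_core() is exported for the idle path; as an assumption (the wrapper is added to kernel/sched/sched.h by this series and is not shown here), it is reached when a CPU switches to the idle task, roughly via:

/* Assumed wrapper, for context only: */
static inline void update_idle_core(struct rq *rq)
{
        if (static_branch_unlikely(&sched_smt_present))
                __update_idle_core(rq);
}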
5394 | /* | ||
5395 | * Scan the entire LLC domain for idle cores; this dynamically switches off if | ||
5396 | * there are no idle cores left in the system; tracked through | ||
5397 | * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above. | ||
5398 | */ | ||
5399 | static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) | ||
5400 | { | ||
5401 | struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); | ||
5402 | int core, cpu, wrap; | ||
5403 | |||
5404 | if (!static_branch_likely(&sched_smt_present)) | ||
5405 | return -1; | ||
5406 | |||
5407 | if (!test_idle_cores(target, false)) | ||
5408 | return -1; | ||
5409 | |||
5410 | cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p)); | ||
5411 | |||
5412 | for_each_cpu_wrap(core, cpus, target, wrap) { | ||
5413 | bool idle = true; | ||
5414 | |||
5415 | for_each_cpu(cpu, cpu_smt_mask(core)) { | ||
5416 | cpumask_clear_cpu(cpu, cpus); | ||
5417 | if (!idle_cpu(cpu)) | ||
5418 | idle = false; | ||
5419 | } | ||
5420 | |||
5421 | if (idle) | ||
5422 | return core; | ||
5423 | } | ||
5424 | |||
5425 | /* | ||
5426 | * Failed to find an idle core; stop looking for one. | ||
5427 | */ | ||
5428 | set_idle_cores(target, 0); | ||
5429 | |||
5430 | return -1; | ||
5431 | } | ||
5432 | |||
5433 | /* | ||
5434 | * Scan the local SMT mask for idle CPUs. | ||
5269 | */ | 5435 | */ |
5270 | static int select_idle_sibling(struct task_struct *p, int target) | 5436 | static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) |
5437 | { | ||
5438 | int cpu; | ||
5439 | |||
5440 | if (!static_branch_likely(&sched_smt_present)) | ||
5441 | return -1; | ||
5442 | |||
5443 | for_each_cpu(cpu, cpu_smt_mask(target)) { | ||
5444 | if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) | ||
5445 | continue; | ||
5446 | if (idle_cpu(cpu)) | ||
5447 | return cpu; | ||
5448 | } | ||
5449 | |||
5450 | return -1; | ||
5451 | } | ||
5452 | |||
5453 | #else /* CONFIG_SCHED_SMT */ | ||
5454 | |||
5455 | static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) | ||
5456 | { | ||
5457 | return -1; | ||
5458 | } | ||
5459 | |||
5460 | static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) | ||
5461 | { | ||
5462 | return -1; | ||
5463 | } | ||
5464 | |||
5465 | #endif /* CONFIG_SCHED_SMT */ | ||
5466 | |||
5467 | /* | ||
5468 | * Scan the LLC domain for idle CPUs; this is dynamically regulated by | ||
5469 | * comparing the average scan cost (tracked in sd->avg_scan_cost) against the | ||
5470 | * average idle time for this rq (as found in rq->avg_idle). | ||
5471 | */ | ||
5472 | static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) | ||
5473 | { | ||
5474 | struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); | ||
5475 | u64 avg_idle = this_rq()->avg_idle; | ||
5476 | u64 avg_cost = this_sd->avg_scan_cost; | ||
5477 | u64 time, cost; | ||
5478 | s64 delta; | ||
5479 | int cpu, wrap; | ||
5480 | |||
5481 | /* | ||
5482 | * Due to large variance we need a large fuzz factor; hackbench in | ||
5483 | * particular is sensitive here. | ||
5484 | */ | ||
5485 | if ((avg_idle / 512) < avg_cost) | ||
5486 | return -1; | ||
5487 | |||
5488 | time = local_clock(); | ||
5489 | |||
5490 | for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) { | ||
5491 | if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) | ||
5492 | continue; | ||
5493 | if (idle_cpu(cpu)) | ||
5494 | break; | ||
5495 | } | ||
5496 | |||
5497 | time = local_clock() - time; | ||
5498 | cost = this_sd->avg_scan_cost; | ||
5499 | delta = (s64)(time - cost) / 8; | ||
5500 | this_sd->avg_scan_cost += delta; | ||
5501 | |||
5502 | return cpu; | ||
5503 | } | ||
5504 | |||
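To put numbers on the cut-off above (illustrative values only): with avg_scan_cost at 1000ns, the scan is attempted only while this rq's avg_idle is at least 512 * 1000ns = 512us, and every scan folds its measured duration back into the estimate as an exponentially weighted average:

/*
 * avg_scan_cost += (time - avg_scan_cost) / 8
 *
 * i.e. the estimate moves 1/8th of the way toward each observed scan time,
 * so a few expensive scans quickly raise the bar for future scans.
 */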
5505 | /* | ||
5506 | * Try and locate an idle core/thread in the LLC cache domain. | ||
5507 | */ | ||
5508 | static int select_idle_sibling(struct task_struct *p, int prev, int target) | ||
5271 | { | 5509 | { |
5272 | struct sched_domain *sd; | 5510 | struct sched_domain *sd; |
5273 | struct sched_group *sg; | 5511 | int i; |
5274 | int i = task_cpu(p); | ||
5275 | 5512 | ||
5276 | if (idle_cpu(target)) | 5513 | if (idle_cpu(target)) |
5277 | return target; | 5514 | return target; |
5278 | 5515 | ||
5279 | /* | 5516 | /* |
5280 | * If the prevous cpu is cache affine and idle, don't be stupid. | 5517 | * If the previous cpu is cache affine and idle, don't be stupid. |
5281 | */ | 5518 | */ |
5282 | if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) | 5519 | if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) |
5283 | return i; | 5520 | return prev; |
5284 | 5521 | ||
5285 | /* | ||
5286 | * Otherwise, iterate the domains and find an eligible idle cpu. | ||
5287 | * | ||
5288 | * A completely idle sched group at higher domains is more | ||
5289 | * desirable than an idle group at a lower level, because lower | ||
5290 | * domains have smaller groups and usually share hardware | ||
5291 | * resources which causes tasks to contend on them, e.g. x86 | ||
5292 | * hyperthread siblings in the lowest domain (SMT) can contend | ||
5293 | * on the shared cpu pipeline. | ||
5294 | * | ||
5295 | * However, while we prefer idle groups at higher domains | ||
5296 | * finding an idle cpu at the lowest domain is still better than | ||
5297 | * returning 'target', which we've already established, isn't | ||
5298 | * idle. | ||
5299 | */ | ||
5300 | sd = rcu_dereference(per_cpu(sd_llc, target)); | 5522 | sd = rcu_dereference(per_cpu(sd_llc, target)); |
5301 | for_each_lower_domain(sd) { | 5523 | if (!sd) |
5302 | sg = sd->groups; | 5524 | return target; |
5303 | do { | 5525 | |
5304 | if (!cpumask_intersects(sched_group_cpus(sg), | 5526 | i = select_idle_core(p, sd, target); |
5305 | tsk_cpus_allowed(p))) | 5527 | if ((unsigned)i < nr_cpumask_bits) |
5306 | goto next; | 5528 | return i; |
5307 | 5529 | ||
5308 | /* Ensure the entire group is idle */ | 5530 | i = select_idle_cpu(p, sd, target); |
5309 | for_each_cpu(i, sched_group_cpus(sg)) { | 5531 | if ((unsigned)i < nr_cpumask_bits) |
5310 | if (i == target || !idle_cpu(i)) | 5532 | return i; |
5311 | goto next; | 5533 | |
5312 | } | 5534 | i = select_idle_smt(p, sd, target); |
5535 | if ((unsigned)i < nr_cpumask_bits) | ||
5536 | return i; | ||
5313 | 5537 | ||
5314 | /* | ||
5315 | * It doesn't matter which cpu we pick, the | ||
5316 | * whole group is idle. | ||
5317 | */ | ||
5318 | target = cpumask_first_and(sched_group_cpus(sg), | ||
5319 | tsk_cpus_allowed(p)); | ||
5320 | goto done; | ||
5321 | next: | ||
5322 | sg = sg->next; | ||
5323 | } while (sg != sd->groups); | ||
5324 | } | ||
5325 | done: | ||
5326 | return target; | 5538 | return target; |
5327 | } | 5539 | } |
5328 | 5540 | ||
@@ -5360,6 +5572,32 @@ static int cpu_util(int cpu) | |||
5360 | return (util >= capacity) ? capacity : util; | 5572 | return (util >= capacity) ? capacity : util; |
5361 | } | 5573 | } |
5362 | 5574 | ||
5575 | static inline int task_util(struct task_struct *p) | ||
5576 | { | ||
5577 | return p->se.avg.util_avg; | ||
5578 | } | ||
5579 | |||
5580 | /* | ||
5581 | * Disable WAKE_AFFINE in the case where task @p doesn't fit in the | ||
5582 | * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. | ||
5583 | * | ||
5584 | * In that case WAKE_AFFINE doesn't make sense and we'll let | ||
5585 | * BALANCE_WAKE sort things out. | ||
5586 | */ | ||
5587 | static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) | ||
5588 | { | ||
5589 | long min_cap, max_cap; | ||
5590 | |||
5591 | min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); | ||
5592 | max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; | ||
5593 | |||
5594 | /* Minimum capacity is close to max, no need to abort wake_affine */ | ||
5595 | if (max_cap - min_cap < max_cap >> 3) | ||
5596 | return 0; | ||
5597 | |||
5598 | return min_cap * 1024 < task_util(p) * capacity_margin; | ||
5599 | } | ||
5600 | |||
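A worked example with illustrative big.LITTLE capacities, showing how the two thresholds above interact:

/*
 * Example (illustrative numbers): the waking and previous CPUs span a big
 * CPU of capacity 1024 and a LITTLE CPU of capacity 430, so min_cap = 430.
 *
 *   max_cap - min_cap = 594 >= (max_cap >> 3) = 128  -> the check applies
 *   wake_affine stays allowed while: task_util(p) * 1280 <= 430 * 1024,
 *   i.e. task_util(p) <= ~344 (~80% of the LITTLE CPU's capacity).
 *
 * On a symmetric system min_cap == max_cap, the first test short-circuits
 * and wake_cap() always returns 0.
 */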
5363 | /* | 5601 | /* |
5364 | * select_task_rq_fair: Select target runqueue for the waking task in domains | 5602 | * select_task_rq_fair: Select target runqueue for the waking task in domains |
5365 | * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, | 5603 | * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, |
@@ -5383,7 +5621,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
5383 | 5621 | ||
5384 | if (sd_flag & SD_BALANCE_WAKE) { | 5622 | if (sd_flag & SD_BALANCE_WAKE) { |
5385 | record_wakee(p); | 5623 | record_wakee(p); |
5386 | want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); | 5624 | want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) |
5625 | && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); | ||
5387 | } | 5626 | } |
5388 | 5627 | ||
5389 | rcu_read_lock(); | 5628 | rcu_read_lock(); |
@@ -5409,13 +5648,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
5409 | 5648 | ||
5410 | if (affine_sd) { | 5649 | if (affine_sd) { |
5411 | sd = NULL; /* Prefer wake_affine over balance flags */ | 5650 | sd = NULL; /* Prefer wake_affine over balance flags */ |
5412 | if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) | 5651 | if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync)) |
5413 | new_cpu = cpu; | 5652 | new_cpu = cpu; |
5414 | } | 5653 | } |
5415 | 5654 | ||
5416 | if (!sd) { | 5655 | if (!sd) { |
5417 | if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ | 5656 | if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ |
5418 | new_cpu = select_idle_sibling(p, new_cpu); | 5657 | new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); |
5419 | 5658 | ||
5420 | } else while (sd) { | 5659 | } else while (sd) { |
5421 | struct sched_group *group; | 5660 | struct sched_group *group; |
@@ -5939,7 +6178,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
5939 | * | 6178 | * |
5940 | * The adjacency matrix of the resulting graph is given by: | 6179 | * The adjacency matrix of the resulting graph is given by: |
5941 | * | 6180 | * |
5942 | * log_2 n | 6181 | * log_2 n |
5943 | * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) | 6182 | * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) |
5944 | * k = 0 | 6183 | * k = 0 |
5945 | * | 6184 | * |
@@ -5985,7 +6224,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
5985 | * | 6224 | * |
5986 | * [XXX write more on how we solve this.. _after_ merging pjt's patches that | 6225 | * [XXX write more on how we solve this.. _after_ merging pjt's patches that |
5987 | * rewrite all of this once again.] | 6226 | * rewrite all of this once again.] |
5988 | */ | 6227 | */ |
5989 | 6228 | ||
5990 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; | 6229 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; |
5991 | 6230 | ||
@@ -6133,7 +6372,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
6133 | if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { | 6372 | if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { |
6134 | int cpu; | 6373 | int cpu; |
6135 | 6374 | ||
6136 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); | 6375 | schedstat_inc(p->se.statistics.nr_failed_migrations_affine); |
6137 | 6376 | ||
6138 | env->flags |= LBF_SOME_PINNED; | 6377 | env->flags |= LBF_SOME_PINNED; |
6139 | 6378 | ||
@@ -6164,7 +6403,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
6164 | env->flags &= ~LBF_ALL_PINNED; | 6403 | env->flags &= ~LBF_ALL_PINNED; |
6165 | 6404 | ||
6166 | if (task_running(env->src_rq, p)) { | 6405 | if (task_running(env->src_rq, p)) { |
6167 | schedstat_inc(p, se.statistics.nr_failed_migrations_running); | 6406 | schedstat_inc(p->se.statistics.nr_failed_migrations_running); |
6168 | return 0; | 6407 | return 0; |
6169 | } | 6408 | } |
6170 | 6409 | ||
@@ -6181,13 +6420,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
6181 | if (tsk_cache_hot <= 0 || | 6420 | if (tsk_cache_hot <= 0 || |
6182 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { | 6421 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { |
6183 | if (tsk_cache_hot == 1) { | 6422 | if (tsk_cache_hot == 1) { |
6184 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); | 6423 | schedstat_inc(env->sd->lb_hot_gained[env->idle]); |
6185 | schedstat_inc(p, se.statistics.nr_forced_migrations); | 6424 | schedstat_inc(p->se.statistics.nr_forced_migrations); |
6186 | } | 6425 | } |
6187 | return 1; | 6426 | return 1; |
6188 | } | 6427 | } |
6189 | 6428 | ||
6190 | schedstat_inc(p, se.statistics.nr_failed_migrations_hot); | 6429 | schedstat_inc(p->se.statistics.nr_failed_migrations_hot); |
6191 | return 0; | 6430 | return 0; |
6192 | } | 6431 | } |
6193 | 6432 | ||
@@ -6227,7 +6466,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) | |||
6227 | * so we can safely collect stats here rather than | 6466 | * so we can safely collect stats here rather than |
6228 | * inside detach_tasks(). | 6467 | * inside detach_tasks(). |
6229 | */ | 6468 | */ |
6230 | schedstat_inc(env->sd, lb_gained[env->idle]); | 6469 | schedstat_inc(env->sd->lb_gained[env->idle]); |
6231 | return p; | 6470 | return p; |
6232 | } | 6471 | } |
6233 | return NULL; | 6472 | return NULL; |
@@ -6319,7 +6558,7 @@ next: | |||
6319 | * so we can safely collect detach_one_task() stats here rather | 6558 | * so we can safely collect detach_one_task() stats here rather |
6320 | * than inside detach_one_task(). | 6559 | * than inside detach_one_task(). |
6321 | */ | 6560 | */ |
6322 | schedstat_add(env->sd, lb_gained[env->idle], detached); | 6561 | schedstat_add(env->sd->lb_gained[env->idle], detached); |
6323 | 6562 | ||
6324 | return detached; | 6563 | return detached; |
6325 | } | 6564 | } |
@@ -6647,7 +6886,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
6647 | /* | 6886 | /* |
6648 | * !SD_OVERLAP domains can assume that child groups | 6887 | * !SD_OVERLAP domains can assume that child groups |
6649 | * span the current group. | 6888 | * span the current group. |
6650 | */ | 6889 | */ |
6651 | 6890 | ||
6652 | group = child->groups; | 6891 | group = child->groups; |
6653 | do { | 6892 | do { |
@@ -7147,7 +7386,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
7147 | load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; | 7386 | load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; |
7148 | if (load_above_capacity > busiest->group_capacity) { | 7387 | if (load_above_capacity > busiest->group_capacity) { |
7149 | load_above_capacity -= busiest->group_capacity; | 7388 | load_above_capacity -= busiest->group_capacity; |
7150 | load_above_capacity *= NICE_0_LOAD; | 7389 | load_above_capacity *= scale_load_down(NICE_0_LOAD); |
7151 | load_above_capacity /= busiest->group_capacity; | 7390 | load_above_capacity /= busiest->group_capacity; |
7152 | } else | 7391 | } else |
7153 | load_above_capacity = ~0UL; | 7392 | load_above_capacity = ~0UL; |
@@ -7354,9 +7593,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
7354 | */ | 7593 | */ |
7355 | #define MAX_PINNED_INTERVAL 512 | 7594 | #define MAX_PINNED_INTERVAL 512 |
7356 | 7595 | ||
7357 | /* Working cpumask for load_balance and load_balance_newidle. */ | ||
7358 | DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); | ||
7359 | |||
7360 | static int need_active_balance(struct lb_env *env) | 7596 | static int need_active_balance(struct lb_env *env) |
7361 | { | 7597 | { |
7362 | struct sched_domain *sd = env->sd; | 7598 | struct sched_domain *sd = env->sd; |
@@ -7460,7 +7696,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
7460 | 7696 | ||
7461 | cpumask_copy(cpus, cpu_active_mask); | 7697 | cpumask_copy(cpus, cpu_active_mask); |
7462 | 7698 | ||
7463 | schedstat_inc(sd, lb_count[idle]); | 7699 | schedstat_inc(sd->lb_count[idle]); |
7464 | 7700 | ||
7465 | redo: | 7701 | redo: |
7466 | if (!should_we_balance(&env)) { | 7702 | if (!should_we_balance(&env)) { |
@@ -7470,19 +7706,19 @@ redo: | |||
7470 | 7706 | ||
7471 | group = find_busiest_group(&env); | 7707 | group = find_busiest_group(&env); |
7472 | if (!group) { | 7708 | if (!group) { |
7473 | schedstat_inc(sd, lb_nobusyg[idle]); | 7709 | schedstat_inc(sd->lb_nobusyg[idle]); |
7474 | goto out_balanced; | 7710 | goto out_balanced; |
7475 | } | 7711 | } |
7476 | 7712 | ||
7477 | busiest = find_busiest_queue(&env, group); | 7713 | busiest = find_busiest_queue(&env, group); |
7478 | if (!busiest) { | 7714 | if (!busiest) { |
7479 | schedstat_inc(sd, lb_nobusyq[idle]); | 7715 | schedstat_inc(sd->lb_nobusyq[idle]); |
7480 | goto out_balanced; | 7716 | goto out_balanced; |
7481 | } | 7717 | } |
7482 | 7718 | ||
7483 | BUG_ON(busiest == env.dst_rq); | 7719 | BUG_ON(busiest == env.dst_rq); |
7484 | 7720 | ||
7485 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); | 7721 | schedstat_add(sd->lb_imbalance[idle], env.imbalance); |
7486 | 7722 | ||
7487 | env.src_cpu = busiest->cpu; | 7723 | env.src_cpu = busiest->cpu; |
7488 | env.src_rq = busiest; | 7724 | env.src_rq = busiest; |
@@ -7589,7 +7825,7 @@ more_balance: | |||
7589 | } | 7825 | } |
7590 | 7826 | ||
7591 | if (!ld_moved) { | 7827 | if (!ld_moved) { |
7592 | schedstat_inc(sd, lb_failed[idle]); | 7828 | schedstat_inc(sd->lb_failed[idle]); |
7593 | /* | 7829 | /* |
7594 | * Increment the failure counter only on periodic balance. | 7830 | * Increment the failure counter only on periodic balance. |
7595 | * We do not want newidle balance, which can be very | 7831 | * We do not want newidle balance, which can be very |
@@ -7672,7 +7908,7 @@ out_all_pinned: | |||
7672 | * we can't migrate them. Let the imbalance flag set so parent level | 7908 | * we can't migrate them. Let the imbalance flag set so parent level |
7673 | * can try to migrate them. | 7909 | * can try to migrate them. |
7674 | */ | 7910 | */ |
7675 | schedstat_inc(sd, lb_balanced[idle]); | 7911 | schedstat_inc(sd->lb_balanced[idle]); |
7676 | 7912 | ||
7677 | sd->nr_balance_failed = 0; | 7913 | sd->nr_balance_failed = 0; |
7678 | 7914 | ||
@@ -7704,11 +7940,12 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) | |||
7704 | } | 7940 | } |
7705 | 7941 | ||
7706 | static inline void | 7942 | static inline void |
7707 | update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) | 7943 | update_next_balance(struct sched_domain *sd, unsigned long *next_balance) |
7708 | { | 7944 | { |
7709 | unsigned long interval, next; | 7945 | unsigned long interval, next; |
7710 | 7946 | ||
7711 | interval = get_sd_balance_interval(sd, cpu_busy); | 7947 | /* used by idle balance, so cpu_busy = 0 */ |
7948 | interval = get_sd_balance_interval(sd, 0); | ||
7712 | next = sd->last_balance + interval; | 7949 | next = sd->last_balance + interval; |
7713 | 7950 | ||
7714 | if (time_after(*next_balance, next)) | 7951 | if (time_after(*next_balance, next)) |
@@ -7738,7 +7975,7 @@ static int idle_balance(struct rq *this_rq) | |||
7738 | rcu_read_lock(); | 7975 | rcu_read_lock(); |
7739 | sd = rcu_dereference_check_sched_domain(this_rq->sd); | 7976 | sd = rcu_dereference_check_sched_domain(this_rq->sd); |
7740 | if (sd) | 7977 | if (sd) |
7741 | update_next_balance(sd, 0, &next_balance); | 7978 | update_next_balance(sd, &next_balance); |
7742 | rcu_read_unlock(); | 7979 | rcu_read_unlock(); |
7743 | 7980 | ||
7744 | goto out; | 7981 | goto out; |
@@ -7756,7 +7993,7 @@ static int idle_balance(struct rq *this_rq) | |||
7756 | continue; | 7993 | continue; |
7757 | 7994 | ||
7758 | if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { | 7995 | if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { |
7759 | update_next_balance(sd, 0, &next_balance); | 7996 | update_next_balance(sd, &next_balance); |
7760 | break; | 7997 | break; |
7761 | } | 7998 | } |
7762 | 7999 | ||
@@ -7774,7 +8011,7 @@ static int idle_balance(struct rq *this_rq) | |||
7774 | curr_cost += domain_cost; | 8011 | curr_cost += domain_cost; |
7775 | } | 8012 | } |
7776 | 8013 | ||
7777 | update_next_balance(sd, 0, &next_balance); | 8014 | update_next_balance(sd, &next_balance); |
7778 | 8015 | ||
7779 | /* | 8016 | /* |
7780 | * Stop searching for tasks to pull if there are | 8017 | * Stop searching for tasks to pull if there are |
@@ -7864,15 +8101,15 @@ static int active_load_balance_cpu_stop(void *data) | |||
7864 | .idle = CPU_IDLE, | 8101 | .idle = CPU_IDLE, |
7865 | }; | 8102 | }; |
7866 | 8103 | ||
7867 | schedstat_inc(sd, alb_count); | 8104 | schedstat_inc(sd->alb_count); |
7868 | 8105 | ||
7869 | p = detach_one_task(&env); | 8106 | p = detach_one_task(&env); |
7870 | if (p) { | 8107 | if (p) { |
7871 | schedstat_inc(sd, alb_pushed); | 8108 | schedstat_inc(sd->alb_pushed); |
7872 | /* Active balancing done, reset the failure counter. */ | 8109 | /* Active balancing done, reset the failure counter. */ |
7873 | sd->nr_balance_failed = 0; | 8110 | sd->nr_balance_failed = 0; |
7874 | } else { | 8111 | } else { |
7875 | schedstat_inc(sd, alb_failed); | 8112 | schedstat_inc(sd->alb_failed); |
7876 | } | 8113 | } |
7877 | } | 8114 | } |
7878 | rcu_read_unlock(); | 8115 | rcu_read_unlock(); |
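Editor's note: the active-balance stop handler above counts each attempt, records whether detach_one_task() actually found something to push, and on success clears the domain's failure streak. A toy sketch of that outcome accounting; detach_one_task(), the stats struct and main() are stand-ins, not the kernel's types.

/* Sketch of the active-balance accounting visible above: try to detach
 * exactly one task; on success count it as pushed and reset the failure
 * streak, otherwise count a failed attempt.
 */
#include <stdio.h>

struct task { int pid; };

struct domain_stats {
	unsigned long alb_count;	/* active-balance attempts */
	unsigned long alb_pushed;	/* attempts that moved a task */
	unsigned long alb_failed;	/* attempts that found nothing to move */
	unsigned int  nr_balance_failed;
};

/* Stand-in: pretend the busiest runqueue had one migratable task. */
static struct task *detach_one_task(void)
{
	static struct task t = { .pid = 42 };
	return &t;
}

static void active_balance(struct domain_stats *sd)
{
	struct task *p;

	sd->alb_count++;

	p = detach_one_task();
	if (p) {
		sd->alb_pushed++;
		sd->nr_balance_failed = 0;	/* success resets the streak */
	} else {
		sd->alb_failed++;
	}
}

int main(void)
{
	struct domain_stats sd = { .nr_balance_failed = 3 };

	active_balance(&sd);
	printf("pushed=%lu failed=%lu streak=%u\n",
	       sd.alb_pushed, sd.alb_failed, sd.nr_balance_failed);
	return 0;
}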
@@ -7964,13 +8201,13 @@ static inline void set_cpu_sd_state_busy(void) | |||
7964 | int cpu = smp_processor_id(); | 8201 | int cpu = smp_processor_id(); |
7965 | 8202 | ||
7966 | rcu_read_lock(); | 8203 | rcu_read_lock(); |
7967 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 8204 | sd = rcu_dereference(per_cpu(sd_llc, cpu)); |
7968 | 8205 | ||
7969 | if (!sd || !sd->nohz_idle) | 8206 | if (!sd || !sd->nohz_idle) |
7970 | goto unlock; | 8207 | goto unlock; |
7971 | sd->nohz_idle = 0; | 8208 | sd->nohz_idle = 0; |
7972 | 8209 | ||
7973 | atomic_inc(&sd->groups->sgc->nr_busy_cpus); | 8210 | atomic_inc(&sd->shared->nr_busy_cpus); |
7974 | unlock: | 8211 | unlock: |
7975 | rcu_read_unlock(); | 8212 | rcu_read_unlock(); |
7976 | } | 8213 | } |
@@ -7981,13 +8218,13 @@ void set_cpu_sd_state_idle(void) | |||
7981 | int cpu = smp_processor_id(); | 8218 | int cpu = smp_processor_id(); |
7982 | 8219 | ||
7983 | rcu_read_lock(); | 8220 | rcu_read_lock(); |
7984 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 8221 | sd = rcu_dereference(per_cpu(sd_llc, cpu)); |
7985 | 8222 | ||
7986 | if (!sd || sd->nohz_idle) | 8223 | if (!sd || sd->nohz_idle) |
7987 | goto unlock; | 8224 | goto unlock; |
7988 | sd->nohz_idle = 1; | 8225 | sd->nohz_idle = 1; |
7989 | 8226 | ||
7990 | atomic_dec(&sd->groups->sgc->nr_busy_cpus); | 8227 | atomic_dec(&sd->shared->nr_busy_cpus); |
7991 | unlock: | 8228 | unlock: |
7992 | rcu_read_unlock(); | 8229 | rcu_read_unlock(); |
7993 | } | 8230 | } |
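Editor's note: the two hunks above move the NOHZ busy-CPU count from the LLC group's sgc->nr_busy_cpus to the domain-shared sd->shared->nr_busy_cpus, looked up through sd_llc rather than the dropped sd_busy pointer. Below is a userspace sketch of the bookkeeping pattern itself, assuming a per-CPU nohz_idle flag keeps the atomic updates idempotent; the structure names are simplified stand-ins.

/* Sketch of the nr_busy_cpus bookkeeping: an atomic counter in a structure
 * shared by all CPUs of the LLC domain, guarded by a per-CPU nohz_idle flag
 * so repeated busy/idle notifications do not double count.
 */
#include <stdatomic.h>
#include <stdio.h>

struct sched_domain_shared {
	atomic_int nr_busy_cpus;	/* CPUs in this LLC not in NOHZ idle */
};

struct cpu_domain {
	int nohz_idle;				/* has this CPU announced idle? */
	struct sched_domain_shared *shared;	/* shared by the whole LLC */
};

static void set_cpu_sd_state_busy(struct cpu_domain *sd)
{
	if (!sd || !sd->nohz_idle)
		return;				/* already counted as busy */
	sd->nohz_idle = 0;
	atomic_fetch_add(&sd->shared->nr_busy_cpus, 1);
}

static void set_cpu_sd_state_idle(struct cpu_domain *sd)
{
	if (!sd || sd->nohz_idle)
		return;				/* already counted as idle */
	sd->nohz_idle = 1;
	atomic_fetch_sub(&sd->shared->nr_busy_cpus, 1);
}

int main(void)
{
	struct sched_domain_shared llc = { .nr_busy_cpus = 2 };
	struct cpu_domain cpu0 = { .nohz_idle = 0, .shared = &llc };

	set_cpu_sd_state_idle(&cpu0);
	set_cpu_sd_state_idle(&cpu0);	/* idempotent: the flag short-circuits */
	printf("busy cpus: %d\n", atomic_load(&llc.nr_busy_cpus));	/* 1 */

	set_cpu_sd_state_busy(&cpu0);
	printf("busy cpus: %d\n", atomic_load(&llc.nr_busy_cpus));	/* 2 */
	return 0;
}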
@@ -8214,8 +8451,8 @@ end: | |||
8214 | static inline bool nohz_kick_needed(struct rq *rq) | 8451 | static inline bool nohz_kick_needed(struct rq *rq) |
8215 | { | 8452 | { |
8216 | unsigned long now = jiffies; | 8453 | unsigned long now = jiffies; |
8454 | struct sched_domain_shared *sds; | ||
8217 | struct sched_domain *sd; | 8455 | struct sched_domain *sd; |
8218 | struct sched_group_capacity *sgc; | ||
8219 | int nr_busy, cpu = rq->cpu; | 8456 | int nr_busy, cpu = rq->cpu; |
8220 | bool kick = false; | 8457 | bool kick = false; |
8221 | 8458 | ||
@@ -8243,11 +8480,13 @@ static inline bool nohz_kick_needed(struct rq *rq) | |||
8243 | return true; | 8480 | return true; |
8244 | 8481 | ||
8245 | rcu_read_lock(); | 8482 | rcu_read_lock(); |
8246 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 8483 | sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); |
8247 | if (sd) { | 8484 | if (sds) { |
8248 | sgc = sd->groups->sgc; | 8485 | /* |
8249 | nr_busy = atomic_read(&sgc->nr_busy_cpus); | 8486 | * XXX: write a coherent comment on why we do this. |
8250 | 8487 | * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com | |
8488 | */ | ||
8489 | nr_busy = atomic_read(&sds->nr_busy_cpus); | ||
8251 | if (nr_busy > 1) { | 8490 | if (nr_busy > 1) { |
8252 | kick = true; | 8491 | kick = true; |
8253 | goto unlock; | 8492 | goto unlock; |
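Editor's note: with the counter living in the shared LLC structure, the kick test above only dereferences sd_llc_shared instead of walking sd->groups->sgc. A short continuation of the previous sketch, showing the nr_busy > 1 decision in isolation; the real nohz_kick_needed() checks several other conditions first, so only the fragment visible in the hunk is modelled here.

/* Reader side of the same counter: more than one busy CPU under a single
 * LLC suggests kicking a nohz-idle CPU to run the balancer.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct sched_domain_shared {
	atomic_int nr_busy_cpus;
};

static bool nohz_kick_needed(struct sched_domain_shared *sds)
{
	if (!sds)
		return false;	/* no LLC sharing info for this CPU */

	return atomic_load(&sds->nr_busy_cpus) > 1;
}

int main(void)
{
	struct sched_domain_shared sds = { .nr_busy_cpus = 3 };

	printf("kick needed: %d\n", nohz_kick_needed(&sds));	/* 1 */
	return 0;
}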
@@ -8441,7 +8680,6 @@ static void detach_task_cfs_rq(struct task_struct *p) | |||
8441 | struct sched_entity *se = &p->se; | 8680 | struct sched_entity *se = &p->se; |
8442 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 8681 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
8443 | u64 now = cfs_rq_clock_task(cfs_rq); | 8682 | u64 now = cfs_rq_clock_task(cfs_rq); |
8444 | int tg_update; | ||
8445 | 8683 | ||
8446 | if (!vruntime_normalized(p)) { | 8684 | if (!vruntime_normalized(p)) { |
8447 | /* | 8685 | /* |
@@ -8453,10 +8691,9 @@ static void detach_task_cfs_rq(struct task_struct *p) | |||
8453 | } | 8691 | } |
8454 | 8692 | ||
8455 | /* Catch up with the cfs_rq and remove our load when we leave */ | 8693 | /* Catch up with the cfs_rq and remove our load when we leave */ |
8456 | tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); | 8694 | update_cfs_rq_load_avg(now, cfs_rq, false); |
8457 | detach_entity_load_avg(cfs_rq, se); | 8695 | detach_entity_load_avg(cfs_rq, se); |
8458 | if (tg_update) | 8696 | update_tg_load_avg(cfs_rq, false); |
8459 | update_tg_load_avg(cfs_rq, false); | ||
8460 | } | 8697 | } |
8461 | 8698 | ||
8462 | static void attach_task_cfs_rq(struct task_struct *p) | 8699 | static void attach_task_cfs_rq(struct task_struct *p) |
@@ -8464,7 +8701,6 @@ static void attach_task_cfs_rq(struct task_struct *p) | |||
8464 | struct sched_entity *se = &p->se; | 8701 | struct sched_entity *se = &p->se; |
8465 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 8702 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
8466 | u64 now = cfs_rq_clock_task(cfs_rq); | 8703 | u64 now = cfs_rq_clock_task(cfs_rq); |
8467 | int tg_update; | ||
8468 | 8704 | ||
8469 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8705 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8470 | /* | 8706 | /* |
@@ -8475,10 +8711,9 @@ static void attach_task_cfs_rq(struct task_struct *p) | |||
8475 | #endif | 8711 | #endif |
8476 | 8712 | ||
8477 | /* Synchronize task with its cfs_rq */ | 8713 | /* Synchronize task with its cfs_rq */ |
8478 | tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); | 8714 | update_cfs_rq_load_avg(now, cfs_rq, false); |
8479 | attach_entity_load_avg(cfs_rq, se); | 8715 | attach_entity_load_avg(cfs_rq, se); |
8480 | if (tg_update) | 8716 | update_tg_load_avg(cfs_rq, false); |
8481 | update_tg_load_avg(cfs_rq, false); | ||
8482 | 8717 | ||
8483 | if (!vruntime_normalized(p)) | 8718 | if (!vruntime_normalized(p)) |
8484 | se->vruntime += cfs_rq->min_vruntime; | 8719 | se->vruntime += cfs_rq->min_vruntime; |