Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--  kernel/sched/fair.c | 418
1 file changed, 189 insertions(+), 229 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7c6414fc669d..0d97ebdc58f0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -416,8 +416,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-                                   unsigned long delta_exec);
+static __always_inline
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
 
 /**************************************************************
  * Scheduling class tree data structure manipulation methods:
@@ -776,29 +776,16 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-static void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-        cfs_rq->task_weight += weight;
-}
-#else
-static inline void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-}
-#endif
-
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
         update_load_add(&cfs_rq->load, se->load.weight);
         if (!parent_entity(se))
                 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
-        if (entity_is_task(se)) {
-                add_cfs_task_weight(cfs_rq, se->load.weight);
-                list_add(&se->group_node, &cfs_rq->tasks);
-        }
+#ifdef CONFIG_SMP
+        if (entity_is_task(se))
+                list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+#endif
         cfs_rq->nr_running++;
 }
 
@@ -808,10 +795,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
         update_load_sub(&cfs_rq->load, se->load.weight);
         if (!parent_entity(se))
                 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
-        if (entity_is_task(se)) {
-                add_cfs_task_weight(cfs_rq, -se->load.weight);
+        if (entity_is_task(se))
                 list_del_init(&se->group_node);
-        }
         cfs_rq->nr_running--;
 }
 
@@ -1003,6 +988,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
                 if (unlikely(delta > se->statistics.sleep_max))
                         se->statistics.sleep_max = delta;
 
+                se->statistics.sleep_start = 0;
                 se->statistics.sum_sleep_runtime += delta;
 
                 if (tsk) {
@@ -1019,6 +1005,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
                 if (unlikely(delta > se->statistics.block_max))
                         se->statistics.block_max = delta;
 
+                se->statistics.block_start = 0;
                 se->statistics.sum_sleep_runtime += delta;
 
                 if (tsk) {
@@ -1175,7 +1162,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
                 __clear_buddies_skip(se);
 }
 
-static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
 
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
@@ -1399,20 +1386,20 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 #ifdef CONFIG_CFS_BANDWIDTH
 
 #ifdef HAVE_JUMP_LABEL
-static struct jump_label_key __cfs_bandwidth_used;
+static struct static_key __cfs_bandwidth_used;
 
 static inline bool cfs_bandwidth_used(void)
 {
-        return static_branch(&__cfs_bandwidth_used);
+        return static_key_false(&__cfs_bandwidth_used);
 }
 
 void account_cfs_bandwidth_used(int enabled, int was_enabled)
 {
         /* only need to count groups transitioning between enabled/!enabled */
         if (enabled && !was_enabled)
-                jump_label_inc(&__cfs_bandwidth_used);
+                static_key_slow_inc(&__cfs_bandwidth_used);
         else if (!enabled && was_enabled)
-                jump_label_dec(&__cfs_bandwidth_used);
+                static_key_slow_dec(&__cfs_bandwidth_used);
 }
 #else /* HAVE_JUMP_LABEL */
 static bool cfs_bandwidth_used(void)
@@ -1559,8 +1546,8 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
                 resched_task(rq_of(cfs_rq)->curr);
 }
 
-static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-                                                   unsigned long delta_exec)
+static __always_inline
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
 {
         if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
                 return;
@@ -2086,11 +2073,11 @@ void unthrottle_offline_cfs_rqs(struct rq *rq)
 }
 
 #else /* CONFIG_CFS_BANDWIDTH */
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-                                   unsigned long delta_exec) {}
+static __always_inline
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {}
 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
-static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 
 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 {
@@ -2670,8 +2657,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
         /*
          * Otherwise, iterate the domains and find an elegible idle cpu.
          */
-        rcu_read_lock();
-
         sd = rcu_dereference(per_cpu(sd_llc, target));
         for_each_lower_domain(sd) {
                 sg = sd->groups;
@@ -2693,8 +2678,6 @@ next:
                 } while (sg != sd->groups);
         }
 done:
-        rcu_read_unlock();
-
         return target;
 }
 
@@ -2920,7 +2903,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
                 return;
 
         /*
-         * This is possible from callers such as pull_task(), in which we
+         * This is possible from callers such as move_task(), in which we
          * unconditionally check_prempt_curr() after an enqueue (which may have
          * lead to a throttle). This both saves work and prevents false
          * next-buddy nomination below.
@@ -3084,17 +3067,39 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
  * Fair scheduling class load-balancing methods:
  */
 
+static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+
+#define LBF_ALL_PINNED  0x01
+#define LBF_NEED_BREAK  0x02
+
+struct lb_env {
+        struct sched_domain     *sd;
+
+        int                     src_cpu;
+        struct rq               *src_rq;
+
+        int                     dst_cpu;
+        struct rq               *dst_rq;
+
+        enum cpu_idle_type      idle;
+        long                    load_move;
+        unsigned int            flags;
+
+        unsigned int            loop;
+        unsigned int            loop_break;
+        unsigned int            loop_max;
+};
+
 /*
- * pull_task - move a task from a remote runqueue to the local runqueue.
+ * move_task - move a task from one runqueue to another runqueue.
  * Both runqueues must be locked.
  */
-static void pull_task(struct rq *src_rq, struct task_struct *p,
-                      struct rq *this_rq, int this_cpu)
+static void move_task(struct task_struct *p, struct lb_env *env)
 {
-        deactivate_task(src_rq, p, 0);
-        set_task_cpu(p, this_cpu);
-        activate_task(this_rq, p, 0);
-        check_preempt_curr(this_rq, p, 0);
+        deactivate_task(env->src_rq, p, 0);
+        set_task_cpu(p, env->dst_cpu);
+        activate_task(env->dst_rq, p, 0);
+        check_preempt_curr(env->dst_rq, p, 0);
 }
 
 /*
@@ -3129,19 +3134,11 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
         return delta < (s64)sysctl_sched_migration_cost;
 }
 
-#define LBF_ALL_PINNED  0x01
-#define LBF_NEED_BREAK  0x02    /* clears into HAD_BREAK */
-#define LBF_HAD_BREAK   0x04
-#define LBF_HAD_BREAKS  0x0C    /* count HAD_BREAKs overflows into ABORT */
-#define LBF_ABORT       0x10
-
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
 static
-int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
-                     struct sched_domain *sd, enum cpu_idle_type idle,
-                     int *lb_flags)
+int can_migrate_task(struct task_struct *p, struct lb_env *env)
 {
         int tsk_cache_hot = 0;
         /*
@@ -3150,13 +3147,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
          * 2) cannot be migrated to this CPU due to cpus_allowed, or
          * 3) are cache-hot on their current CPU.
          */
-        if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) {
+        if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
                 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
                 return 0;
         }
-        *lb_flags &= ~LBF_ALL_PINNED;
+        env->flags &= ~LBF_ALL_PINNED;
 
-        if (task_running(rq, p)) {
+        if (task_running(env->src_rq, p)) {
                 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
                 return 0;
         }
@@ -3167,12 +3164,12 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
          * 2) too many balance attempts have failed.
          */
 
-        tsk_cache_hot = task_hot(p, rq->clock_task, sd);
+        tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
         if (!tsk_cache_hot ||
-                sd->nr_balance_failed > sd->cache_nice_tries) {
+                env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
 #ifdef CONFIG_SCHEDSTATS
                 if (tsk_cache_hot) {
-                        schedstat_inc(sd, lb_hot_gained[idle]);
+                        schedstat_inc(env->sd, lb_hot_gained[env->idle]);
                         schedstat_inc(p, se.statistics.nr_forced_migrations);
                 }
 #endif
@@ -3193,65 +3190,80 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
  *
  * Called with both runqueues locked.
  */
-static int
-move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
-              struct sched_domain *sd, enum cpu_idle_type idle)
+static int move_one_task(struct lb_env *env)
 {
         struct task_struct *p, *n;
-        struct cfs_rq *cfs_rq;
-        int pinned = 0;
 
-        for_each_leaf_cfs_rq(busiest, cfs_rq) {
-                list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
-                        if (throttled_lb_pair(task_group(p),
-                                              busiest->cpu, this_cpu))
-                                break;
+        list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+                if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
+                        continue;
 
-                        if (!can_migrate_task(p, busiest, this_cpu,
-                                              sd, idle, &pinned))
-                                continue;
+                if (!can_migrate_task(p, env))
+                        continue;
 
-                        pull_task(busiest, p, this_rq, this_cpu);
+                move_task(p, env);
                 /*
-                 * Right now, this is only the second place pull_task()
-                 * is called, so we can safely collect pull_task()
-                 * stats here rather than inside pull_task().
+                 * Right now, this is only the second place move_task()
+                 * is called, so we can safely collect move_task()
+                 * stats here rather than inside move_task().
                  */
-                schedstat_inc(sd, lb_gained[idle]);
+                schedstat_inc(env->sd, lb_gained[env->idle]);
                 return 1;
-                }
         }
-
         return 0;
 }
 
-static unsigned long
-balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-              unsigned long max_load_move, struct sched_domain *sd,
-              enum cpu_idle_type idle, int *lb_flags,
-              struct cfs_rq *busiest_cfs_rq)
+static unsigned long task_h_load(struct task_struct *p);
+
+/*
+ * move_tasks tries to move up to load_move weighted load from busiest to
+ * this_rq, as part of a balancing operation within domain "sd".
+ * Returns 1 if successful and 0 otherwise.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_tasks(struct lb_env *env)
 {
-        int loops = 0, pulled = 0;
-        long rem_load_move = max_load_move;
-        struct task_struct *p, *n;
+        struct list_head *tasks = &env->src_rq->cfs_tasks;
+        struct task_struct *p;
+        unsigned long load;
+        int pulled = 0;
+
+        if (env->load_move <= 0)
+                return 0;
 
-        if (max_load_move == 0)
-                goto out;
+        while (!list_empty(tasks)) {
+                p = list_first_entry(tasks, struct task_struct, se.group_node);
 
-        list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
-                if (loops++ > sysctl_sched_nr_migrate) {
-                        *lb_flags |= LBF_NEED_BREAK;
+                env->loop++;
+                /* We've more or less seen every task there is, call it quits */
+                if (env->loop > env->loop_max)
+                        break;
+
+                /* take a breather every nr_migrate tasks */
+                if (env->loop > env->loop_break) {
+                        env->loop_break += sysctl_sched_nr_migrate;
+                        env->flags |= LBF_NEED_BREAK;
                         break;
                 }
 
-                if ((p->se.load.weight >> 1) > rem_load_move ||
-                    !can_migrate_task(p, busiest, this_cpu, sd, idle,
-                                      lb_flags))
-                        continue;
+                if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
+                        goto next;
+
+                load = task_h_load(p);
+
+                if (load < 16 && !env->sd->nr_balance_failed)
+                        goto next;
+
+                if ((load / 2) > env->load_move)
+                        goto next;
 
-                pull_task(busiest, p, this_rq, this_cpu);
+                if (!can_migrate_task(p, env))
+                        goto next;
+
+                move_task(p, env);
                 pulled++;
-                rem_load_move -= p->se.load.weight;
+                env->load_move -= load;
 
 #ifdef CONFIG_PREEMPT
                 /*
@@ -3259,28 +3271,30 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
                  * kernels will stop after the first task is pulled to minimize
                  * the critical section.
                  */
-                if (idle == CPU_NEWLY_IDLE) {
-                        *lb_flags |= LBF_ABORT;
+                if (env->idle == CPU_NEWLY_IDLE)
                         break;
-                }
 #endif
 
                 /*
                  * We only want to steal up to the prescribed amount of
                  * weighted load.
                  */
-                if (rem_load_move <= 0)
+                if (env->load_move <= 0)
                         break;
+
+                continue;
+next:
+                list_move_tail(&p->se.group_node, tasks);
         }
-out:
+
         /*
-         * Right now, this is one of only two places pull_task() is called,
-         * so we can safely collect pull_task() stats here rather than
-         * inside pull_task().
+         * Right now, this is one of only two places move_task() is called,
+         * so we can safely collect move_task() stats here rather than
+         * inside move_task().
          */
-        schedstat_add(sd, lb_gained[idle], pulled);
+        schedstat_add(env->sd, lb_gained[env->idle], pulled);
 
-        return max_load_move - rem_load_move;
+        return pulled;
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -3360,113 +3374,35 @@ static int tg_load_down(struct task_group *tg, void *data)
 
 static void update_h_load(long cpu)
 {
+        rcu_read_lock();
         walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
+        rcu_read_unlock();
 }
 
-static unsigned long
-load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                  unsigned long max_load_move,
-                  struct sched_domain *sd, enum cpu_idle_type idle,
-                  int *lb_flags)
+static unsigned long task_h_load(struct task_struct *p)
 {
-        long rem_load_move = max_load_move;
-        struct cfs_rq *busiest_cfs_rq;
-
-        rcu_read_lock();
-        update_h_load(cpu_of(busiest));
-
-        for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) {
-                unsigned long busiest_h_load = busiest_cfs_rq->h_load;
-                unsigned long busiest_weight = busiest_cfs_rq->load.weight;
-                u64 rem_load, moved_load;
-
-                if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
-                        break;
-
-                /*
-                 * empty group or part of a throttled hierarchy
-                 */
-                if (!busiest_cfs_rq->task_weight ||
-                    throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
-                        continue;
-
-                rem_load = (u64)rem_load_move * busiest_weight;
-                rem_load = div_u64(rem_load, busiest_h_load + 1);
-
-                moved_load = balance_tasks(this_rq, this_cpu, busiest,
-                                rem_load, sd, idle, lb_flags,
-                                busiest_cfs_rq);
-
-                if (!moved_load)
-                        continue;
+        struct cfs_rq *cfs_rq = task_cfs_rq(p);
+        unsigned long load;
 
-                moved_load *= busiest_h_load;
-                moved_load = div_u64(moved_load, busiest_weight + 1);
+        load = p->se.load.weight;
+        load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
 
-                rem_load_move -= moved_load;
-                if (rem_load_move < 0)
-                        break;
-        }
-        rcu_read_unlock();
-
-        return max_load_move - rem_load_move;
+        return load;
 }
 #else
 static inline void update_shares(int cpu)
 {
 }
 
-static unsigned long
-load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                  unsigned long max_load_move,
-                  struct sched_domain *sd, enum cpu_idle_type idle,
-                  int *lb_flags)
+static inline void update_h_load(long cpu)
 {
-        return balance_tasks(this_rq, this_cpu, busiest,
-                        max_load_move, sd, idle, lb_flags,
-                        &busiest->cfs);
 }
-#endif
 
-/*
- * move_tasks tries to move up to max_load_move weighted load from busiest to
- * this_rq, as part of a balancing operation within domain "sd".
- * Returns 1 if successful and 0 otherwise.
- *
- * Called with both runqueues locked.
- */
-static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                      unsigned long max_load_move,
-                      struct sched_domain *sd, enum cpu_idle_type idle,
-                      int *lb_flags)
+static unsigned long task_h_load(struct task_struct *p)
 {
-        unsigned long total_load_moved = 0, load_moved;
-
-        do {
-                load_moved = load_balance_fair(this_rq, this_cpu, busiest,
-                                max_load_move - total_load_moved,
-                                sd, idle, lb_flags);
-
-                total_load_moved += load_moved;
-
-                if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
-                        break;
-
-#ifdef CONFIG_PREEMPT
-                /*
-                 * NEWIDLE balancing is a source of latency, so preemptible
-                 * kernels will stop after the first task is pulled to minimize
-                 * the critical section.
-                 */
-                if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) {
-                        *lb_flags |= LBF_ABORT;
-                        break;
-                }
-#endif
-        } while (load_moved && max_load_move > total_load_moved);
-
-        return total_load_moved > 0;
+        return p->se.load.weight;
 }
+#endif
 
 /********** Helpers for find_busiest_group ************************/
 /*
@@ -3776,6 +3712,11 @@ void update_group_power(struct sched_domain *sd, int cpu)
         struct sched_domain *child = sd->child;
         struct sched_group *group, *sdg = sd->groups;
         unsigned long power;
+        unsigned long interval;
+
+        interval = msecs_to_jiffies(sd->balance_interval);
+        interval = clamp(interval, 1UL, max_load_balance_interval);
+        sdg->sgp->next_update = jiffies + interval;
 
         if (!child) {
                 update_cpu_power(sd, cpu);
@@ -3883,12 +3824,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
          * domains. In the newly idle case, we will allow all the cpu's
          * to do the newly idle load balance.
          */
-        if (idle != CPU_NEWLY_IDLE && local_group) {
-                if (balance_cpu != this_cpu) {
-                        *balance = 0;
-                        return;
-                }
-                update_group_power(sd, this_cpu);
+        if (local_group) {
+                if (idle != CPU_NEWLY_IDLE) {
+                        if (balance_cpu != this_cpu) {
+                                *balance = 0;
+                                return;
+                        }
+                        update_group_power(sd, this_cpu);
+                } else if (time_after_eq(jiffies, group->sgp->next_update))
+                        update_group_power(sd, this_cpu);
         }
 
         /* Adjust by relative CPU power of the group */
@@ -4451,13 +4395,21 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                         struct sched_domain *sd, enum cpu_idle_type idle,
                         int *balance)
 {
-        int ld_moved, lb_flags = 0, active_balance = 0;
+        int ld_moved, active_balance = 0;
         struct sched_group *group;
         unsigned long imbalance;
         struct rq *busiest;
         unsigned long flags;
         struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
 
+        struct lb_env env = {
+                .sd             = sd,
+                .dst_cpu        = this_cpu,
+                .dst_rq         = this_rq,
+                .idle           = idle,
+                .loop_break     = sysctl_sched_nr_migrate,
+        };
+
         cpumask_copy(cpus, cpu_active_mask);
 
         schedstat_inc(sd, lb_count[idle]);
@@ -4492,32 +4444,34 @@ redo:
                  * still unbalanced. ld_moved simply stays zero, so it is
                  * correctly treated as an imbalance.
                  */
-                lb_flags |= LBF_ALL_PINNED;
+                env.flags |= LBF_ALL_PINNED;
+                env.load_move = imbalance;
+                env.src_cpu = busiest->cpu;
+                env.src_rq = busiest;
+                env.loop_max = busiest->nr_running;
+
+more_balance:
                 local_irq_save(flags);
                 double_rq_lock(this_rq, busiest);
-                ld_moved = move_tasks(this_rq, this_cpu, busiest,
-                                      imbalance, sd, idle, &lb_flags);
+                if (!env.loop)
+                        update_h_load(env.src_cpu);
+                ld_moved += move_tasks(&env);
                 double_rq_unlock(this_rq, busiest);
                 local_irq_restore(flags);
 
+                if (env.flags & LBF_NEED_BREAK) {
+                        env.flags &= ~LBF_NEED_BREAK;
+                        goto more_balance;
+                }
+
                 /*
                  * some other cpu did the load balance for us.
                  */
                 if (ld_moved && this_cpu != smp_processor_id())
                         resched_cpu(this_cpu);
 
-                if (lb_flags & LBF_ABORT)
-                        goto out_balanced;
-
-                if (lb_flags & LBF_NEED_BREAK) {
-                        lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
-                        if (lb_flags & LBF_ABORT)
-                                goto out_balanced;
-                        goto redo;
-                }
-
                 /* All tasks on this runqueue were pinned by CPU affinity */
-                if (unlikely(lb_flags & LBF_ALL_PINNED)) {
+                if (unlikely(env.flags & LBF_ALL_PINNED)) {
                         cpumask_clear_cpu(cpu_of(busiest), cpus);
                         if (!cpumask_empty(cpus))
                                 goto redo;
@@ -4547,7 +4501,7 @@ redo:
                                   tsk_cpus_allowed(busiest->curr))) {
                                 raw_spin_unlock_irqrestore(&busiest->lock,
                                                             flags);
-                                lb_flags |= LBF_ALL_PINNED;
+                                env.flags |= LBF_ALL_PINNED;
                                 goto out_one_pinned;
                         }
 
@@ -4600,7 +4554,7 @@ out_balanced:
 
 out_one_pinned:
         /* tune up the balancing interval */
-        if (((lb_flags & LBF_ALL_PINNED) &&
+        if (((env.flags & LBF_ALL_PINNED) &&
                         sd->balance_interval < MAX_PINNED_INTERVAL) ||
                         (sd->balance_interval < sd->max_interval))
                 sd->balance_interval *= 2;
@@ -4710,10 +4664,18 @@ static int active_load_balance_cpu_stop(void *data)
         }
 
         if (likely(sd)) {
+                struct lb_env env = {
+                        .sd             = sd,
+                        .dst_cpu        = target_cpu,
+                        .dst_rq         = target_rq,
+                        .src_cpu        = busiest_rq->cpu,
+                        .src_rq         = busiest_rq,
+                        .idle           = CPU_IDLE,
+                };
+
                 schedstat_inc(sd, alb_count);
 
-                if (move_one_task(target_rq, target_cpu, busiest_rq,
-                                  sd, CPU_IDLE))
+                if (move_one_task(&env))
                         schedstat_inc(sd, alb_pushed);
                 else
                         schedstat_inc(sd, alb_failed);
@@ -4945,8 +4907,6 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
 
 static DEFINE_SPINLOCK(balancing);
 
-static unsigned long __read_mostly max_load_balance_interval = HZ/10;
-
 /*
  * Scale the max load_balance interval with the number of CPUs in the system.
  * This trades load-balance latency on larger machines for less cross talk.
@@ -5340,7 +5300,6 @@ static void set_curr_task_fair(struct rq *rq)
 void init_cfs_rq(struct cfs_rq *cfs_rq)
 {
         cfs_rq->tasks_timeline = RB_ROOT;
-        INIT_LIST_HEAD(&cfs_rq->tasks);
         cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 #ifndef CONFIG_64BIT
         cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
@@ -5612,6 +5571,7 @@ __init void init_sched_fair_class(void)
         open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
 
 #ifdef CONFIG_NO_HZ
+        nohz.next_balance = jiffies;
         zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
         cpu_notifier(sched_ilb_notifier, 0);
 #endif