author    Linus Torvalds <torvalds@linux-foundation.org>    2016-06-25 09:38:42 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2016-06-25 09:38:42 -0400
commit    57801c1b817128cbb3a4dc45e6a1e0e31a227a19
tree      f4ed8d503aacd4c53bb04586044ebc58ca9b1a81
parent    e3b22bc3d705b4a265247a9e2a1dea9ecf01a0cd
parent    feb245e304f343cf5e4f9123db36354144dce8a4
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Thomas Gleixner:
"A couple of scheduler fixes:
- force watchdog reset while processing sysrq-w
- fix a deadlock when enabling trace events in the scheduler
- fixes to the throttled next buddy logic
- fixes for the average accounting (missing serialization and
underflow handling)
- allow kernel threads to fall back to online but not active CPUs"
* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/core: Allow kthreads to fall back to online && !active cpus
sched/fair: Do not announce throttled next buddy in dequeue_task_fair()
sched/fair: Initialize throttle_count for new task-groups lazily
sched/fair: Fix cfs_rq avg tracking underflow
kernel/sysrq, watchdog, sched/core: Reset watchdog on all CPUs while processing sysrq-w
sched/debug: Fix deadlock when enabling sched events
sched/fair: Fix post_init_entity_util_avg() serialization
-rw-r--r--  kernel/sched/core.c  | 13
-rw-r--r--  kernel/sched/fair.c  | 72
-rw-r--r--  kernel/sched/sched.h |  2
3 files changed, 66 insertions(+), 21 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 017d5394f5dc..51d7105f529a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1536,7 +1536,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 	for (;;) {
 		/* Any allowed, online CPU? */
 		for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
-			if (!cpu_active(dest_cpu))
+			if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
+				continue;
+			if (!cpu_online(dest_cpu))
 				continue;
 			goto out;
 		}
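The hunk above lets kernel threads fall back to CPUs that are online but not yet active, while user tasks still require an active CPU. Read as a stand-alone predicate, the new condition looks roughly like the sketch below; the function name and boolean parameters are illustrative, not kernel code:

#include <stdbool.h>

/*
 * Hypothetical helper mirroring the new condition in select_fallback_rq():
 * a CPU is an acceptable fallback if it is online, and additionally active
 * unless the task is a kernel thread (PF_KTHREAD).
 */
static bool acceptable_fallback_cpu(bool is_kthread, bool online, bool active)
{
	if (!is_kthread && !active)
		return false;	/* user tasks still need an active CPU */
	if (!online)
		return false;	/* everyone needs an online CPU */
	return true;
}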
@@ -2535,10 +2537,9 @@ void wake_up_new_task(struct task_struct *p)
 	 */
 	set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
-	/* Post initialize new task's util average when its cfs_rq is set */
+	rq = __task_rq_lock(p, &rf);
 	post_init_entity_util_avg(&p->se);
 
-	rq = __task_rq_lock(p, &rf);
 	activate_task(rq, p, 0);
 	p->on_rq = TASK_ON_RQ_QUEUED;
 	trace_sched_wakeup_new(p);
@@ -5148,14 +5149,16 @@ void show_state_filter(unsigned long state_filter)
 		/*
 		 * reset the NMI-timeout, listing all files on a slow
 		 * console might take a lot of time:
+		 * Also, reset softlockup watchdogs on all CPUs, because
+		 * another CPU might be blocked waiting for us to process
+		 * an IPI.
 		 */
 		touch_nmi_watchdog();
+		touch_all_softlockup_watchdogs();
 		if (!state_filter || (p->state & state_filter))
 			sched_show_task(p);
 	}
 
-	touch_all_softlockup_watchdogs();
-
 #ifdef CONFIG_SCHED_DEBUG
 	if (!state_filter)
 		sysrq_sched_debug_show();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 218f8e83db73..bdcbeea90c95 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2904,6 +2904,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
 	}
 }
 
+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do {				\
+	typeof(_ptr) ptr = (_ptr);				\
+	typeof(*ptr) val = (_val);				\
+	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
+	res = var - val;					\
+	if (res > var)						\
+		res = 0;					\
+	WRITE_ONCE(*ptr, res);					\
+} while (0)
+
 /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
 static inline int
 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
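The sub_positive() macro added above clamps on underflow by exploiting unsigned wrap-around: if the subtraction wraps, the result compares greater than the original value and is forced to zero, and the single READ_ONCE()/WRITE_ONCE() pair keeps lockless readers from ever observing a negative intermediate. A minimal userspace sketch of the same idea, with the type fixed to unsigned long and the kernel accessors approximated by volatile accesses (illustrative only, not the kernel macro):

#include <stdio.h>

/* Simplified userspace stand-in for the kernel's sub_positive() macro. */
#define sub_positive(_ptr, _val) do {				\
	unsigned long *ptr = (_ptr);				\
	unsigned long val = (_val);				\
	unsigned long var = *(volatile unsigned long *)ptr;	\
	unsigned long res = var - val;				\
	if (res > var)		/* unsigned wrap => underflow */\
		res = 0;					\
	*(volatile unsigned long *)ptr = res;			\
} while (0)

int main(void)
{
	unsigned long load_avg = 100;

	sub_positive(&load_avg, 30);	/* normal case: prints 70 */
	printf("%lu\n", load_avg);

	sub_positive(&load_avg, 1000);	/* would underflow: clamped to 0 */
	printf("%lu\n", load_avg);
	return 0;
}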
@@ -2913,15 +2930,15 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 
 	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
 		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
-		sa->load_avg = max_t(long, sa->load_avg - r, 0);
-		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+		sub_positive(&sa->load_avg, r);
+		sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
 		removed_load = 1;
 	}
 
 	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
 		long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
-		sa->util_avg = max_t(long, sa->util_avg - r, 0);
-		sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+		sub_positive(&sa->util_avg, r);
+		sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
 		removed_util = 1;
 	}
 
@@ -2994,10 +3011,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 			  &se->avg, se->on_rq * scale_load_down(se->load.weight),
 			  cfs_rq->curr == se, NULL);
 
-	cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
-	cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
-	cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
-	cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+	sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
+	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
+	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
 
 	cfs_rq_util_change(cfs_rq);
 }
@@ -3246,7 +3263,7 @@ static inline void check_schedstat_required(void)
 			trace_sched_stat_iowait_enabled() ||
 			trace_sched_stat_blocked_enabled() ||
 			trace_sched_stat_runtime_enabled()) {
-		pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
+		printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
			     "stat_blocked and stat_runtime require the "
			     "kernel parameter schedstats=enabled or "
			     "kernel.sched_schedstats=1\n");
@@ -4185,6 +4202,26 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
 	if (!cfs_bandwidth_used())
 		return;
 
+	/* Synchronize hierarchical throttle counter: */
+	if (unlikely(!cfs_rq->throttle_uptodate)) {
+		struct rq *rq = rq_of(cfs_rq);
+		struct cfs_rq *pcfs_rq;
+		struct task_group *tg;
+
+		cfs_rq->throttle_uptodate = 1;
+
+		/* Get closest up-to-date node, because leaves go first: */
+		for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
+			pcfs_rq = tg->cfs_rq[cpu_of(rq)];
+			if (pcfs_rq->throttle_uptodate)
+				break;
+		}
+		if (tg) {
+			cfs_rq->throttle_count = pcfs_rq->throttle_count;
+			cfs_rq->throttled_clock_task = rq_clock_task(rq);
+		}
+	}
+
 	/* an active group must be handled by the update_curr()->put() path */
 	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
 		return;
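The hunk above initializes a new group's throttle_count lazily: on first enqueue it walks toward the root until it finds an ancestor whose per-CPU cfs_rq is already up to date and inherits that counter. A stripped-down illustration of the same upward walk, using a hypothetical node type instead of the real task_group/cfs_rq structures:

/* Hypothetical tree node standing in for a per-CPU cfs_rq in the hierarchy. */
struct node {
	struct node *parent;
	int count_uptodate;	/* has 'count' been synchronized yet? */
	int count;		/* e.g. the hierarchical throttle count */
};

/* Lazily bring n->count up to date from the nearest synchronized ancestor. */
static void lazy_sync(struct node *n)
{
	struct node *p;

	if (n->count_uptodate)
		return;
	n->count_uptodate = 1;

	/* Leaves are created first, so the answer lives somewhere above us. */
	for (p = n->parent; p; p = p->parent) {
		if (p->count_uptodate)
			break;
	}
	if (p)
		n->count = p->count;	/* inherit the ancestor's value */
}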
@@ -4500,15 +4537,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight) {
+			/* Avoid re-evaluating load for this entity: */
+			se = parent_entity(se);
 			/*
 			 * Bias pick_next to pick a task from this cfs_rq, as
 			 * p is sleeping when it is within its sched_slice.
 			 */
-			if (task_sleep && parent_entity(se))
-				set_next_buddy(parent_entity(se));
-
-			/* avoid re-evaluating load for this entity */
-			se = parent_entity(se);
+			if (task_sleep && se && !throttled_hierarchy(cfs_rq))
+				set_next_buddy(se);
 			break;
 		}
 		flags |= DEQUEUE_SLEEP;
@@ -8496,8 +8532,9 @@ void free_fair_sched_group(struct task_group *tg)
 
 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
-	struct cfs_rq *cfs_rq;
 	struct sched_entity *se;
+	struct cfs_rq *cfs_rq;
+	struct rq *rq;
 	int i;
 
 	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8512,6 +8549,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
 
 	for_each_possible_cpu(i) {
+		rq = cpu_rq(i);
+
 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
				      GFP_KERNEL, cpu_to_node(i));
 		if (!cfs_rq)
@@ -8525,7 +8564,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 		init_cfs_rq(cfs_rq);
 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
 		init_entity_runnable_average(se);
+
+		raw_spin_lock_irq(&rq->lock);
 		post_init_entity_util_avg(se);
+		raw_spin_unlock_irq(&rq->lock);
 	}
 
 	return 1;
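These alloc_fair_sched_group() hunks apply the same rule as the wake_up_new_task() change in core.c: post_init_entity_util_avg() now runs only with the runqueue lock held, so the freshly computed averages are published atomically with respect to other lock holders. A generic userspace sketch of that publish-under-lock pattern, using a pthread mutex in place of rq->lock (all names here are illustrative):

#include <pthread.h>

/* Illustrative stand-in for per-CPU runqueue state (not kernel code). */
struct fake_rq {
	pthread_mutex_t lock;	/* plays the role of rq->lock */
	unsigned long util_avg;
};

/*
 * Publish the post-init estimate only while holding the queue lock, so a
 * concurrent reader that also takes the lock never sees a half-updated value.
 */
static void post_init_util_avg_locked(struct fake_rq *rq, unsigned long estimate)
{
	pthread_mutex_lock(&rq->lock);
	rq->util_avg = estimate;
	pthread_mutex_unlock(&rq->lock);
}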
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 72f1f3087b04..7cbeb92a1cb9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -437,7 +437,7 @@ struct cfs_rq {
 
 	u64 throttled_clock, throttled_clock_task;
 	u64 throttled_clock_task_time;
-	int throttled, throttle_count;
+	int throttled, throttle_count, throttle_uptodate;
 	struct list_head throttled_list;
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */