author		Linus Torvalds <torvalds@linux-foundation.org>	2016-06-25 09:38:42 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-06-25 09:38:42 -0400
commit		57801c1b817128cbb3a4dc45e6a1e0e31a227a19 (patch)
tree		f4ed8d503aacd4c53bb04586044ebc58ca9b1a81
parent		e3b22bc3d705b4a265247a9e2a1dea9ecf01a0cd (diff)
parent		feb245e304f343cf5e4f9123db36354144dce8a4 (diff)
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Thomas Gleixner:
 "A couple of scheduler fixes:

   - force watchdog reset while processing sysrq-w

   - fix a deadlock when enabling trace events in the scheduler

   - fixes to the throttled next buddy logic

   - fixes for the average accounting (missing serialization and
     underflow handling)

   - allow kernel threads to fall back to online but not active cpus"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/core: Allow kthreads to fall back to online && !active cpus
  sched/fair: Do not announce throttled next buddy in dequeue_task_fair()
  sched/fair: Initialize throttle_count for new task-groups lazily
  sched/fair: Fix cfs_rq avg tracking underflow
  kernel/sysrq, watchdog, sched/core: Reset watchdog on all CPUs while processing sysrq-w
  sched/debug: Fix deadlock when enabling sched events
  sched/fair: Fix post_init_entity_util_avg() serialization
-rw-r--r--	kernel/sched/core.c	13
-rw-r--r--	kernel/sched/fair.c	72
-rw-r--r--	kernel/sched/sched.h	2
3 files changed, 66 insertions, 21 deletions
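For context on the "underflow handling" item above: the fair.c change introduces a sub_positive() helper that performs an unsigned subtract clamped at zero using one explicit load and one explicit store, so lockless readers never observe a wrapped-around intermediate value. Below is a minimal user-space sketch of that pattern, not kernel code; the name demo_sub_positive() and the GCC/Clang __atomic built-ins stand in for the kernel's READ_ONCE()/WRITE_ONCE() and are assumptions for illustration only.

#include <stdio.h>

/*
 * Sketch of the subtract-and-clamp pattern: load once, compute the
 * clamped result in a local variable, and write it back with a single
 * store so a concurrent reader never sees a wrapped-around
 * (negative-looking) intermediate value.
 */
static void demo_sub_positive(unsigned long *ptr, unsigned long val)
{
	unsigned long var = __atomic_load_n(ptr, __ATOMIC_RELAXED);	/* single load */
	unsigned long res = var - val;

	if (res > var)		/* unsigned wrap-around: subtraction went below zero */
		res = 0;
	__atomic_store_n(ptr, res, __ATOMIC_RELAXED);			/* single store */
}

int main(void)
{
	unsigned long avg = 100;

	demo_sub_positive(&avg, 30);	/* 100 - 30 -> 70 */
	printf("%lu\n", avg);
	demo_sub_positive(&avg, 200);	/* would wrap; clamped to 0 */
	printf("%lu\n", avg);
	return 0;
}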
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 017d5394f5dc..51d7105f529a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1536,7 +1536,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 	for (;;) {
 		/* Any allowed, online CPU? */
 		for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
-			if (!cpu_active(dest_cpu))
+			if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
+				continue;
+			if (!cpu_online(dest_cpu))
 				continue;
 			goto out;
 		}
@@ -2535,10 +2537,9 @@ void wake_up_new_task(struct task_struct *p)
 	 */
 	set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
-	/* Post initialize new task's util average when its cfs_rq is set */
+	rq = __task_rq_lock(p, &rf);
 	post_init_entity_util_avg(&p->se);
 
-	rq = __task_rq_lock(p, &rf);
 	activate_task(rq, p, 0);
 	p->on_rq = TASK_ON_RQ_QUEUED;
 	trace_sched_wakeup_new(p);
@@ -5148,14 +5149,16 @@ void show_state_filter(unsigned long state_filter)
 		/*
 		 * reset the NMI-timeout, listing all files on a slow
 		 * console might take a lot of time:
+		 * Also, reset softlockup watchdogs on all CPUs, because
+		 * another CPU might be blocked waiting for us to process
+		 * an IPI.
 		 */
 		touch_nmi_watchdog();
+		touch_all_softlockup_watchdogs();
 		if (!state_filter || (p->state & state_filter))
 			sched_show_task(p);
 	}
 
-	touch_all_softlockup_watchdogs();
-
 #ifdef CONFIG_SCHED_DEBUG
 	if (!state_filter)
 		sysrq_sched_debug_show();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 218f8e83db73..bdcbeea90c95 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2904,6 +2904,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
 	}
 }
 
+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do {				\
+	typeof(_ptr) ptr = (_ptr);				\
+	typeof(*ptr) val = (_val);				\
+	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
+	res = var - val;					\
+	if (res > var)						\
+		res = 0;					\
+	WRITE_ONCE(*ptr, res);					\
+} while (0)
+
 /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
 static inline int
 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
@@ -2913,15 +2930,15 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 
 	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
 		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
-		sa->load_avg = max_t(long, sa->load_avg - r, 0);
-		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+		sub_positive(&sa->load_avg, r);
+		sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
 		removed_load = 1;
 	}
 
 	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
 		long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
-		sa->util_avg = max_t(long, sa->util_avg - r, 0);
-		sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+		sub_positive(&sa->util_avg, r);
+		sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
 		removed_util = 1;
 	}
 
@@ -2994,10 +3011,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 			  &se->avg, se->on_rq * scale_load_down(se->load.weight),
 			  cfs_rq->curr == se, NULL);
 
-	cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
-	cfs_rq->avg.load_sum = max_t(s64,  cfs_rq->avg.load_sum - se->avg.load_sum, 0);
-	cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
-	cfs_rq->avg.util_sum = max_t(s32,  cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+	sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
+	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
+	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
 
 	cfs_rq_util_change(cfs_rq);
 }
@@ -3246,7 +3263,7 @@ static inline void check_schedstat_required(void)
 	    trace_sched_stat_iowait_enabled()  ||
 	    trace_sched_stat_blocked_enabled() ||
 	    trace_sched_stat_runtime_enabled())  {
-		pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
+		printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
 			     "stat_blocked and stat_runtime require the "
 			     "kernel parameter schedstats=enabled or "
 			     "kernel.sched_schedstats=1\n");
@@ -4185,6 +4202,26 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
 	if (!cfs_bandwidth_used())
 		return;
 
+	/* Synchronize hierarchical throttle counter: */
+	if (unlikely(!cfs_rq->throttle_uptodate)) {
+		struct rq *rq = rq_of(cfs_rq);
+		struct cfs_rq *pcfs_rq;
+		struct task_group *tg;
+
+		cfs_rq->throttle_uptodate = 1;
+
+		/* Get closest up-to-date node, because leaves go first: */
+		for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
+			pcfs_rq = tg->cfs_rq[cpu_of(rq)];
+			if (pcfs_rq->throttle_uptodate)
+				break;
+		}
+		if (tg) {
+			cfs_rq->throttle_count = pcfs_rq->throttle_count;
+			cfs_rq->throttled_clock_task = rq_clock_task(rq);
+		}
+	}
+
 	/* an active group must be handled by the update_curr()->put() path */
 	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
 		return;
@@ -4500,15 +4537,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight) {
+			/* Avoid re-evaluating load for this entity: */
+			se = parent_entity(se);
 			/*
 			 * Bias pick_next to pick a task from this cfs_rq, as
 			 * p is sleeping when it is within its sched_slice.
 			 */
-			if (task_sleep && parent_entity(se))
-				set_next_buddy(parent_entity(se));
-
-			/* avoid re-evaluating load for this entity */
-			se = parent_entity(se);
+			if (task_sleep && se && !throttled_hierarchy(cfs_rq))
+				set_next_buddy(se);
 			break;
 		}
 		flags |= DEQUEUE_SLEEP;
@@ -8496,8 +8532,9 @@ void free_fair_sched_group(struct task_group *tg)
 
 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
-	struct cfs_rq *cfs_rq;
 	struct sched_entity *se;
+	struct cfs_rq *cfs_rq;
+	struct rq *rq;
 	int i;
 
 	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8512,6 +8549,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
 
 	for_each_possible_cpu(i) {
+		rq = cpu_rq(i);
+
 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
 				      GFP_KERNEL, cpu_to_node(i));
 		if (!cfs_rq)
@@ -8525,7 +8564,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 		init_cfs_rq(cfs_rq);
 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
 		init_entity_runnable_average(se);
+
+		raw_spin_lock_irq(&rq->lock);
 		post_init_entity_util_avg(se);
+		raw_spin_unlock_irq(&rq->lock);
 	}
 
 	return 1;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 72f1f3087b04..7cbeb92a1cb9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -437,7 +437,7 @@ struct cfs_rq {
 
 	u64 throttled_clock, throttled_clock_task;
 	u64 throttled_clock_task_time;
-	int throttled, throttle_count;
+	int throttled, throttle_count, throttle_uptodate;
 	struct list_head throttled_list;
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */