author    Linus Torvalds <torvalds@linux-foundation.org>    2016-06-25 09:38:42 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2016-06-25 09:38:42 -0400
commit    57801c1b817128cbb3a4dc45e6a1e0e31a227a19
tree      f4ed8d503aacd4c53bb04586044ebc58ca9b1a81
parent    e3b22bc3d705b4a265247a9e2a1dea9ecf01a0cd
parent    feb245e304f343cf5e4f9123db36354144dce8a4
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Thomas Gleixner:
"A couple of scheduler fixes:
- force watchdog reset while processing sysrq-w
- fix a deadlock when enabling trace events in the scheduler
- fixes to the throttled next buddy logic
- fixes for the average accounting (missing serialization and
underflow handling)
- allow kernel threads to fall back to online but not active CPUs"
* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/core: Allow kthreads to fall back to online && !active cpus
sched/fair: Do not announce throttled next buddy in dequeue_task_fair()
sched/fair: Initialize throttle_count for new task-groups lazily
sched/fair: Fix cfs_rq avg tracking underflow
kernel/sysrq, watchdog, sched/core: Reset watchdog on all CPUs while processing sysrq-w
sched/debug: Fix deadlock when enabling sched events
sched/fair: Fix post_init_entity_util_avg() serialization
-rw-r--r--  kernel/sched/core.c  | 13
-rw-r--r--  kernel/sched/fair.c  | 72
-rw-r--r--  kernel/sched/sched.h |  2
3 files changed, 66 insertions(+), 21 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 017d5394f5dc..51d7105f529a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1536,7 +1536,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 	for (;;) {
 		/* Any allowed, online CPU? */
 		for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
-			if (!cpu_active(dest_cpu))
+			if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
+				continue;
+			if (!cpu_online(dest_cpu))
 				continue;
 			goto out;
 		}
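The hunk above lets kernel threads fall back to CPUs that are online but not yet active, while user tasks still require an active CPU. Read as a stand-alone predicate, the new condition looks roughly like the sketch below; the function name and boolean parameters are illustrative, not kernel code:

#include <stdbool.h>

/*
 * Hypothetical helper mirroring the new condition in select_fallback_rq():
 * a CPU is an acceptable fallback if it is online, and additionally active
 * unless the task is a kernel thread (PF_KTHREAD).
 */
static bool acceptable_fallback_cpu(bool is_kthread, bool online, bool active)
{
	if (!is_kthread && !active)
		return false;	/* user tasks still need an active CPU */
	if (!online)
		return false;	/* everyone needs an online CPU */
	return true;
}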
@@ -2535,10 +2537,9 @@ void wake_up_new_task(struct task_struct *p)
 	 */
 	set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
-	/* Post initialize new task's util average when its cfs_rq is set */
+	rq = __task_rq_lock(p, &rf);
 	post_init_entity_util_avg(&p->se);
 
-	rq = __task_rq_lock(p, &rf);
 	activate_task(rq, p, 0);
 	p->on_rq = TASK_ON_RQ_QUEUED;
 	trace_sched_wakeup_new(p);
@@ -5148,14 +5149,16 @@ void show_state_filter(unsigned long state_filter)
 		/*
 		 * reset the NMI-timeout, listing all files on a slow
 		 * console might take a lot of time:
+		 * Also, reset softlockup watchdogs on all CPUs, because
+		 * another CPU might be blocked waiting for us to process
+		 * an IPI.
 		 */
 		touch_nmi_watchdog();
+		touch_all_softlockup_watchdogs();
 		if (!state_filter || (p->state & state_filter))
 			sched_show_task(p);
 	}
 
-	touch_all_softlockup_watchdogs();
-
 #ifdef CONFIG_SCHED_DEBUG
 	if (!state_filter)
 		sysrq_sched_debug_show();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 218f8e83db73..bdcbeea90c95 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2904,6 +2904,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
 	}
 }
 
+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do {				\
+	typeof(_ptr) ptr = (_ptr);				\
+	typeof(*ptr) val = (_val);				\
+	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
+	res = var - val;					\
+	if (res > var)						\
+		res = 0;					\
+	WRITE_ONCE(*ptr, res);					\
+} while (0)
+
 /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
 static inline int
 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
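The sub_positive() macro added above clamps on underflow by exploiting unsigned wrap-around: if the subtraction wraps, the result compares greater than the original value and is forced to zero, and the single READ_ONCE()/WRITE_ONCE() pair keeps lockless readers from ever observing a negative intermediate. A minimal userspace sketch of the same idea, with the type fixed to unsigned long and the kernel accessors approximated by volatile accesses (illustrative only, not the kernel macro):

#include <stdio.h>

/* Simplified userspace stand-in for the kernel's sub_positive() macro. */
#define sub_positive(_ptr, _val) do {				\
	unsigned long *ptr = (_ptr);				\
	unsigned long val = (_val);				\
	unsigned long var = *(volatile unsigned long *)ptr;	\
	unsigned long res = var - val;				\
	if (res > var)		/* unsigned wrap => underflow */\
		res = 0;					\
	*(volatile unsigned long *)ptr = res;			\
} while (0)

int main(void)
{
	unsigned long load_avg = 100;

	sub_positive(&load_avg, 30);	/* normal case: prints 70 */
	printf("%lu\n", load_avg);

	sub_positive(&load_avg, 1000);	/* would underflow: clamped to 0 */
	printf("%lu\n", load_avg);
	return 0;
}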
@@ -2913,15 +2930,15 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 
 	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
 		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
-		sa->load_avg = max_t(long, sa->load_avg - r, 0);
-		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+		sub_positive(&sa->load_avg, r);
+		sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
 		removed_load = 1;
 	}
 
 	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
 		long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
-		sa->util_avg = max_t(long, sa->util_avg - r, 0);
-		sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+		sub_positive(&sa->util_avg, r);
+		sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
 		removed_util = 1;
 	}
 
@@ -2994,10 +3011,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 			  &se->avg, se->on_rq * scale_load_down(se->load.weight),
 			  cfs_rq->curr == se, NULL);
 
-	cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
-	cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
-	cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
-	cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+	sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
+	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
+	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
 
 	cfs_rq_util_change(cfs_rq);
 }
@@ -3246,7 +3263,7 @@ static inline void check_schedstat_required(void)
 			trace_sched_stat_iowait_enabled() ||
 			trace_sched_stat_blocked_enabled() ||
 			trace_sched_stat_runtime_enabled()) {
-		pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
+		printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
			     "stat_blocked and stat_runtime require the "
			     "kernel parameter schedstats=enabled or "
			     "kernel.sched_schedstats=1\n");
@@ -4185,6 +4202,26 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
 	if (!cfs_bandwidth_used())
 		return;
 
+	/* Synchronize hierarchical throttle counter: */
+	if (unlikely(!cfs_rq->throttle_uptodate)) {
+		struct rq *rq = rq_of(cfs_rq);
+		struct cfs_rq *pcfs_rq;
+		struct task_group *tg;
+
+		cfs_rq->throttle_uptodate = 1;
+
+		/* Get closest up-to-date node, because leaves go first: */
+		for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
+			pcfs_rq = tg->cfs_rq[cpu_of(rq)];
+			if (pcfs_rq->throttle_uptodate)
+				break;
+		}
+		if (tg) {
+			cfs_rq->throttle_count = pcfs_rq->throttle_count;
+			cfs_rq->throttled_clock_task = rq_clock_task(rq);
+		}
+	}
+
 	/* an active group must be handled by the update_curr()->put() path */
 	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
 		return;
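The hunk above initializes a new group's throttle_count lazily: on first enqueue it walks toward the root until it finds an ancestor whose per-CPU cfs_rq is already up to date and inherits that counter. A stripped-down illustration of the same upward walk, using a hypothetical node type instead of the real task_group/cfs_rq structures:

/* Hypothetical tree node standing in for a per-CPU cfs_rq in the hierarchy. */
struct node {
	struct node *parent;
	int count_uptodate;	/* has 'count' been synchronized yet? */
	int count;		/* e.g. the hierarchical throttle count */
};

/* Lazily bring n->count up to date from the nearest synchronized ancestor. */
static void lazy_sync(struct node *n)
{
	struct node *p;

	if (n->count_uptodate)
		return;
	n->count_uptodate = 1;

	/* Leaves are created first, so the answer lives somewhere above us. */
	for (p = n->parent; p; p = p->parent) {
		if (p->count_uptodate)
			break;
	}
	if (p)
		n->count = p->count;	/* inherit the ancestor's value */
}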
@@ -4500,15 +4537,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight) {
+			/* Avoid re-evaluating load for this entity: */
+			se = parent_entity(se);
 			/*
 			 * Bias pick_next to pick a task from this cfs_rq, as
 			 * p is sleeping when it is within its sched_slice.
 			 */
-			if (task_sleep && parent_entity(se))
-				set_next_buddy(parent_entity(se));
-
-			/* avoid re-evaluating load for this entity */
-			se = parent_entity(se);
+			if (task_sleep && se && !throttled_hierarchy(cfs_rq))
+				set_next_buddy(se);
 			break;
 		}
 		flags |= DEQUEUE_SLEEP;
@@ -8496,8 +8532,9 @@ void free_fair_sched_group(struct task_group *tg)
 
 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
-	struct cfs_rq *cfs_rq;
 	struct sched_entity *se;
+	struct cfs_rq *cfs_rq;
+	struct rq *rq;
 	int i;
 
 	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8512,6 +8549,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
 
 	for_each_possible_cpu(i) {
+		rq = cpu_rq(i);
+
 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
				      GFP_KERNEL, cpu_to_node(i));
 		if (!cfs_rq)
@@ -8525,7 +8564,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 		init_cfs_rq(cfs_rq);
 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
 		init_entity_runnable_average(se);
+
+		raw_spin_lock_irq(&rq->lock);
 		post_init_entity_util_avg(se);
+		raw_spin_unlock_irq(&rq->lock);
 	}
 
 	return 1;
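These alloc_fair_sched_group() hunks apply the same rule as the wake_up_new_task() change in core.c: post_init_entity_util_avg() now runs only with the runqueue lock held, so the freshly computed averages are published atomically with respect to other lock holders. A generic userspace sketch of that publish-under-lock pattern, using a pthread mutex in place of rq->lock (all names here are illustrative):

#include <pthread.h>

/* Illustrative stand-in for per-CPU runqueue state (not kernel code). */
struct fake_rq {
	pthread_mutex_t lock;	/* plays the role of rq->lock */
	unsigned long util_avg;
};

/*
 * Publish the post-init estimate only while holding the queue lock, so a
 * concurrent reader that also takes the lock never sees a half-updated value.
 */
static void post_init_util_avg_locked(struct fake_rq *rq, unsigned long estimate)
{
	pthread_mutex_lock(&rq->lock);
	rq->util_avg = estimate;
	pthread_mutex_unlock(&rq->lock);
}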
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 72f1f3087b04..7cbeb92a1cb9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -437,7 +437,7 @@ struct cfs_rq {
 
 	u64 throttled_clock, throttled_clock_task;
 	u64 throttled_clock_task_time;
-	int throttled, throttle_count;
+	int throttled, throttle_count, throttle_uptodate;
 	struct list_head throttled_list;
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */