Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--  kernel/sched/core.c | 242
1 file changed, 156 insertions, 86 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 33a0676ea744..4603b9d8f30a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -71,7 +71,9 @@
 #include <linux/ftrace.h>
 #include <linux/slab.h>
 #include <linux/init_task.h>
+#include <linux/binfmts.h>
 
+#include <asm/switch_to.h>
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 #include <asm/mutex.h>
@@ -162,13 +164,13 @@ static int sched_feat_show(struct seq_file *m, void *v)
 
 #ifdef HAVE_JUMP_LABEL
 
-#define jump_label_key__true  jump_label_key_enabled
-#define jump_label_key__false jump_label_key_disabled
+#define jump_label_key__true  STATIC_KEY_INIT_TRUE
+#define jump_label_key__false STATIC_KEY_INIT_FALSE
 
 #define SCHED_FEAT(name, enabled)	\
 	jump_label_key__##enabled ,
 
-struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
+struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
 #include "features.h"
 };
 
@@ -176,14 +178,14 @@ struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
 
 static void sched_feat_disable(int i)
 {
-	if (jump_label_enabled(&sched_feat_keys[i]))
-		jump_label_dec(&sched_feat_keys[i]);
+	if (static_key_enabled(&sched_feat_keys[i]))
+		static_key_slow_dec(&sched_feat_keys[i]);
 }
 
 static void sched_feat_enable(int i)
 {
-	if (!jump_label_enabled(&sched_feat_keys[i]))
-		jump_label_inc(&sched_feat_keys[i]);
+	if (!static_key_enabled(&sched_feat_keys[i]))
+		static_key_slow_inc(&sched_feat_keys[i]);
 }
 #else
 static void sched_feat_disable(int i) { };
@@ -894,7 +896,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 	delta -= irq_delta;
 #endif
 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
-	if (static_branch((&paravirt_steal_rq_enabled))) {
+	if (static_key_false((&paravirt_steal_rq_enabled))) {
 		u64 st;
 
 		steal = paravirt_steal_clock(cpu_of(rq));
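The hunks above convert the scheduler from the old jump_label_* names to the static_key API (struct static_key, static_key_false(), static_key_slow_inc()/static_key_slow_dec()). As a rough sketch of the usage pattern behind the rename, not part of this patch and with all surrounding identifiers made up for illustration:

#include <linux/types.h>
#include <linux/jump_label.h>

static struct static_key my_feature_key = STATIC_KEY_INIT_FALSE;	/* hypothetical key, off by default */

static void my_slow_path(void)						/* hypothetical */
{
	/* rarely needed work: extra accounting, tracing, ... */
}

void my_hot_path(void)							/* hypothetical */
{
	/* compiled as a patched no-op branch while the key is disabled */
	if (static_key_false(&my_feature_key))
		my_slow_path();
}

void my_feature_set(bool enable)					/* hypothetical */
{
	if (enable)
		static_key_slow_inc(&my_feature_key);	/* patch the branch in */
	else
		static_key_slow_dec(&my_feature_key);	/* patch it back out */
}

sched_feat_enable()/sched_feat_disable() above follow exactly this inc/dec pattern, one key per scheduler feature bit.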
@@ -1263,29 +1265,59 @@ EXPORT_SYMBOL_GPL(kick_process);
  */
 static int select_fallback_rq(int cpu, struct task_struct *p)
 {
-	int dest_cpu;
 	const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
+	enum { cpuset, possible, fail } state = cpuset;
+	int dest_cpu;
 
 	/* Look for allowed, online CPU in same node. */
-	for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
+	for_each_cpu(dest_cpu, nodemask) {
+		if (!cpu_online(dest_cpu))
+			continue;
+		if (!cpu_active(dest_cpu))
+			continue;
 		if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
 			return dest_cpu;
+	}
 
-	/* Any allowed, online CPU? */
-	dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
-	if (dest_cpu < nr_cpu_ids)
-		return dest_cpu;
+	for (;;) {
+		/* Any allowed, online CPU? */
+		for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
+			if (!cpu_online(dest_cpu))
+				continue;
+			if (!cpu_active(dest_cpu))
+				continue;
+			goto out;
+		}
 
-	/* No more Mr. Nice Guy. */
-	dest_cpu = cpuset_cpus_allowed_fallback(p);
-	/*
-	 * Don't tell them about moving exiting tasks or
-	 * kernel threads (both mm NULL), since they never
-	 * leave kernel.
-	 */
-	if (p->mm && printk_ratelimit()) {
-		printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
-				task_pid_nr(p), p->comm, cpu);
+		switch (state) {
+		case cpuset:
+			/* No more Mr. Nice Guy. */
+			cpuset_cpus_allowed_fallback(p);
+			state = possible;
+			break;
+
+		case possible:
+			do_set_cpus_allowed(p, cpu_possible_mask);
+			state = fail;
+			break;
+
+		case fail:
+			BUG();
+			break;
+		}
+	}
+
+out:
+	if (state != cpuset) {
+		/*
+		 * Don't tell them about moving exiting tasks or
+		 * kernel threads (both mm NULL), since they never
+		 * leave kernel.
+		 */
+		if (p->mm && printk_ratelimit()) {
+			printk_sched("process %d (%s) no longer affine to cpu%d\n",
+					task_pid_nr(p), p->comm, cpu);
+		}
 	}
 
 	return dest_cpu;
@@ -1507,7 +1539,7 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
 }
 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
 
-static inline int ttwu_share_cache(int this_cpu, int that_cpu)
+bool cpus_share_cache(int this_cpu, int that_cpu)
 {
 	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 }
@@ -1518,7 +1550,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
 	struct rq *rq = cpu_rq(cpu);
 
 #if defined(CONFIG_SMP)
-	if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) {
+	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
 		sched_clock_cpu(cpu); /* sync clocks x-cpu */
 		ttwu_queue_remote(p, cpu);
 		return;
@@ -1932,6 +1964,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	local_irq_enable();
 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
 	finish_lock_switch(rq, prev);
+	finish_arch_post_lock_switch();
 
 	fire_sched_in_preempt_notifiers(current);
 	if (mm)
@@ -2266,13 +2299,10 @@ calc_load_n(unsigned long load, unsigned long exp,
  * Once we've updated the global active value, we need to apply the exponential
  * weights adjusted to the number of cycles missed.
  */
-static void calc_global_nohz(unsigned long ticks)
+static void calc_global_nohz(void)
 {
 	long delta, active, n;
 
-	if (time_before(jiffies, calc_load_update))
-		return;
-
 	/*
 	 * If we crossed a calc_load_update boundary, make sure to fold
 	 * any pending idle changes, the respective CPUs might have
@@ -2284,31 +2314,25 @@ static void calc_global_nohz(unsigned long ticks)
 	atomic_long_add(delta, &calc_load_tasks);
 
 	/*
-	 * If we were idle for multiple load cycles, apply them.
+	 * It could be the one fold was all it took, we done!
 	 */
-	if (ticks >= LOAD_FREQ) {
-		n = ticks / LOAD_FREQ;
+	if (time_before(jiffies, calc_load_update + 10))
+		return;
 
-		active = atomic_long_read(&calc_load_tasks);
-		active = active > 0 ? active * FIXED_1 : 0;
+	/*
+	 * Catch-up, fold however many we are behind still
+	 */
+	delta = jiffies - calc_load_update - 10;
+	n = 1 + (delta / LOAD_FREQ);
 
-		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
-		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
-		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+	active = atomic_long_read(&calc_load_tasks);
+	active = active > 0 ? active * FIXED_1 : 0;
 
-		calc_load_update += n * LOAD_FREQ;
-	}
+	avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+	avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+	avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
 
-	/*
-	 * Its possible the remainder of the above division also crosses
-	 * a LOAD_FREQ period, the regular check in calc_global_load()
-	 * which comes after this will take care of that.
-	 *
-	 * Consider us being 11 ticks before a cycle completion, and us
-	 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
-	 * age us 4 cycles, and the test in calc_global_load() will
-	 * pick up the final one.
-	 */
+	calc_load_update += n * LOAD_FREQ;
 }
 #else
 void calc_load_account_idle(struct rq *this_rq)
@@ -2320,7 +2344,7 @@ static inline long calc_load_fold_idle(void)
 	return 0;
 }
 
-static void calc_global_nohz(unsigned long ticks)
+static void calc_global_nohz(void)
 {
 }
 #endif
@@ -2348,8 +2372,6 @@ void calc_global_load(unsigned long ticks)
 {
 	long active;
 
-	calc_global_nohz(ticks);
-
 	if (time_before(jiffies, calc_load_update + 10))
 		return;
 
@@ -2361,6 +2383,16 @@
 	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
 
 	calc_load_update += LOAD_FREQ;
+
+	/*
+	 * Account one period with whatever state we found before
+	 * folding in the nohz state and ageing the entire idle period.
+	 *
+	 * This avoids loosing a sample when we go idle between
+	 * calc_load_account_active() (10 ticks ago) and now and thus
+	 * under-accounting.
+	 */
+	calc_global_nohz();
 }
 
 /*
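For readers following the calc_global_nohz() catch-up above: after n missed LOAD_FREQ periods each load average decays geometrically toward the current active count. A minimal floating-point sketch of that relation follows; the kernel's calc_load_n() computes the same thing in FIXED_1 fixed-point arithmetic, and the helper name here is made up for illustration.

#include <math.h>

/* load(n) = load * exp^n + active * (1 - exp^n), with exp in [0, 1) */
static double calc_load_n_approx(double load, double exp_factor,
				 double active, unsigned int n)
{
	double decay = pow(exp_factor, n);

	return load * decay + active * (1.0 - decay);
}

With the 1-minute factor of roughly EXP_1/FIXED_1 = 1884/2048, about 0.92, an idle stretch of n = 5 folds shrinks a previous load of 1.0 to about 0.92^5, roughly 0.66, with active = 0.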
@@ -2755,7 +2787,7 @@ void account_idle_time(cputime_t cputime)
 static __always_inline bool steal_account_process_tick(void)
 {
 #ifdef CONFIG_PARAVIRT
-	if (static_branch(&paravirt_steal_enabled)) {
+	if (static_key_false(&paravirt_steal_enabled)) {
 		u64 steal, st = 0;
 
 		steal = paravirt_steal_clock(smp_processor_id());
@@ -3070,8 +3102,6 @@ EXPORT_SYMBOL(sub_preempt_count);
  */
 static noinline void __schedule_bug(struct task_struct *prev)
 {
-	struct pt_regs *regs = get_irq_regs();
-
 	if (oops_in_progress)
 		return;
 
@@ -3082,11 +3112,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
 	print_modules();
 	if (irqs_disabled())
 		print_irqtrace_events(prev);
-
-	if (regs)
-		show_regs(regs);
-	else
-		dump_stack();
+	dump_stack();
 }
 
 /*
@@ -3220,14 +3246,14 @@ need_resched:
 
 	post_schedule(rq);
 
-	preempt_enable_no_resched();
+	sched_preempt_enable_no_resched();
 	if (need_resched())
 		goto need_resched;
 }
 
 static inline void sched_submit_work(struct task_struct *tsk)
 {
-	if (!tsk->state)
+	if (!tsk->state || tsk_is_pi_blocked(tsk))
 		return;
 	/*
 	 * If we are going to sleep and we have plugged IO queued,
@@ -3246,6 +3272,18 @@ asmlinkage void __sched schedule(void)
 }
 EXPORT_SYMBOL(schedule);
 
+/**
+ * schedule_preempt_disabled - called with preemption disabled
+ *
+ * Returns with preemption disabled. Note: preempt_count must be 1
+ */
+void __sched schedule_preempt_disabled(void)
+{
+	sched_preempt_enable_no_resched();
+	schedule();
+	preempt_disable();
+}
+
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
 
 static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
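schedule_preempt_disabled() above simply wraps the enable/schedule/disable dance so that callers which otherwise run with preemption off can sleep at one well-defined point. A hedged sketch of the kind of call site it is meant for; the loop and its name are hypothetical, not taken from this patch:

#include <linux/sched.h>
#include <linux/preempt.h>

static void my_wait_loop(void)			/* hypothetical caller */
{
	preempt_disable();
	for (;;) {
		while (!need_resched())
			cpu_relax();		/* spin until the scheduler wants in */
		/*
		 * Equivalent to the open-coded sequence:
		 *   sched_preempt_enable_no_resched();
		 *   schedule();
		 *   preempt_disable();
		 */
		schedule_preempt_disabled();
	}
}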
@@ -3406,9 +3444,9 @@ EXPORT_SYMBOL(__wake_up);
 /*
  * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
  */
-void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
+void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
 {
-	__wake_up_common(q, mode, 1, 0, NULL);
+	__wake_up_common(q, mode, nr, 0, NULL);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked);
 
@@ -3767,6 +3805,24 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 	rq = __task_rq_lock(p);
 
+	/*
+	 * Idle task boosting is a nono in general. There is one
+	 * exception, when PREEMPT_RT and NOHZ is active:
+	 *
+	 * The idle task calls get_next_timer_interrupt() and holds
+	 * the timer wheel base->lock on the CPU and another CPU wants
+	 * to access the timer (probably to cancel it). We can safely
+	 * ignore the boosting request, as the idle CPU runs this code
+	 * with interrupts disabled and will complete the lock
+	 * protected section without being interrupted. So there is no
+	 * real need to boost.
+	 */
+	if (unlikely(p == rq->idle)) {
+		WARN_ON(p != rq->curr);
+		WARN_ON(p->pi_blocked_on);
+		goto out_unlock;
+	}
+
 	trace_sched_pi_setprio(p, prio);
 	oldprio = p->prio;
 	prev_class = p->sched_class;
@@ -3790,11 +3846,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 		enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
 
 	check_class_changed(rq, p, prev_class, oldprio);
+out_unlock:
 	__task_rq_unlock(rq);
 }
-
 #endif
-
 void set_user_nice(struct task_struct *p, long nice)
 {
 	int old_prio, delta, on_rq;
@@ -4474,7 +4529,7 @@ SYSCALL_DEFINE0(sched_yield)
 	__release(rq->lock);
 	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
 	do_raw_spin_unlock(&rq->lock);
-	preempt_enable_no_resched();
+	sched_preempt_enable_no_resched();
 
 	schedule();
 
@@ -4548,8 +4603,24 @@ EXPORT_SYMBOL(__cond_resched_softirq);
 /**
  * yield - yield the current processor to other threads.
  *
- * This is a shortcut for kernel-space yielding - it marks the
- * thread runnable and calls sys_sched_yield().
+ * Do not ever use this function, there's a 99% chance you're doing it wrong.
+ *
+ * The scheduler is at all times free to pick the calling task as the most
+ * eligible task to run, if removing the yield() call from your code breaks
+ * it, its already broken.
+ *
+ * Typical broken usage is:
+ *
+ * while (!event)
+ *	yield();
+ *
+ * where one assumes that yield() will let 'the other' process run that will
+ * make event true. If the current task is a SCHED_FIFO task that will never
+ * happen. Never use yield() as a progress guarantee!!
+ *
+ * If you want to use yield() to wait for something, use wait_event().
+ * If you want to use yield() to be 'nice' for others, use cond_resched().
+ * If you still want to use yield(), do not!
  */
 void __sched yield(void)
 {
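The new yield() comment points people at wait_event() and cond_resched() instead of the "while (!event) yield();" anti-pattern. A small sketch of the recommended shape; the wait queue and flag below are hypothetical, only the wait_event()/wake_up() calls are real API:

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(my_waitq);	/* hypothetical */
static int my_event;				/* hypothetical condition */

/* waiter: sleeps until the condition is true, no yield() polling */
static void wait_for_event(void)
{
	wait_event(my_waitq, my_event);
}

/* producer: makes the condition true and wakes any waiter */
static void fire_event(void)
{
	my_event = 1;
	wake_up(&my_waitq);
}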
@@ -5381,7 +5452,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
 				      unsigned long action, void *hcpu)
 {
 	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_ONLINE:
+	case CPU_STARTING:
 	case CPU_DOWN_FAILED:
 		set_cpu_active((long)hcpu, true);
 		return NOTIFY_OK;
@@ -5753,7 +5824,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
  *
  * Also keep a unique ID per domain (we use the first cpu number in
  * the cpumask of the domain), this allows us to quickly tell if
- * two cpus are in the same cache domain, see ttwu_share_cache().
+ * two cpus are in the same cache domain, see cpus_share_cache().
  */
 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_id);
@@ -6728,7 +6799,7 @@ int __init sched_create_sysfs_power_savings_entries(struct device *dev)
 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
 			     void *hcpu)
 {
-	switch (action) {
+	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_ONLINE:
 	case CPU_DOWN_FAILED:
 		cpuset_update_active_cpus();
@@ -6741,7 +6812,7 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
 			       void *hcpu)
 {
-	switch (action) {
+	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_DOWN_PREPARE:
 		cpuset_update_active_cpus();
 		return NOTIFY_OK;
@@ -6930,6 +7001,9 @@ void __init sched_init(void)
 		rq->online = 0;
 		rq->idle_stamp = 0;
 		rq->avg_idle = 2*sysctl_sched_migration_cost;
+
+		INIT_LIST_HEAD(&rq->cfs_tasks);
+
 		rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ
 		rq->nohz_flags = 0;
@@ -7524,8 +7598,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
 			    struct task_group, css);
 }
 
-static struct cgroup_subsys_state *
-cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
+static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
 {
 	struct task_group *tg, *parent;
 
@@ -7542,15 +7615,14 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
 	return &tg->css;
 }
 
-static void
-cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+static void cpu_cgroup_destroy(struct cgroup *cgrp)
 {
 	struct task_group *tg = cgroup_tg(cgrp);
 
 	sched_destroy_group(tg);
 }
 
-static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+static int cpu_cgroup_can_attach(struct cgroup *cgrp,
 				 struct cgroup_taskset *tset)
 {
 	struct task_struct *task;
@@ -7568,7 +7640,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 	return 0;
 }
 
-static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+static void cpu_cgroup_attach(struct cgroup *cgrp,
 			      struct cgroup_taskset *tset)
 {
 	struct task_struct *task;
@@ -7578,8 +7650,8 @@ static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 }
 
 static void
-cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
-		struct cgroup *old_cgrp, struct task_struct *task)
+cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
+		struct task_struct *task)
 {
 	/*
 	 * cgroup_exit() is called in the copy_process() failure path.
@@ -7929,8 +8001,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
  */
 
 /* create a new cpu accounting group */
-static struct cgroup_subsys_state *cpuacct_create(
-	struct cgroup_subsys *ss, struct cgroup *cgrp)
+static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
 {
 	struct cpuacct *ca;
 
@@ -7960,8 +8031,7 @@ out:
 }
 
 /* destroy an existing cpu accounting group */
-static void
-cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+static void cpuacct_destroy(struct cgroup *cgrp)
 {
 	struct cpuacct *ca = cgroup_ca(cgrp);
 