Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/lockdep.c | 18 |
| -rw-r--r-- | kernel/sched.c | 26 |
| -rw-r--r-- | kernel/sched_autogroup.c | 32 |
| -rw-r--r-- | kernel/sched_autogroup.h | 4 |
| -rw-r--r-- | kernel/sched_debug.c | 42 |
| -rw-r--r-- | kernel/sched_fair.c | 35 |
| -rw-r--r-- | kernel/smp.c | 62 |
| -rw-r--r-- | kernel/trace/trace_irqsoff.c | 8 |
8 files changed, 170 insertions, 57 deletions
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 42ba65dff7d..0d2058da80f 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
| @@ -2292,22 +2292,6 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) | |||
| 2292 | } | 2292 | } |
| 2293 | 2293 | ||
| 2294 | /* | 2294 | /* |
| 2295 | * Debugging helper: via this flag we know that we are in | ||
| 2296 | * 'early bootup code', and will warn about any invalid irqs-on event: | ||
| 2297 | */ | ||
| 2298 | static int early_boot_irqs_enabled; | ||
| 2299 | |||
| 2300 | void early_boot_irqs_off(void) | ||
| 2301 | { | ||
| 2302 | early_boot_irqs_enabled = 0; | ||
| 2303 | } | ||
| 2304 | |||
| 2305 | void early_boot_irqs_on(void) | ||
| 2306 | { | ||
| 2307 | early_boot_irqs_enabled = 1; | ||
| 2308 | } | ||
| 2309 | |||
| 2310 | /* | ||
| 2311 | * Hardirqs will be enabled: | 2295 | * Hardirqs will be enabled: |
| 2312 | */ | 2296 | */ |
| 2313 | void trace_hardirqs_on_caller(unsigned long ip) | 2297 | void trace_hardirqs_on_caller(unsigned long ip) |
| @@ -2319,7 +2303,7 @@ void trace_hardirqs_on_caller(unsigned long ip) | |||
| 2319 | if (unlikely(!debug_locks || current->lockdep_recursion)) | 2303 | if (unlikely(!debug_locks || current->lockdep_recursion)) |
| 2320 | return; | 2304 | return; |
| 2321 | 2305 | ||
| 2322 | if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled))) | 2306 | if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) |
| 2323 | return; | 2307 | return; |
| 2324 | 2308 | ||
| 2325 | if (unlikely(curr->hardirqs_enabled)) { | 2309 | if (unlikely(curr->hardirqs_enabled)) { |
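The lockdep hunk above removes the file-local early_boot_irqs_enabled flag and its early_boot_irqs_off()/early_boot_irqs_on() helpers, and instead warns when the shared early_boot_irqs_disabled variable is still set. A minimal userspace sketch of that handshake follows; the helper names and the simulated boot sequence are illustrative assumptions, only the early_boot_irqs_disabled flag itself comes from the diff.

```c
/*
 * Userspace model of the early_boot_irqs_disabled check: warn about any
 * IRQs-on event that happens while the boot code still says interrupts
 * must stay off.  Names other than the flag are made up for illustration.
 */
#include <stdbool.h>
#include <stdio.h>

static bool early_boot_irqs_disabled = true;   /* set before IRQs may be enabled */

/* stand-in for trace_hardirqs_on_caller(): reject an IRQs-on event too early */
static void trace_hardirqs_on_model(const char *caller)
{
        if (early_boot_irqs_disabled) {
                fprintf(stderr, "WARNING: %s enabled IRQs during early boot\n", caller);
                return;
        }
        printf("%s: IRQs-on event accepted\n", caller);
}

int main(void)
{
        trace_hardirqs_on_model("too_early_driver_init");   /* triggers the warning */

        /* what the generic boot code is expected to do just before enabling IRQs */
        early_boot_irqs_disabled = false;
        trace_hardirqs_on_model("normal_code");             /* accepted */
        return 0;
}
```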
diff --git a/kernel/sched.c b/kernel/sched.c index ea3e5eff387..18d38e4ec7b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -553,9 +553,6 @@ struct rq { | |||
| 553 | /* try_to_wake_up() stats */ | 553 | /* try_to_wake_up() stats */ |
| 554 | unsigned int ttwu_count; | 554 | unsigned int ttwu_count; |
| 555 | unsigned int ttwu_local; | 555 | unsigned int ttwu_local; |
| 556 | |||
| 557 | /* BKL stats */ | ||
| 558 | unsigned int bkl_count; | ||
| 559 | #endif | 556 | #endif |
| 560 | }; | 557 | }; |
| 561 | 558 | ||
| @@ -609,6 +606,9 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
| 609 | struct task_group *tg; | 606 | struct task_group *tg; |
| 610 | struct cgroup_subsys_state *css; | 607 | struct cgroup_subsys_state *css; |
| 611 | 608 | ||
| 609 | if (p->flags & PF_EXITING) | ||
| 610 | return &root_task_group; | ||
| 611 | |||
| 612 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, | 612 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, |
| 613 | lockdep_is_held(&task_rq(p)->lock)); | 613 | lockdep_is_held(&task_rq(p)->lock)); |
| 614 | tg = container_of(css, struct task_group, css); | 614 | tg = container_of(css, struct task_group, css); |
| @@ -3887,7 +3887,7 @@ static inline void schedule_debug(struct task_struct *prev) | |||
| 3887 | schedstat_inc(this_rq(), sched_count); | 3887 | schedstat_inc(this_rq(), sched_count); |
| 3888 | #ifdef CONFIG_SCHEDSTATS | 3888 | #ifdef CONFIG_SCHEDSTATS |
| 3889 | if (unlikely(prev->lock_depth >= 0)) { | 3889 | if (unlikely(prev->lock_depth >= 0)) { |
| 3890 | schedstat_inc(this_rq(), bkl_count); | 3890 | schedstat_inc(this_rq(), rq_sched_info.bkl_count); |
| 3891 | schedstat_inc(prev, sched_info.bkl_count); | 3891 | schedstat_inc(prev, sched_info.bkl_count); |
| 3892 | } | 3892 | } |
| 3893 | #endif | 3893 | #endif |
| @@ -4871,7 +4871,8 @@ recheck: | |||
| 4871 | * assigned. | 4871 | * assigned. |
| 4872 | */ | 4872 | */ |
| 4873 | if (rt_bandwidth_enabled() && rt_policy(policy) && | 4873 | if (rt_bandwidth_enabled() && rt_policy(policy) && |
| 4874 | task_group(p)->rt_bandwidth.rt_runtime == 0) { | 4874 | task_group(p)->rt_bandwidth.rt_runtime == 0 && |
| 4875 | !task_group_is_autogroup(task_group(p))) { | ||
| 4875 | __task_rq_unlock(rq); | 4876 | __task_rq_unlock(rq); |
| 4876 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 4877 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
| 4877 | return -EPERM; | 4878 | return -EPERM; |
| @@ -8882,6 +8883,20 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | |||
| 8882 | } | 8883 | } |
| 8883 | } | 8884 | } |
| 8884 | 8885 | ||
| 8886 | static void | ||
| 8887 | cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) | ||
| 8888 | { | ||
| 8889 | /* | ||
| 8890 | * cgroup_exit() is called in the copy_process() failure path. | ||
| 8891 | * Ignore this case since the task hasn't ran yet, this avoids | ||
| 8892 | * trying to poke a half freed task state from generic code. | ||
| 8893 | */ | ||
| 8894 | if (!(task->flags & PF_EXITING)) | ||
| 8895 | return; | ||
| 8896 | |||
| 8897 | sched_move_task(task); | ||
| 8898 | } | ||
| 8899 | |||
| 8885 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8900 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 8886 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, | 8901 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, |
| 8887 | u64 shareval) | 8902 | u64 shareval) |
| @@ -8954,6 +8969,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
| 8954 | .destroy = cpu_cgroup_destroy, | 8969 | .destroy = cpu_cgroup_destroy, |
| 8955 | .can_attach = cpu_cgroup_can_attach, | 8970 | .can_attach = cpu_cgroup_can_attach, |
| 8956 | .attach = cpu_cgroup_attach, | 8971 | .attach = cpu_cgroup_attach, |
| 8972 | .exit = cpu_cgroup_exit, | ||
| 8957 | .populate = cpu_cgroup_populate, | 8973 | .populate = cpu_cgroup_populate, |
| 8958 | .subsys_id = cpu_cgroup_subsys_id, | 8974 | .subsys_id = cpu_cgroup_subsys_id, |
| 8959 | .early_init = 1, | 8975 | .early_init = 1, |
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index 32a723b8f84..9fb65628315 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c | |||
| @@ -27,6 +27,11 @@ static inline void autogroup_destroy(struct kref *kref) | |||
| 27 | { | 27 | { |
| 28 | struct autogroup *ag = container_of(kref, struct autogroup, kref); | 28 | struct autogroup *ag = container_of(kref, struct autogroup, kref); |
| 29 | 29 | ||
| 30 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 31 | /* We've redirected RT tasks to the root task group... */ | ||
| 32 | ag->tg->rt_se = NULL; | ||
| 33 | ag->tg->rt_rq = NULL; | ||
| 34 | #endif | ||
| 30 | sched_destroy_group(ag->tg); | 35 | sched_destroy_group(ag->tg); |
| 31 | } | 36 | } |
| 32 | 37 | ||
| @@ -55,6 +60,10 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p) | |||
| 55 | return ag; | 60 | return ag; |
| 56 | } | 61 | } |
| 57 | 62 | ||
| 63 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 64 | static void free_rt_sched_group(struct task_group *tg); | ||
| 65 | #endif | ||
| 66 | |||
| 58 | static inline struct autogroup *autogroup_create(void) | 67 | static inline struct autogroup *autogroup_create(void) |
| 59 | { | 68 | { |
| 60 | struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); | 69 | struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); |
| @@ -72,6 +81,19 @@ static inline struct autogroup *autogroup_create(void) | |||
| 72 | init_rwsem(&ag->lock); | 81 | init_rwsem(&ag->lock); |
| 73 | ag->id = atomic_inc_return(&autogroup_seq_nr); | 82 | ag->id = atomic_inc_return(&autogroup_seq_nr); |
| 74 | ag->tg = tg; | 83 | ag->tg = tg; |
| 84 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 85 | /* | ||
| 86 | * Autogroup RT tasks are redirected to the root task group | ||
| 87 | * so we don't have to move tasks around upon policy change, | ||
| 88 | * or flail around trying to allocate bandwidth on the fly. | ||
| 89 | * A bandwidth exception in __sched_setscheduler() allows | ||
| 90 | * the policy change to proceed. Thereafter, task_group() | ||
| 91 | * returns &root_task_group, so zero bandwidth is required. | ||
| 92 | */ | ||
| 93 | free_rt_sched_group(tg); | ||
| 94 | tg->rt_se = root_task_group.rt_se; | ||
| 95 | tg->rt_rq = root_task_group.rt_rq; | ||
| 96 | #endif | ||
| 75 | tg->autogroup = ag; | 97 | tg->autogroup = ag; |
| 76 | 98 | ||
| 77 | return ag; | 99 | return ag; |
| @@ -106,6 +128,11 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg) | |||
| 106 | return true; | 128 | return true; |
| 107 | } | 129 | } |
| 108 | 130 | ||
| 131 | static inline bool task_group_is_autogroup(struct task_group *tg) | ||
| 132 | { | ||
| 133 | return tg != &root_task_group && tg->autogroup; | ||
| 134 | } | ||
| 135 | |||
| 109 | static inline struct task_group * | 136 | static inline struct task_group * |
| 110 | autogroup_task_group(struct task_struct *p, struct task_group *tg) | 137 | autogroup_task_group(struct task_struct *p, struct task_group *tg) |
| 111 | { | 138 | { |
| @@ -231,6 +258,11 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) | |||
| 231 | #ifdef CONFIG_SCHED_DEBUG | 258 | #ifdef CONFIG_SCHED_DEBUG |
| 232 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) | 259 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) |
| 233 | { | 260 | { |
| 261 | int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); | ||
| 262 | |||
| 263 | if (!enabled || !tg->autogroup) | ||
| 264 | return 0; | ||
| 265 | |||
| 234 | return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); | 266 | return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); |
| 235 | } | 267 | } |
| 236 | #endif /* CONFIG_SCHED_DEBUG */ | 268 | #endif /* CONFIG_SCHED_DEBUG */ |
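Taken together, the sched.c and sched_autogroup.c hunks let an autogroup task switch to an RT policy even though the autogroup itself carries no RT bandwidth: task_group_is_autogroup() exempts it from the zero-runtime check in __sched_setscheduler(), and the group's rt_se/rt_rq pointers are aliased to the root task group. The sketch below models only that combined decision with simplified stand-in types; nothing in it is a real kernel structure.

```c
/*
 * Sketch of the combined policy check: an RT policy request is refused
 * when RT bandwidth is enabled and the group has zero runtime, unless
 * the group is an autogroup (whose RT tasks run in the root task group).
 */
#include <stdbool.h>
#include <stdio.h>

struct task_group_sketch {
        bool is_root;
        bool has_autogroup;
        long rt_runtime;
};

static bool task_group_is_autogroup(const struct task_group_sketch *tg)
{
        return !tg->is_root && tg->has_autogroup;
}

static bool rt_policy_allowed(const struct task_group_sketch *tg, bool rt_bandwidth_enabled)
{
        if (rt_bandwidth_enabled && tg->rt_runtime == 0 &&
            !task_group_is_autogroup(tg))
                return false;   /* -EPERM in __sched_setscheduler() */
        return true;
}

int main(void)
{
        struct task_group_sketch autogrp = { .is_root = false, .has_autogroup = true,  .rt_runtime = 0 };
        struct task_group_sketch plain   = { .is_root = false, .has_autogroup = false, .rt_runtime = 0 };

        printf("autogroup, zero runtime:   %s\n", rt_policy_allowed(&autogrp, true) ? "allowed" : "refused");
        printf("plain group, zero runtime: %s\n", rt_policy_allowed(&plain, true) ? "allowed" : "refused");
        return 0;
}
```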
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h index 5358e241cb2..7b859ffe5da 100644 --- a/kernel/sched_autogroup.h +++ b/kernel/sched_autogroup.h | |||
| @@ -15,6 +15,10 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg); | |||
| 15 | 15 | ||
| 16 | static inline void autogroup_init(struct task_struct *init_task) { } | 16 | static inline void autogroup_init(struct task_struct *init_task) { } |
| 17 | static inline void autogroup_free(struct task_group *tg) { } | 17 | static inline void autogroup_free(struct task_group *tg) { } |
| 18 | static inline bool task_group_is_autogroup(struct task_group *tg) | ||
| 19 | { | ||
| 20 | return 0; | ||
| 21 | } | ||
| 18 | 22 | ||
| 19 | static inline struct task_group * | 23 | static inline struct task_group * |
| 20 | autogroup_task_group(struct task_struct *p, struct task_group *tg) | 24 | autogroup_task_group(struct task_struct *p, struct task_group *tg) |
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 1dfae3d014b..eb6cb8edd07 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
| @@ -16,6 +16,8 @@ | |||
| 16 | #include <linux/kallsyms.h> | 16 | #include <linux/kallsyms.h> |
| 17 | #include <linux/utsname.h> | 17 | #include <linux/utsname.h> |
| 18 | 18 | ||
| 19 | static DEFINE_SPINLOCK(sched_debug_lock); | ||
| 20 | |||
| 19 | /* | 21 | /* |
| 20 | * This allows printing both to /proc/sched_debug and | 22 | * This allows printing both to /proc/sched_debug and |
| 21 | * to the console | 23 | * to the console |
| @@ -86,6 +88,26 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
| 86 | } | 88 | } |
| 87 | #endif | 89 | #endif |
| 88 | 90 | ||
| 91 | #ifdef CONFIG_CGROUP_SCHED | ||
| 92 | static char group_path[PATH_MAX]; | ||
| 93 | |||
| 94 | static char *task_group_path(struct task_group *tg) | ||
| 95 | { | ||
| 96 | if (autogroup_path(tg, group_path, PATH_MAX)) | ||
| 97 | return group_path; | ||
| 98 | |||
| 99 | /* | ||
| 100 | * May be NULL if the underlying cgroup isn't fully-created yet | ||
| 101 | */ | ||
| 102 | if (!tg->css.cgroup) { | ||
| 103 | group_path[0] = '\0'; | ||
| 104 | return group_path; | ||
| 105 | } | ||
| 106 | cgroup_path(tg->css.cgroup, group_path, PATH_MAX); | ||
| 107 | return group_path; | ||
| 108 | } | ||
| 109 | #endif | ||
| 110 | |||
| 89 | static void | 111 | static void |
| 90 | print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | 112 | print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) |
| 91 | { | 113 | { |
| @@ -108,6 +130,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
| 108 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", | 130 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", |
| 109 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); | 131 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); |
| 110 | #endif | 132 | #endif |
| 133 | #ifdef CONFIG_CGROUP_SCHED | ||
| 134 | SEQ_printf(m, " %s", task_group_path(task_group(p))); | ||
| 135 | #endif | ||
| 111 | 136 | ||
| 112 | SEQ_printf(m, "\n"); | 137 | SEQ_printf(m, "\n"); |
| 113 | } | 138 | } |
| @@ -144,7 +169,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
| 144 | struct sched_entity *last; | 169 | struct sched_entity *last; |
| 145 | unsigned long flags; | 170 | unsigned long flags; |
| 146 | 171 | ||
| 172 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 173 | SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); | ||
| 174 | #else | ||
| 147 | SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); | 175 | SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); |
| 176 | #endif | ||
| 148 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", | 177 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", |
| 149 | SPLIT_NS(cfs_rq->exec_clock)); | 178 | SPLIT_NS(cfs_rq->exec_clock)); |
| 150 | 179 | ||
| @@ -191,7 +220,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
| 191 | 220 | ||
| 192 | void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) | 221 | void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) |
| 193 | { | 222 | { |
| 223 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 224 | SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); | ||
| 225 | #else | ||
| 194 | SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); | 226 | SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); |
| 227 | #endif | ||
| 195 | 228 | ||
| 196 | #define P(x) \ | 229 | #define P(x) \ |
| 197 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) | 230 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) |
| @@ -212,6 +245,7 @@ extern __read_mostly int sched_clock_running; | |||
| 212 | static void print_cpu(struct seq_file *m, int cpu) | 245 | static void print_cpu(struct seq_file *m, int cpu) |
| 213 | { | 246 | { |
| 214 | struct rq *rq = cpu_rq(cpu); | 247 | struct rq *rq = cpu_rq(cpu); |
| 248 | unsigned long flags; | ||
| 215 | 249 | ||
| 216 | #ifdef CONFIG_X86 | 250 | #ifdef CONFIG_X86 |
| 217 | { | 251 | { |
| @@ -262,14 +296,20 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
| 262 | P(ttwu_count); | 296 | P(ttwu_count); |
| 263 | P(ttwu_local); | 297 | P(ttwu_local); |
| 264 | 298 | ||
| 265 | P(bkl_count); | 299 | SEQ_printf(m, " .%-30s: %d\n", "bkl_count", |
| 300 | rq->rq_sched_info.bkl_count); | ||
| 266 | 301 | ||
| 267 | #undef P | 302 | #undef P |
| 303 | #undef P64 | ||
| 268 | #endif | 304 | #endif |
| 305 | spin_lock_irqsave(&sched_debug_lock, flags); | ||
| 269 | print_cfs_stats(m, cpu); | 306 | print_cfs_stats(m, cpu); |
| 270 | print_rt_stats(m, cpu); | 307 | print_rt_stats(m, cpu); |
| 271 | 308 | ||
| 309 | rcu_read_lock(); | ||
| 272 | print_rq(m, rq, cpu); | 310 | print_rq(m, rq, cpu); |
| 311 | rcu_read_unlock(); | ||
| 312 | spin_unlock_irqrestore(&sched_debug_lock, flags); | ||
| 273 | } | 313 | } |
| 274 | 314 | ||
| 275 | static const char *sched_tunable_scaling_names[] = { | 315 | static const char *sched_tunable_scaling_names[] = { |
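The sched_debug.c changes format group names into a single static group_path buffer, which is why print_cpu() now wraps the per-cpu dump in the new sched_debug_lock and holds rcu_read_lock() around print_rq(). Below is a small userspace model of the "one static scratch buffer, one lock around every use" pattern, with a pthread mutex standing in for the spinlock; all names are illustrative.

```c
/*
 * Every caller that formats into the shared static buffer must hold the
 * same lock for as long as it uses the result, so concurrent dumps cannot
 * interleave their path strings.
 */
#include <pthread.h>
#include <stdio.h>

#define PATH_BUF_LEN 256

static pthread_mutex_t debug_lock = PTHREAD_MUTEX_INITIALIZER;
static char group_path[PATH_BUF_LEN];

/* analogous to task_group_path(): fills the shared buffer, returns it */
static char *format_group_path(const char *name)
{
        snprintf(group_path, sizeof(group_path), "/autogroup-%s", name);
        return group_path;
}

/* analogous to print_cpu(): take the lock around *all* uses of the buffer */
static void print_cpu_sketch(int cpu, const char *group)
{
        pthread_mutex_lock(&debug_lock);
        printf("cfs_rq[%d]:%s\n", cpu, format_group_path(group));
        printf("rt_rq[%d]:%s\n", cpu, format_group_path(group));
        pthread_mutex_unlock(&debug_lock);
}

int main(void)
{
        print_cpu_sketch(0, "bash-17");
        return 0;
}
```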
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c62ebae65cf..77e9166d7bb 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
| @@ -1062,6 +1062,9 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | |||
| 1062 | struct sched_entity *se = __pick_next_entity(cfs_rq); | 1062 | struct sched_entity *se = __pick_next_entity(cfs_rq); |
| 1063 | s64 delta = curr->vruntime - se->vruntime; | 1063 | s64 delta = curr->vruntime - se->vruntime; |
| 1064 | 1064 | ||
| 1065 | if (delta < 0) | ||
| 1066 | return; | ||
| 1067 | |||
| 1065 | if (delta > ideal_runtime) | 1068 | if (delta > ideal_runtime) |
| 1066 | resched_task(rq_of(cfs_rq)->curr); | 1069 | resched_task(rq_of(cfs_rq)->curr); |
| 1067 | } | 1070 | } |
| @@ -1362,27 +1365,27 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
| 1362 | return wl; | 1365 | return wl; |
| 1363 | 1366 | ||
| 1364 | for_each_sched_entity(se) { | 1367 | for_each_sched_entity(se) { |
| 1365 | long S, rw, s, a, b; | 1368 | long lw, w; |
| 1366 | 1369 | ||
| 1367 | S = se->my_q->tg->shares; | 1370 | tg = se->my_q->tg; |
| 1368 | s = se->load.weight; | 1371 | w = se->my_q->load.weight; |
| 1369 | rw = se->my_q->load.weight; | ||
| 1370 | 1372 | ||
| 1371 | a = S*(rw + wl); | 1373 | /* use this cpu's instantaneous contribution */ |
| 1372 | b = S*rw + s*wg; | 1374 | lw = atomic_read(&tg->load_weight); |
| 1375 | lw -= se->my_q->load_contribution; | ||
| 1376 | lw += w + wg; | ||
| 1373 | 1377 | ||
| 1374 | wl = s*(a-b); | 1378 | wl += w; |
| 1375 | 1379 | ||
| 1376 | if (likely(b)) | 1380 | if (lw > 0 && wl < lw) |
| 1377 | wl /= b; | 1381 | wl = (wl * tg->shares) / lw; |
| 1382 | else | ||
| 1383 | wl = tg->shares; | ||
| 1378 | 1384 | ||
| 1379 | /* | 1385 | /* zero point is MIN_SHARES */ |
| 1380 | * Assume the group is already running and will | 1386 | if (wl < MIN_SHARES) |
| 1381 | * thus already be accounted for in the weight. | 1387 | wl = MIN_SHARES; |
| 1382 | * | 1388 | wl -= se->load.weight; |
| 1383 | * That is, moving shares between CPUs, does not | ||
| 1384 | * alter the group weight. | ||
| 1385 | */ | ||
| 1386 | wg = 0; | 1389 | wg = 0; |
| 1387 | } | 1390 | } |
| 1388 | 1391 | ||
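The rewritten effective_load() loop estimates the group entity's weight from this CPU's instantaneous contribution: lw = tg->load_weight - my_q->load_contribution + w + wg, then wl = (wl + w) * tg->shares / lw, clamped to at least MIN_SHARES, and finally returned as a delta against se->load.weight. A worked numeric example of one loop iteration follows; the input numbers are made up and MIN_SHARES is assumed to be 2, matching the kernel's definition at the time.

```c
/* Worked example of the new effective_load() approximation, one iteration. */
#include <stdio.h>

#define MIN_SHARES 2

int main(void)
{
        long shares            = 1024;  /* tg->shares                               */
        long tg_load_weight    = 3072;  /* atomic_read(&tg->load_weight), all CPUs  */
        long load_contribution = 1024;  /* this cfs_rq's already-accounted part     */
        long w                 = 1024;  /* se->my_q->load.weight on this CPU        */
        long se_weight         = 341;   /* current se->load.weight                  */
        long wl                = 512;   /* weight being (hypothetically) added      */
        long wg                = 512;   /* group weight change wg                   */

        /* this CPU's instantaneous contribution after the change */
        long lw = tg_load_weight - load_contribution + w + wg;   /* 3584 */

        wl += w;                                                  /* 1536 */
        if (lw > 0 && wl < lw)
                wl = (wl * shares) / lw;                          /* 1536*1024/3584 = 438 */
        else
                wl = shares;

        if (wl < MIN_SHARES)
                wl = MIN_SHARES;

        /* delta against the weight the entity already has */
        wl -= se_weight;                                          /* 438 - 341 = 97 */

        printf("estimated weight delta for this sched_entity: %ld\n", wl);
        return 0;
}
```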
diff --git a/kernel/smp.c b/kernel/smp.c index 4ec30e06998..9910744f085 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
| @@ -194,23 +194,52 @@ void generic_smp_call_function_interrupt(void) | |||
| 194 | */ | 194 | */ |
| 195 | list_for_each_entry_rcu(data, &call_function.queue, csd.list) { | 195 | list_for_each_entry_rcu(data, &call_function.queue, csd.list) { |
| 196 | int refs; | 196 | int refs; |
| 197 | void (*func) (void *info); | ||
| 197 | 198 | ||
| 198 | if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) | 199 | /* |
| 200 | * Since we walk the list without any locks, we might | ||
| 201 | * see an entry that was completed, removed from the | ||
| 202 | * list and is in the process of being reused. | ||
| 203 | * | ||
| 204 | * We must check that the cpu is in the cpumask before | ||
| 205 | * checking the refs, and both must be set before | ||
| 206 | * executing the callback on this cpu. | ||
| 207 | */ | ||
| 208 | |||
| 209 | if (!cpumask_test_cpu(cpu, data->cpumask)) | ||
| 210 | continue; | ||
| 211 | |||
| 212 | smp_rmb(); | ||
| 213 | |||
| 214 | if (atomic_read(&data->refs) == 0) | ||
| 199 | continue; | 215 | continue; |
| 200 | 216 | ||
| 217 | func = data->csd.func; /* for later warn */ | ||
| 201 | data->csd.func(data->csd.info); | 218 | data->csd.func(data->csd.info); |
| 202 | 219 | ||
| 220 | /* | ||
| 221 | * If the cpu mask is not still set then it enabled interrupts, | ||
| 222 | * we took another smp interrupt, and executed the function | ||
| 223 | * twice on this cpu. In theory that copy decremented refs. | ||
| 224 | */ | ||
| 225 | if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { | ||
| 226 | WARN(1, "%pS enabled interrupts and double executed\n", | ||
| 227 | func); | ||
| 228 | continue; | ||
| 229 | } | ||
| 230 | |||
| 203 | refs = atomic_dec_return(&data->refs); | 231 | refs = atomic_dec_return(&data->refs); |
| 204 | WARN_ON(refs < 0); | 232 | WARN_ON(refs < 0); |
| 205 | if (!refs) { | ||
| 206 | raw_spin_lock(&call_function.lock); | ||
| 207 | list_del_rcu(&data->csd.list); | ||
| 208 | raw_spin_unlock(&call_function.lock); | ||
| 209 | } | ||
| 210 | 233 | ||
| 211 | if (refs) | 234 | if (refs) |
| 212 | continue; | 235 | continue; |
| 213 | 236 | ||
| 237 | WARN_ON(!cpumask_empty(data->cpumask)); | ||
| 238 | |||
| 239 | raw_spin_lock(&call_function.lock); | ||
| 240 | list_del_rcu(&data->csd.list); | ||
| 241 | raw_spin_unlock(&call_function.lock); | ||
| 242 | |||
| 214 | csd_unlock(&data->csd); | 243 | csd_unlock(&data->csd); |
| 215 | } | 244 | } |
| 216 | 245 | ||
| @@ -430,7 +459,7 @@ void smp_call_function_many(const struct cpumask *mask, | |||
| 430 | * can't happen. | 459 | * can't happen. |
| 431 | */ | 460 | */ |
| 432 | WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() | 461 | WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() |
| 433 | && !oops_in_progress); | 462 | && !oops_in_progress && !early_boot_irqs_disabled); |
| 434 | 463 | ||
| 435 | /* So, what's a CPU they want? Ignoring this one. */ | 464 | /* So, what's a CPU they want? Ignoring this one. */ |
| 436 | cpu = cpumask_first_and(mask, cpu_online_mask); | 465 | cpu = cpumask_first_and(mask, cpu_online_mask); |
| @@ -454,11 +483,21 @@ void smp_call_function_many(const struct cpumask *mask, | |||
| 454 | 483 | ||
| 455 | data = &__get_cpu_var(cfd_data); | 484 | data = &__get_cpu_var(cfd_data); |
| 456 | csd_lock(&data->csd); | 485 | csd_lock(&data->csd); |
| 486 | BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); | ||
| 457 | 487 | ||
| 458 | data->csd.func = func; | 488 | data->csd.func = func; |
| 459 | data->csd.info = info; | 489 | data->csd.info = info; |
| 460 | cpumask_and(data->cpumask, mask, cpu_online_mask); | 490 | cpumask_and(data->cpumask, mask, cpu_online_mask); |
| 461 | cpumask_clear_cpu(this_cpu, data->cpumask); | 491 | cpumask_clear_cpu(this_cpu, data->cpumask); |
| 492 | |||
| 493 | /* | ||
| 494 | * To ensure the interrupt handler gets an complete view | ||
| 495 | * we order the cpumask and refs writes and order the read | ||
| 496 | * of them in the interrupt handler. In addition we may | ||
| 497 | * only clear our own cpu bit from the mask. | ||
| 498 | */ | ||
| 499 | smp_wmb(); | ||
| 500 | |||
| 462 | atomic_set(&data->refs, cpumask_weight(data->cpumask)); | 501 | atomic_set(&data->refs, cpumask_weight(data->cpumask)); |
| 463 | 502 | ||
| 464 | raw_spin_lock_irqsave(&call_function.lock, flags); | 503 | raw_spin_lock_irqsave(&call_function.lock, flags); |
| @@ -533,17 +572,20 @@ void ipi_call_unlock_irq(void) | |||
| 533 | #endif /* USE_GENERIC_SMP_HELPERS */ | 572 | #endif /* USE_GENERIC_SMP_HELPERS */ |
| 534 | 573 | ||
| 535 | /* | 574 | /* |
| 536 | * Call a function on all processors | 575 | * Call a function on all processors. May be used during early boot while |
| 576 | * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead | ||
| 577 | * of local_irq_disable/enable(). | ||
| 537 | */ | 578 | */ |
| 538 | int on_each_cpu(void (*func) (void *info), void *info, int wait) | 579 | int on_each_cpu(void (*func) (void *info), void *info, int wait) |
| 539 | { | 580 | { |
| 581 | unsigned long flags; | ||
| 540 | int ret = 0; | 582 | int ret = 0; |
| 541 | 583 | ||
| 542 | preempt_disable(); | 584 | preempt_disable(); |
| 543 | ret = smp_call_function(func, info, wait); | 585 | ret = smp_call_function(func, info, wait); |
| 544 | local_irq_disable(); | 586 | local_irq_save(flags); |
| 545 | func(info); | 587 | func(info); |
| 546 | local_irq_enable(); | 588 | local_irq_restore(flags); |
| 547 | preempt_enable(); | 589 | preempt_enable(); |
| 548 | return ret; | 590 | return ret; |
| 549 | } | 591 | } |
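The smp.c hunks close a reuse race by ordering writes on the sender (fill data->cpumask, smp_wmb(), then set data->refs) against reads in the IPI handler (test the cpumask, smp_rmb(), then read refs), and by detecting double execution through the already-cleared cpumask bit. The sketch below reproduces only that publish/consume ordering in userspace, with C11 fences standing in for the kernel barriers; the data layout and helper names are assumptions, not the kernel's call_function_data.

```c
/*
 * Sender publishes the cpumask before refs (release fence = smp_wmb());
 * the handler checks the cpumask, then refs (acquire fence = smp_rmb()),
 * so it never acts on a half-initialised, reused entry.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct call_data {
        _Atomic bool cpumask_bit;   /* stands in for "this cpu is set in data->cpumask" */
        _Atomic int  refs;
        void       (*func)(void *);
        void        *info;
};

static void sender_publish(struct call_data *d, void (*func)(void *), void *info, int ncpus)
{
        d->func = func;
        d->info = info;
        atomic_store_explicit(&d->cpumask_bit, true, memory_order_relaxed);
        atomic_thread_fence(memory_order_release);             /* like smp_wmb() */
        atomic_store_explicit(&d->refs, ncpus, memory_order_relaxed);
}

static void ipi_handler(struct call_data *d)
{
        if (!atomic_load_explicit(&d->cpumask_bit, memory_order_relaxed))
                return;                                         /* not for us / stale */
        atomic_thread_fence(memory_order_acquire);              /* like smp_rmb() */
        if (atomic_load_explicit(&d->refs, memory_order_relaxed) == 0)
                return;                                         /* entry being reused */

        d->func(d->info);
        atomic_store_explicit(&d->cpumask_bit, false, memory_order_relaxed);
        atomic_fetch_sub(&d->refs, 1);
}

static void say_hi(void *info) { printf("callback ran: %s\n", (const char *)info); }

int main(void)
{
        struct call_data d = { 0 };
        sender_publish(&d, say_hi, "hello", 1);
        ipi_handler(&d);
        return 0;
}
```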
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 5cf8c602b88..92b6e1e12d9 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
| @@ -453,14 +453,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) | |||
| 453 | * Stubs: | 453 | * Stubs: |
| 454 | */ | 454 | */ |
| 455 | 455 | ||
| 456 | void early_boot_irqs_off(void) | ||
| 457 | { | ||
| 458 | } | ||
| 459 | |||
| 460 | void early_boot_irqs_on(void) | ||
| 461 | { | ||
| 462 | } | ||
| 463 | |||
| 464 | void trace_softirqs_on(unsigned long ip) | 456 | void trace_softirqs_on(unsigned long ip) |
| 465 | { | 457 | { |
| 466 | } | 458 | } |
