Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c            17
-rw-r--r--  kernel/events/core.c        1
-rw-r--r--  kernel/irq/chip.c           8
-rw-r--r--  kernel/irq/internals.h      3
-rw-r--r--  kernel/irq/manage.c        39
-rw-r--r--  kernel/irq/migration.c     13
-rw-r--r--  kernel/sched/core.c        68
-rw-r--r--  kernel/sched/fair.c        42
-rw-r--r--  kernel/sched/rt.c          51
-rw-r--r--  kernel/smpboot.c           17
-rw-r--r--  kernel/sys.c               60
-rw-r--r--  kernel/time/clockevents.c   3
-rw-r--r--  kernel/time/tick-sched.c   19
-rw-r--r--  kernel/time/timekeeping.c   2
14 files changed, 227 insertions(+), 116 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0f3527d6184a..72fcd3069a90 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -896,10 +896,13 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
         mutex_unlock(&cgroup_mutex);
 
         /*
-         * Drop the active superblock reference that we took when we
-         * created the cgroup
+         * We want to drop the active superblock reference from the
+         * cgroup creation after all the dentry refs are gone -
+         * kill_sb gets mighty unhappy otherwise. Mark
+         * dentry->d_fsdata with cgroup_diput() to tell
+         * cgroup_d_release() to call deactivate_super().
          */
-        deactivate_super(cgrp->root->sb);
+        dentry->d_fsdata = cgroup_diput;
 
         /*
          * if we're getting rid of the cgroup, refcount should ensure
@@ -925,6 +928,13 @@ static int cgroup_delete(const struct dentry *d)
         return 1;
 }
 
+static void cgroup_d_release(struct dentry *dentry)
+{
+        /* did cgroup_diput() tell me to deactivate super? */
+        if (dentry->d_fsdata == cgroup_diput)
+                deactivate_super(dentry->d_sb);
+}
+
 static void remove_dir(struct dentry *d)
 {
         struct dentry *parent = dget(d->d_parent);
@@ -1532,6 +1542,7 @@ static int cgroup_get_rootdir(struct super_block *sb)
         static const struct dentry_operations cgroup_dops = {
                 .d_iput = cgroup_diput,
                 .d_delete = cgroup_delete,
+                .d_release = cgroup_d_release,
         };
 
         struct inode *inode =
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5b06cbbf6931..f85c0154b333 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3181,7 +3181,6 @@ static void perf_event_for_each(struct perf_event *event,
         event = event->group_leader;
 
         perf_event_for_each_child(event, func);
-        func(event);
         list_for_each_entry(sibling, &event->sibling_list, group_entry)
                 perf_event_for_each_child(sibling, func);
         mutex_unlock(&ctx->mutex);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index fc275e4f629b..eebd6d5cfb44 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -275,8 +275,10 @@ void handle_nested_irq(unsigned int irq)
         kstat_incr_irqs_this_cpu(irq, desc);
 
         action = desc->action;
-        if (unlikely(!action || irqd_irq_disabled(&desc->irq_data)))
+        if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) {
+                desc->istate |= IRQS_PENDING;
                 goto out_unlock;
+        }
 
         irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
         raw_spin_unlock_irq(&desc->lock);
@@ -324,8 +326,10 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
         desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
         kstat_incr_irqs_this_cpu(irq, desc);
 
-        if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
+        if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+                desc->istate |= IRQS_PENDING;
                 goto out_unlock;
+        }
 
         handle_irq_event(desc);
 
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 8e5c56b3b7d9..001fa5bab490 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -101,6 +101,9 @@ extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
 
 extern void irq_set_thread_affinity(struct irq_desc *desc);
 
+extern int irq_do_set_affinity(struct irq_data *data,
+                               const struct cpumask *dest, bool force);
+
 /* Inline functions for support of irq chips on slow busses */
 static inline void chip_bus_lock(struct irq_desc *desc)
 {
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ea0c6c2ae6f7..8c548232ba39 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -142,6 +142,25 @@ static inline void
 irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
 #endif
 
+int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
+                        bool force)
+{
+        struct irq_desc *desc = irq_data_to_desc(data);
+        struct irq_chip *chip = irq_data_get_irq_chip(data);
+        int ret;
+
+        ret = chip->irq_set_affinity(data, mask, false);
+        switch (ret) {
+        case IRQ_SET_MASK_OK:
+                cpumask_copy(data->affinity, mask);
+        case IRQ_SET_MASK_OK_NOCOPY:
+                irq_set_thread_affinity(desc);
+                ret = 0;
+        }
+
+        return ret;
+}
+
 int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
 {
         struct irq_chip *chip = irq_data_get_irq_chip(data);
@@ -152,14 +171,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
                 return -EINVAL;
 
         if (irq_can_move_pcntxt(data)) {
-                ret = chip->irq_set_affinity(data, mask, false);
-                switch (ret) {
-                case IRQ_SET_MASK_OK:
-                        cpumask_copy(data->affinity, mask);
-                case IRQ_SET_MASK_OK_NOCOPY:
-                        irq_set_thread_affinity(desc);
-                        ret = 0;
-                }
+                ret = irq_do_set_affinity(data, mask, false);
         } else {
                 irqd_set_move_pending(data);
                 irq_copy_pending(desc, mask);
@@ -283,9 +295,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
 static int
 setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
 {
-        struct irq_chip *chip = irq_desc_get_chip(desc);
         struct cpumask *set = irq_default_affinity;
-        int ret, node = desc->irq_data.node;
+        int node = desc->irq_data.node;
 
         /* Excludes PER_CPU and NO_BALANCE interrupts */
         if (!irq_can_set_affinity(irq))
@@ -311,13 +322,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
                 if (cpumask_intersects(mask, nodemask))
                         cpumask_and(mask, mask, nodemask);
         }
-        ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
-        switch (ret) {
-        case IRQ_SET_MASK_OK:
-                cpumask_copy(desc->irq_data.affinity, mask);
-        case IRQ_SET_MASK_OK_NOCOPY:
-                irq_set_thread_affinity(desc);
-        }
+        irq_do_set_affinity(&desc->irq_data, mask, false);
         return 0;
 }
 #else
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index c3c89751b327..ca3f4aaff707 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -42,17 +42,8 @@ void irq_move_masked_irq(struct irq_data *idata)
          * For correct operation this depends on the caller
          * masking the irqs.
          */
-        if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
-                   < nr_cpu_ids)) {
-                int ret = chip->irq_set_affinity(&desc->irq_data,
-                                                 desc->pending_mask, false);
-                switch (ret) {
-                case IRQ_SET_MASK_OK:
-                        cpumask_copy(desc->irq_data.affinity, desc->pending_mask);
-                case IRQ_SET_MASK_OK_NOCOPY:
-                        irq_set_thread_affinity(desc);
-                }
-        }
+        if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids)
+                irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false);
 
         cpumask_clear(desc->pending_mask);
 }
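Note on the genirq hunks above: the open-coded handling of the chip callback's return value is now funnelled through the new irq_do_set_affinity() helper. Below is a rough, userspace-only model of that return-value contract, included purely as an illustration (the enum and printf stand-ins are made up for this sketch, not kernel code): IRQ_SET_MASK_OK means the core still copies the mask into data->affinity, IRQ_SET_MASK_OK_NOCOPY means the chip already wrote the effective mask itself, and both cases deliberately fall through to the irq-thread-affinity update.

#include <stdio.h>

/* illustrative stand-ins for the kernel's return codes */
enum { IRQ_SET_MASK_OK, IRQ_SET_MASK_OK_NOCOPY };

static void handle_chip_ret(int ret)
{
        switch (ret) {
        case IRQ_SET_MASK_OK:
                printf("core copies the mask into data->affinity\n");
                /* deliberate fall-through, as in irq_do_set_affinity() */
        case IRQ_SET_MASK_OK_NOCOPY:
                printf("core refreshes the irq thread affinity, returns 0\n");
                break;
        default:
                printf("error %d is passed back to the caller\n", ret);
        }
}

int main(void)
{
        handle_chip_ret(IRQ_SET_MASK_OK);
        handle_chip_ret(IRQ_SET_MASK_OK_NOCOPY);
        handle_chip_ret(-22);        /* e.g. an -EINVAL from the chip */
        return 0;
}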
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 39eb6011bc38..c46958e26121 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -142,9 +142,8 @@ const_debug unsigned int sysctl_sched_features =
 #define SCHED_FEAT(name, enabled)        \
         #name ,
 
-static __read_mostly char *sched_feat_names[] = {
+static const char * const sched_feat_names[] = {
 #include "features.h"
-        NULL
 };
 
 #undef SCHED_FEAT
@@ -2517,25 +2516,32 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
         sched_avg_update(this_rq);
 }
 
+#ifdef CONFIG_NO_HZ
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we cannot use the delta approach from the regular tick since that
+ * would seriously skew the load calculation. However we'll make do for those
+ * updates happening while idle (nohz_idle_balance) or coming out of idle
+ * (tick_nohz_idle_exit).
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
 /*
  * Called from nohz_idle_balance() to update the load ratings before doing the
  * idle balance.
  */
 void update_idle_cpu_load(struct rq *this_rq)
 {
-        unsigned long curr_jiffies = jiffies;
+        unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
         unsigned long load = this_rq->load.weight;
         unsigned long pending_updates;
 
         /*
-         * Bloody broken means of dealing with nohz, but better than nothing..
-         * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
-         * update and see 0 difference the one time and 2 the next, even though
-         * we ticked at roughtly the same rate.
-         *
-         * Hence we only use this from nohz_idle_balance() and skip this
-         * nonsense when called from the scheduler_tick() since that's
-         * guaranteed a stable rate.
+         * bail if there's load or we're actually up-to-date.
          */
         if (load || curr_jiffies == this_rq->last_load_update_tick)
                 return;
@@ -2547,12 +2553,38 @@ void update_idle_cpu_load(struct rq *this_rq)
 }
 
 /*
+ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ */
+void update_cpu_load_nohz(void)
+{
+        struct rq *this_rq = this_rq();
+        unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+        unsigned long pending_updates;
+
+        if (curr_jiffies == this_rq->last_load_update_tick)
+                return;
+
+        raw_spin_lock(&this_rq->lock);
+        pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+        if (pending_updates) {
+                this_rq->last_load_update_tick = curr_jiffies;
+                /*
+                 * We were idle, this means load 0, the current load might be
+                 * !0 due to remote wakeups and the sort.
+                 */
+                __update_cpu_load(this_rq, 0, pending_updates);
+        }
+        raw_spin_unlock(&this_rq->lock);
+}
+#endif /* CONFIG_NO_HZ */
+
+/*
  * Called from scheduler_tick()
  */
 static void update_cpu_load_active(struct rq *this_rq)
 {
         /*
-         * See the mess in update_idle_cpu_load().
+         * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
          */
         this_rq->last_load_update_tick = jiffies;
         __update_cpu_load(this_rq, this_rq->load.weight, 1);
@@ -4982,7 +5014,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
                 p->sched_class->set_cpus_allowed(p, new_mask);
 
         cpumask_copy(&p->cpus_allowed, new_mask);
-        p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
+        p->nr_cpus_allowed = cpumask_weight(new_mask);
 }
 
 /*
@@ -5997,11 +6029,14 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 
                 cpumask_or(covered, covered, sg_span);
 
-                sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
+                sg->sgp = *per_cpu_ptr(sdd->sgp, i);
                 atomic_inc(&sg->sgp->ref);
 
-                if (cpumask_test_cpu(cpu, sg_span))
+                if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
+                    cpumask_first(sg_span) == cpu) {
+                        WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span));
                         groups = sg;
+                }
 
                 if (!first)
                         first = sg;
@@ -6403,7 +6438,7 @@ static void sched_init_numa(void)
                 return;
 
         for (j = 0; j < nr_node_ids; j++) {
-                struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
+                struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
                 if (!mask)
                         return;
 
@@ -6691,7 +6726,6 @@ static int init_sched_domains(const struct cpumask *cpu_map)
         if (!doms_cur)
                 doms_cur = &fallback_doms;
         cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
-        dattr_cur = NULL;
         err = build_sched_domains(doms_cur[0], NULL);
         register_sched_domain_sysctl();
 
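The new comment block added above describes why per-cpu jiffies deltas cannot be trusted under nohz: a cpu reading jiffies at roughly the tick rate, but with its phase straddling the updating cpu's boundary, observes deltas of {0,2} instead of {1,1}. A rough userspace model of that effect follows, included only as an illustration; the 0.05 phase jitter and six samples are arbitrary choices for the sketch.

#include <stdio.h>

int main(void)
{
        unsigned long prev = 0;
        int i;

        for (i = 1; i <= 6; i++) {
                /* samples land just after, then just before, each jiffies update */
                double now = i + ((i & 1) ? 0.05 : -0.05);
                unsigned long jiffies = (unsigned long)now;   /* truncate like a read */

                printf("sample %d at t=%.2f: jiffies=%lu delta=%lu\n",
                       i, now, jiffies, jiffies - prev);
                prev = jiffies;
        }
        return 0;
}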
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 940e6d17cf96..b2a2d236f27b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
         int want_sd = 1;
         int sync = wake_flags & WF_SYNC;
 
-        if (p->rt.nr_cpus_allowed == 1)
+        if (p->nr_cpus_allowed == 1)
                 return prev_cpu;
 
         if (sd_flag & SD_BALANCE_WAKE) {
@@ -3503,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
 unsigned long scale_rt_power(int cpu)
 {
         struct rq *rq = cpu_rq(cpu);
-        u64 total, available;
+        u64 total, available, age_stamp, avg;
 
-        total = sched_avg_period() + (rq->clock - rq->age_stamp);
+        /*
+         * Since we're reading these variables without serialization make sure
+         * we read them once before doing sanity checks on them.
+         */
+        age_stamp = ACCESS_ONCE(rq->age_stamp);
+        avg = ACCESS_ONCE(rq->rt_avg);
+
+        total = sched_avg_period() + (rq->clock - age_stamp);
 
-        if (unlikely(total < rq->rt_avg)) {
+        if (unlikely(total < avg)) {
                 /* Ensures that power won't end up being negative */
                 available = 0;
         } else {
-                available = total - rq->rt_avg;
+                available = total - avg;
         }
 
         if (unlikely((s64)total < SCHED_POWER_SCALE))
@@ -3574,11 +3581,26 @@ void update_group_power(struct sched_domain *sd, int cpu)
 
         power = 0;
 
-        group = child->groups;
-        do {
-                power += group->sgp->power;
-                group = group->next;
-        } while (group != child->groups);
+        if (child->flags & SD_OVERLAP) {
+                /*
+                 * SD_OVERLAP domains cannot assume that child groups
+                 * span the current group.
+                 */
+
+                for_each_cpu(cpu, sched_group_cpus(sdg))
+                        power += power_of(cpu);
+        } else {
+                /*
+                 * !SD_OVERLAP domains can assume that child groups
+                 * span the current group.
+                 */
+
+                group = child->groups;
+                do {
+                        power += group->sgp->power;
+                        group = group->next;
+                } while (group != child->groups);
+        }
 
         sdg->sgp->power = power;
 }
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c5565c3c515f..2a4e8dffbd6b 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq)
 
 static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
+        struct task_struct *p;
+
         if (!rt_entity_is_task(rt_se))
                 return;
 
+        p = rt_task_of(rt_se);
         rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 
         rt_rq->rt_nr_total++;
-        if (rt_se->nr_cpus_allowed > 1)
+        if (p->nr_cpus_allowed > 1)
                 rt_rq->rt_nr_migratory++;
 
         update_rt_migration(rt_rq);
@@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 
 static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
+        struct task_struct *p;
+
         if (!rt_entity_is_task(rt_se))
                 return;
 
+        p = rt_task_of(rt_se);
         rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 
         rt_rq->rt_nr_total--;
-        if (rt_se->nr_cpus_allowed > 1)
+        if (p->nr_cpus_allowed > 1)
                 rt_rq->rt_nr_migratory--;
 
         update_rt_migration(rt_rq);
@@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 
         enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
 
-        if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
+        if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
                 enqueue_pushable_task(rq, p);
 
         inc_nr_running(rq);
@@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
 
         cpu = task_cpu(p);
 
-        if (p->rt.nr_cpus_allowed == 1)
+        if (p->nr_cpus_allowed == 1)
                 goto out;
 
         /* For anything but wake ups, just return the task_cpu */
@@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
          * will have to sort it out.
          */
         if (curr && unlikely(rt_task(curr)) &&
-            (curr->rt.nr_cpus_allowed < 2 ||
+            (curr->nr_cpus_allowed < 2 ||
              curr->prio <= p->prio) &&
-            (p->rt.nr_cpus_allowed > 1)) {
+            (p->nr_cpus_allowed > 1)) {
                 int target = find_lowest_rq(p);
 
                 if (target != -1)
@@ -1276,10 +1282,10 @@ out:
 
 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 {
-        if (rq->curr->rt.nr_cpus_allowed == 1)
+        if (rq->curr->nr_cpus_allowed == 1)
                 return;
 
-        if (p->rt.nr_cpus_allowed != 1
+        if (p->nr_cpus_allowed != 1
             && cpupri_find(&rq->rd->cpupri, p, NULL))
                 return;
 
@@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
          * The previous task needs to be made eligible for pushing
          * if it is still active
          */
-        if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
+        if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
                 enqueue_pushable_task(rq, p);
 }
 
@@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 {
         if (!task_running(rq, p) &&
             (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
-            (p->rt.nr_cpus_allowed > 1))
+            (p->nr_cpus_allowed > 1))
                 return 1;
         return 0;
 }
@@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task)
         if (unlikely(!lowest_mask))
                 return -1;
 
-        if (task->rt.nr_cpus_allowed == 1)
+        if (task->nr_cpus_allowed == 1)
                 return -1; /* No other targets possible */
 
         if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
@@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
 
         BUG_ON(rq->cpu != task_cpu(p));
         BUG_ON(task_current(rq, p));
-        BUG_ON(p->rt.nr_cpus_allowed <= 1);
+        BUG_ON(p->nr_cpus_allowed <= 1);
 
         BUG_ON(!p->on_rq);
         BUG_ON(!rt_task(p));
@@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
         if (!task_running(rq, p) &&
             !test_tsk_need_resched(rq->curr) &&
             has_pushable_tasks(rq) &&
-            p->rt.nr_cpus_allowed > 1 &&
+            p->nr_cpus_allowed > 1 &&
             rt_task(rq->curr) &&
-            (rq->curr->rt.nr_cpus_allowed < 2 ||
+            (rq->curr->nr_cpus_allowed < 2 ||
              rq->curr->prio <= p->prio))
                 push_rt_tasks(rq);
 }
@@ -1817,7 +1823,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
          * Only update if the process changes its state from whether it
          * can migrate or not.
          */
-        if ((p->rt.nr_cpus_allowed > 1) == (weight > 1))
+        if ((p->nr_cpus_allowed > 1) == (weight > 1))
                 return;
 
         rq = task_rq(p);
@@ -1979,6 +1985,8 @@ static void watchdog(struct rq *rq, struct task_struct *p)
 
 static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 {
+        struct sched_rt_entity *rt_se = &p->rt;
+
         update_curr_rt(rq);
 
         watchdog(rq, p);
@@ -1996,12 +2004,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
                 p->rt.time_slice = RR_TIMESLICE;
 
         /*
-         * Requeue to the end of queue if we are not the only element
-         * on the queue:
+         * Requeue to the end of queue if we (and all of our ancestors) are the
+         * only element on the queue
          */
-        if (p->rt.run_list.prev != p->rt.run_list.next) {
-                requeue_task_rt(rq, p, 0);
-                set_tsk_need_resched(p);
+        for_each_sched_rt_entity(rt_se) {
+                if (rt_se->run_list.prev != rt_se->run_list.next) {
+                        requeue_task_rt(rq, p, 0);
+                        set_tsk_need_resched(p);
+                        return;
+                }
         }
 }
 
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index e1a797e028a3..98f60c5caa1b 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -31,6 +31,12 @@ void __init idle_thread_set_boot_cpu(void)
         per_cpu(idle_threads, smp_processor_id()) = current;
 }
 
+/**
+ * idle_init - Initialize the idle thread for a cpu
+ * @cpu:        The cpu for which the idle thread should be initialized
+ *
+ * Creates the thread if it does not exist.
+ */
 static inline void idle_init(unsigned int cpu)
 {
         struct task_struct *tsk = per_cpu(idle_threads, cpu);
@@ -45,17 +51,16 @@ static inline void idle_init(unsigned int cpu)
 }
 
 /**
- * idle_thread_init - Initialize the idle thread for a cpu
- * @cpu:        The cpu for which the idle thread should be initialized
- *
- * Creates the thread if it does not exist.
+ * idle_threads_init - Initialize idle threads for all cpus
  */
 void __init idle_threads_init(void)
 {
-        unsigned int cpu;
+        unsigned int cpu, boot_cpu;
+
+        boot_cpu = smp_processor_id();
 
         for_each_possible_cpu(cpu) {
-                if (cpu != smp_processor_id())
+                if (cpu != boot_cpu)
                         idle_init(cpu);
         }
 }
diff --git a/kernel/sys.c b/kernel/sys.c
index 9ff89cb9657a..f0ec44dcd415 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1786,27 +1786,13 @@ SYSCALL_DEFINE1(umask, int, mask)
 }
 
 #ifdef CONFIG_CHECKPOINT_RESTORE
-static bool vma_flags_mismatch(struct vm_area_struct *vma,
-                               unsigned long required,
-                               unsigned long banned)
-{
-        return (vma->vm_flags & required) != required ||
-               (vma->vm_flags & banned);
-}
-
 static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
 {
+        struct vm_area_struct *vma;
         struct file *exe_file;
         struct dentry *dentry;
         int err;
 
-        /*
-         * Setting new mm::exe_file is only allowed when no VM_EXECUTABLE vma's
-         * remain. So perform a quick test first.
-         */
-        if (mm->num_exe_file_vmas)
-                return -EBUSY;
-
         exe_file = fget(fd);
         if (!exe_file)
                 return -EBADF;
@@ -1827,17 +1813,30 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
         if (err)
                 goto exit;
 
+        down_write(&mm->mmap_sem);
+
+        /*
+         * Forbid mm->exe_file change if there are mapped other files.
+         */
+        err = -EBUSY;
+        for (vma = mm->mmap; vma; vma = vma->vm_next) {
+                if (vma->vm_file && !path_equal(&vma->vm_file->f_path,
+                                                &exe_file->f_path))
+                        goto exit_unlock;
+        }
+
         /*
          * The symlink can be changed only once, just to disallow arbitrary
          * transitions malicious software might bring in. This means one
          * could make a snapshot over all processes running and monitor
          * /proc/pid/exe changes to notice unusual activity if needed.
          */
-        down_write(&mm->mmap_sem);
-        if (likely(!mm->exe_file))
-                set_mm_exe_file(mm, exe_file);
-        else
-                err = -EBUSY;
+        err = -EPERM;
+        if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
+                goto exit_unlock;
+
+        set_mm_exe_file(mm, exe_file);
+exit_unlock:
         up_write(&mm->mmap_sem);
 
 exit:
@@ -1862,7 +1861,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
         if (opt == PR_SET_MM_EXE_FILE)
                 return prctl_set_mm_exe_file(mm, (unsigned int)addr);
 
-        if (addr >= TASK_SIZE)
+        if (addr >= TASK_SIZE || addr < mmap_min_addr)
                 return -EINVAL;
 
         error = -EINVAL;
@@ -1924,12 +1923,6 @@ static int prctl_set_mm(int opt, unsigned long addr,
                         error = -EFAULT;
                         goto out;
                 }
-#ifdef CONFIG_STACK_GROWSUP
-                if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSUP, 0))
-#else
-                if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSDOWN, 0))
-#endif
-                        goto out;
                 if (opt == PR_SET_MM_START_STACK)
                         mm->start_stack = addr;
                 else if (opt == PR_SET_MM_ARG_START)
@@ -1981,12 +1974,22 @@ out:
         up_read(&mm->mmap_sem);
         return error;
 }
+
+static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
+{
+        return put_user(me->clear_child_tid, tid_addr);
+}
+
 #else /* CONFIG_CHECKPOINT_RESTORE */
 static int prctl_set_mm(int opt, unsigned long addr,
                         unsigned long arg4, unsigned long arg5)
 {
         return -EINVAL;
 }
+static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
+{
+        return -EINVAL;
+}
 #endif
 
 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
@@ -2124,6 +2127,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                 else
                         return -EINVAL;
                 break;
+        case PR_GET_TID_ADDRESS:
+                error = prctl_get_tid_address(me, (int __user **)arg2);
+                break;
         default:
                 return -EINVAL;
         }
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 9cd928f7a7c6..7e1ce012a851 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -297,8 +297,7 @@ void clockevents_register_device(struct clock_event_device *dev)
 }
 EXPORT_SYMBOL_GPL(clockevents_register_device);
 
-static void clockevents_config(struct clock_event_device *dev,
-                               u32 freq)
+void clockevents_config(struct clock_event_device *dev, u32 freq)
 {
         u64 sec;
 
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6a3a5b9ff561..da70c6db496c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -576,6 +576,7 @@ void tick_nohz_idle_exit(void)
         /* Update jiffies first */
         select_nohz_load_balancer(0);
         tick_do_update_jiffies64(now);
+        update_cpu_load_nohz();
 
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
         /*
@@ -814,6 +815,16 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
         return HRTIMER_RESTART;
 }
 
+static int sched_skew_tick;
+
+static int __init skew_tick(char *str)
+{
+        get_option(&str, &sched_skew_tick);
+
+        return 0;
+}
+early_param("skew_tick", skew_tick);
+
 /**
  * tick_setup_sched_timer - setup the tick emulation timer
  */
@@ -831,6 +842,14 @@ void tick_setup_sched_timer(void)
         /* Get the next period (per cpu) */
         hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
 
+        /* Offset the tick to avert xtime_lock contention. */
+        if (sched_skew_tick) {
+                u64 offset = ktime_to_ns(tick_period) >> 1;
+                do_div(offset, num_possible_cpus());
+                offset *= smp_processor_id();
+                hrtimer_add_expires_ns(&ts->sched_timer, offset);
+        }
+
         for (;;) {
                 hrtimer_forward(&ts->sched_timer, now, tick_period);
                 hrtimer_start_expires(&ts->sched_timer,
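For a feel of the skew_tick= offset computed in tick_setup_sched_timer() above, here is a small userspace model of the arithmetic; HZ=1000 (a 1,000,000 ns tick_period) and four possible cpus are assumptions made for this example, not values taken from the patch. Each cpu's first expiry is pushed back by cpu_id * (tick_period / 2) / num_possible_cpus(), so cpu0 through cpu3 come out at 0, 125000, 250000 and 375000 ns.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        const uint64_t tick_period_ns = 1000000;        /* assumes HZ=1000 */
        const unsigned int possible_cpus = 4;           /* assumed for the example */
        unsigned int cpu;

        for (cpu = 0; cpu < possible_cpus; cpu++) {
                uint64_t offset = tick_period_ns >> 1;  /* half a tick period */

                offset /= possible_cpus;                /* do_div() equivalent */
                offset *= cpu;                          /* per-cpu skew */
                printf("cpu%u: first tick skewed by %llu ns\n",
                       cpu, (unsigned long long)offset);
        }
        return 0;
}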
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6e46cacf5969..6f46a00a1e8a 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -962,6 +962,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
                 timekeeper.xtime.tv_sec++;
                 leap = second_overflow(timekeeper.xtime.tv_sec);
                 timekeeper.xtime.tv_sec += leap;
+                timekeeper.wall_to_monotonic.tv_sec -= leap;
         }
 
         /* Accumulate raw time */
@@ -1077,6 +1078,7 @@ static void update_wall_time(void)
                 timekeeper.xtime.tv_sec++;
                 leap = second_overflow(timekeeper.xtime.tv_sec);
                 timekeeper.xtime.tv_sec += leap;
+                timekeeper.wall_to_monotonic.tv_sec -= leap;
         }
 
         timekeeping_update(false);
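The two timekeeping hunks keep the monotonic clock continuous across a leap second: monotonic time is derived from xtime plus wall_to_monotonic, so adding the leap to xtime.tv_sec while subtracting it from wall_to_monotonic.tv_sec leaves the sum unchanged. A tiny userspace sketch of that invariant follows; the sample values are made up for illustration and are not taken from the patch.

#include <assert.h>

int main(void)
{
        long xtime_sec = 1341100799;            /* example wall time near a leap second */
        long wall_to_mono_sec = -1341000000;    /* made-up offset for the sketch */
        long mono_before = xtime_sec + wall_to_mono_sec;
        int leap = 1;                           /* second_overflow() reported +1s */

        xtime_sec += leap;                      /* what the code already did */
        wall_to_mono_sec -= leap;               /* what the added lines do */

        /* the derived monotonic value is unchanged by the leap second */
        assert(xtime_sec + wall_to_mono_sec == mono_before);
        return 0;
}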