41 files changed, 525 insertions, 345 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 7c2893602d06..47845c57eb19 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -643,13 +643,13 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
                if ((task_active_pid_ns(current) != &init_pid_ns))
                        return -EPERM;
-                if (!capable(CAP_AUDIT_CONTROL))
+                if (!netlink_capable(skb, CAP_AUDIT_CONTROL))
                        err = -EPERM;
                break;
        case AUDIT_USER:
        case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
        case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
-                if (!capable(CAP_AUDIT_WRITE))
+                if (!netlink_capable(skb, CAP_AUDIT_WRITE))
                        err = -EPERM;
                break;
        default:  /* bad msg */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9fcdaa705b6c..3f1ca934a237 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -348,7 +348,7 @@ struct cgrp_cset_link {
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 */
-static struct css_set init_css_set = {
+struct css_set init_css_set = {
        .refcount               = ATOMIC_INIT(1),
        .cgrp_links             = LIST_HEAD_INIT(init_css_set.cgrp_links),
        .tasks                  = LIST_HEAD_INIT(init_css_set.tasks),
@@ -1495,7 +1495,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
         */
        if (!use_task_css_set_links)
                cgroup_enable_task_cg_lists();
-retry:
        mutex_lock(&cgroup_tree_mutex);
        mutex_lock(&cgroup_mutex);
@@ -1503,7 +1503,7 @@ retry:
        ret = parse_cgroupfs_options(data, &opts);
        if (ret)
                goto out_unlock;
+retry:
        /* look for a matching existing root */
        if (!opts.subsys_mask && !opts.none && !opts.name) {
                cgrp_dfl_root_visible = true;
@@ -1562,9 +1562,9 @@ retry:
                if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {
                        mutex_unlock(&cgroup_mutex);
                        mutex_unlock(&cgroup_tree_mutex);
-                        kfree(opts.release_agent);
-                        kfree(opts.name);
                        msleep(10);
+                        mutex_lock(&cgroup_tree_mutex);
+                        mutex_lock(&cgroup_mutex);
                        goto retry;
                }
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 2bc4a2256444..345628c78b5b 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -21,6 +21,7 @@
 #include <linux/uaccess.h>
 #include <linux/freezer.h>
 #include <linux/seq_file.h>
+#include <linux/mutex.h>
 /*
 * A cgroup is freezing if any FREEZING flags are set.  FREEZING_SELF is
@@ -42,9 +43,10 @@ enum freezer_state_flags {
 struct freezer {
        struct cgroup_subsys_state      css;
        unsigned int                    state;
-        spinlock_t                      lock;
 };
+static DEFINE_MUTEX(freezer_mutex);
 static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
 {
        return css ? container_of(css, struct freezer, css) : NULL;
@@ -93,7 +95,6 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css)
        if (!freezer)
                return ERR_PTR(-ENOMEM);
-        spin_lock_init(&freezer->lock);
        return &freezer->css;
 }
@@ -110,14 +111,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
        struct freezer *freezer = css_freezer(css);
        struct freezer *parent = parent_freezer(freezer);
-        /*
+        mutex_lock(&freezer_mutex);
-         * The following double locking and freezing state inheritance
-         * guarantee that @cgroup can never escape ancestors' freezing
-         * states.  See css_for_each_descendant_pre() for details.
-         */
-        if (parent)
-                spin_lock_irq(&parent->lock);
-        spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING);
        freezer->state |= CGROUP_FREEZER_ONLINE;
@@ -126,10 +120,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
                atomic_inc(&system_freezing_cnt);
        }
-        spin_unlock(&freezer->lock);
+        mutex_unlock(&freezer_mutex);
-        if (parent)
-                spin_unlock_irq(&parent->lock);
        return 0;
 }
@@ -144,14 +135,14 @@ static void freezer_css_offline(struct cgroup_subsys_state *css)
 {
        struct freezer *freezer = css_freezer(css);
-        spin_lock_irq(&freezer->lock);
+        mutex_lock(&freezer_mutex);
        if (freezer->state & CGROUP_FREEZING)
                atomic_dec(&system_freezing_cnt);
        freezer->state = 0;
-        spin_unlock_irq(&freezer->lock);
+        mutex_unlock(&freezer_mutex);
 }
 static void freezer_css_free(struct cgroup_subsys_state *css)
@@ -175,7 +166,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
        struct task_struct *task;
        bool clear_frozen = false;
-        spin_lock_irq(&freezer->lock);
+        mutex_lock(&freezer_mutex);
        /*
         * Make the new tasks conform to the current state of @new_css.
@@ -197,21 +188,13 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
                }
        }
-        spin_unlock_irq(&freezer->lock);
+        /* propagate FROZEN clearing upwards */
-        /*
-         * Propagate FROZEN clearing upwards.  We may race with
-         * update_if_frozen(), but as long as both work bottom-up, either
-         * update_if_frozen() sees child's FROZEN cleared or we clear the
-         * parent's FROZEN later.  No parent w/ !FROZEN children can be
-         * left FROZEN.
-         */
        while (clear_frozen && (freezer = parent_freezer(freezer))) {
-                spin_lock_irq(&freezer->lock);
                freezer->state &= ~CGROUP_FROZEN;
                clear_frozen = freezer->state & CGROUP_FREEZING;
-                spin_unlock_irq(&freezer->lock);
        }
+        mutex_unlock(&freezer_mutex);
 }
 /**
@@ -228,9 +211,6 @@ static void freezer_fork(struct task_struct *task)
 {
        struct freezer *freezer;
-        rcu_read_lock();
-        freezer = task_freezer(task);
        /*
         * The root cgroup is non-freezable, so we can skip locking the
         * freezer.  This is safe regardless of race with task migration.
@@ -238,24 +218,18 @@ static void freezer_fork(struct task_struct *task)
         * to do.  If we lost and root is the new cgroup, noop is still the
         * right thing to do.
         */
-        if (!parent_freezer(freezer))
+        if (task_css_is_root(task, freezer_cgrp_id))
-                goto out;
+                return;
-        /*
+        mutex_lock(&freezer_mutex);
-         * Grab @freezer->lock and freeze @task after verifying @task still
+        rcu_read_lock();
-         * belongs to @freezer and it's freezing.  The former is for the
-         * case where we have raced against task migration and lost and
+        freezer = task_freezer(task);
-         * @task is already in a different cgroup which may not be frozen.
+        if (freezer->state & CGROUP_FREEZING)
-         * This isn't strictly necessary as freeze_task() is allowed to be
-         * called spuriously but let's do it anyway for, if nothing else,
-         * documentation.
-         */
-        spin_lock_irq(&freezer->lock);
-        if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING))
                freeze_task(task);
-        spin_unlock_irq(&freezer->lock);
-out:
        rcu_read_unlock();
+        mutex_unlock(&freezer_mutex);
 }
 /**
@@ -281,22 +255,24 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
        struct css_task_iter it;
        struct task_struct *task;
-        WARN_ON_ONCE(!rcu_read_lock_held());
+        lockdep_assert_held(&freezer_mutex);
-        spin_lock_irq(&freezer->lock);
        if (!(freezer->state & CGROUP_FREEZING) ||
            (freezer->state & CGROUP_FROZEN))
-                goto out_unlock;
+                return;
        /* are all (live) children frozen? */
+        rcu_read_lock();
        css_for_each_child(pos, css) {
                struct freezer *child = css_freezer(pos);
                if ((child->state & CGROUP_FREEZER_ONLINE) &&
-                    !(child->state & CGROUP_FROZEN))
+                    !(child->state & CGROUP_FROZEN)) {
-                        goto out_unlock;
+                        rcu_read_unlock();
+                        return;
+                }
        }
+        rcu_read_unlock();
        /* are all tasks frozen? */
        css_task_iter_start(css, &it);
@@ -317,21 +293,29 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
        freezer->state |= CGROUP_FROZEN;
 out_iter_end:
        css_task_iter_end(&it);
-out_unlock:
-        spin_unlock_irq(&freezer->lock);
 }
 static int freezer_read(struct seq_file *m, void *v)
 {
        struct cgroup_subsys_state *css = seq_css(m), *pos;
+        mutex_lock(&freezer_mutex);
        rcu_read_lock();
        /* update states bottom-up */
-        css_for_each_descendant_post(pos, css)
+        css_for_each_descendant_post(pos, css) {
+                if (!css_tryget(pos))
+                        continue;
+                rcu_read_unlock();
                update_if_frozen(pos);
+                rcu_read_lock();
+                css_put(pos);
+        }
        rcu_read_unlock();
+        mutex_unlock(&freezer_mutex);
        seq_puts(m, freezer_state_strs(css_freezer(css)->state));
        seq_putc(m, '\n');
@@ -373,7 +357,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
                                unsigned int state)
 {
        /* also synchronizes against task migration, see freezer_attach() */
-        lockdep_assert_held(&freezer->lock);
+        lockdep_assert_held(&freezer_mutex);
        if (!(freezer->state & CGROUP_FREEZER_ONLINE))
                return;
@@ -414,31 +398,29 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
         * descendant will try to inherit its parent's FREEZING state as
         * CGROUP_FREEZING_PARENT.
         */
+        mutex_lock(&freezer_mutex);
        rcu_read_lock();
        css_for_each_descendant_pre(pos, &freezer->css) {
                struct freezer *pos_f = css_freezer(pos);
                struct freezer *parent = parent_freezer(pos_f);
-                spin_lock_irq(&pos_f->lock);
+                if (!css_tryget(pos))
+                        continue;
+                rcu_read_unlock();
-                if (pos_f == freezer) {
+                if (pos_f == freezer)
                        freezer_apply_state(pos_f, freeze,
                                            CGROUP_FREEZING_SELF);
-                } else {
+                else
-                        /*
-                         * Our update to @parent->state is already visible
-                         * which is all we need.  No need to lock @parent.
-                         * For more info on synchronization, see
-                         * freezer_post_create().
-                         */
                        freezer_apply_state(pos_f,
                                            parent->state & CGROUP_FREEZING,
                                            CGROUP_FREEZING_PARENT);
-                }
-                spin_unlock_irq(&pos_f->lock);
+                rcu_read_lock();
+                css_put(pos);
        }
        rcu_read_unlock();
+        mutex_unlock(&freezer_mutex);
 }
 static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft,
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 6cb20d2e7ee0..019d45008448 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -120,7 +120,7 @@ void context_tracking_user_enter(void)
 * instead of preempt_schedule() to exit user context if needed before
 * calling the scheduler.
 */
-asmlinkage void __sched notrace preempt_schedule_context(void)
+asmlinkage __visible void __sched notrace preempt_schedule_context(void)
 {
        enum ctx_state prev_ctx;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a9e710eef0e2..247979a1b815 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -726,10 +726,12 @@ void set_cpu_present(unsigned int cpu, bool present)
 void set_cpu_online(unsigned int cpu, bool online)
 {
-        if (online)
+        if (online) {
                cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
-        else
+                cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
+        } else {
                cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
+        }
 }
 void set_cpu_active(unsigned int cpu, bool active)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f83a71a3e46d..440eefc67397 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1443,6 +1443,11 @@ group_sched_out(struct perf_event *group_event,
                cpuctx->exclusive = 0;
 }
+struct remove_event {
+        struct perf_event *event;
+        bool detach_group;
+};
 /*
 * Cross CPU call to remove a performance event
 *
@@ -1451,12 +1456,15 @@ group_sched_out(struct perf_event *group_event,
 */
 static int __perf_remove_from_context(void *info)
 {
-        struct perf_event *event = info;
+        struct remove_event *re = info;
+        struct perf_event *event = re->event;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        raw_spin_lock(&ctx->lock);
        event_sched_out(event, cpuctx, ctx);
+        if (re->detach_group)
+                perf_group_detach(event);
        list_del_event(event, ctx);
        if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
                ctx->is_active = 0;
@@ -1481,10 +1489,14 @@ static int __perf_remove_from_context(void *info)
 * When called from perf_event_exit_task, it's OK because the
 * context has been detached from its task.
 */
-static void perf_remove_from_context(struct perf_event *event)
+static void perf_remove_from_context(struct perf_event *event, bool detach_group)
 {
        struct perf_event_context *ctx = event->ctx;
        struct task_struct *task = ctx->task;
+        struct remove_event re = {
+                .event = event,
+                .detach_group = detach_group,
+        };
        lockdep_assert_held(&ctx->mutex);
@@ -1493,12 +1505,12 @@ static void perf_remove_from_context(struct perf_event *event)
                 * Per cpu events are removed via an smp call and
                 * the removal is always successful.
                 */
-                cpu_function_call(event->cpu, __perf_remove_from_context, event);
+                cpu_function_call(event->cpu, __perf_remove_from_context, &re);
                return;
        }
 retry:
-        if (!task_function_call(task, __perf_remove_from_context, event))
+        if (!task_function_call(task, __perf_remove_from_context, &re))
                return;
        raw_spin_lock_irq(&ctx->lock);
@@ -1515,6 +1527,8 @@ retry:
         * Since the task isn't running, its safe to remove the event, us
         * holding the ctx->lock ensures the task won't get scheduled in.
         */
+        if (detach_group)
+                perf_group_detach(event);
        list_del_event(event, ctx);
        raw_spin_unlock_irq(&ctx->lock);
 }
@@ -3178,7 +3192,8 @@ static void free_event_rcu(struct rcu_head *head)
 }
 static void ring_buffer_put(struct ring_buffer *rb);
-static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
+static void ring_buffer_attach(struct perf_event *event,
+                               struct ring_buffer *rb);
 static void unaccount_event_cpu(struct perf_event *event, int cpu)
 {
@@ -3238,8 +3253,6 @@ static void free_event(struct perf_event *event)
        unaccount_event(event);
        if (event->rb) {
-                struct ring_buffer *rb;
                /*
                 * Can happen when we close an event with re-directed output.
                 *
@@ -3247,12 +3260,7 @@ static void free_event(struct perf_event *event)
                 * over us; possibly making our ring_buffer_put() the last.
                 */
                mutex_lock(&event->mmap_mutex);
-                rb = event->rb;
+                ring_buffer_attach(event, NULL);
-                if (rb) {
-                        rcu_assign_pointer(event->rb, NULL);
-                        ring_buffer_detach(event, rb);
-                        ring_buffer_put(rb); /* could be last */
-                }
                mutex_unlock(&event->mmap_mutex);
        }
@@ -3281,10 +3289,7 @@ int perf_event_release_kernel(struct perf_event *event)
         *     to trigger the AB-BA case.
         */
        mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
-        raw_spin_lock_irq(&ctx->lock);
+        perf_remove_from_context(event, true);
-        perf_group_detach(event);
-        raw_spin_unlock_irq(&ctx->lock);
-        perf_remove_from_context(event);
        mutex_unlock(&ctx->mutex);
        free_event(event);
@@ -3839,28 +3844,47 @@ unlock:
 static void ring_buffer_attach(struct perf_event *event,
                               struct ring_buffer *rb)
 {
+        struct ring_buffer *old_rb = NULL;
        unsigned long flags;
-        if (!list_empty(&event->rb_entry))
+        if (event->rb) {
-                return;
+                /*
+                 * Should be impossible, we set this when removing
+                 * event->rb_entry and wait/clear when adding event->rb_entry.
+                 */
+                WARN_ON_ONCE(event->rcu_pending);
-        spin_lock_irqsave(&rb->event_lock, flags);
+                old_rb = event->rb;
-        if (list_empty(&event->rb_entry))
+                event->rcu_batches = get_state_synchronize_rcu();
-                list_add(&event->rb_entry, &rb->event_list);
+                event->rcu_pending = 1;
-        spin_unlock_irqrestore(&rb->event_lock, flags);
-}
-static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
+                spin_lock_irqsave(&old_rb->event_lock, flags);
-{
+                list_del_rcu(&event->rb_entry);
-        unsigned long flags;
+                spin_unlock_irqrestore(&old_rb->event_lock, flags);
+        }
-        if (list_empty(&event->rb_entry))
+        if (event->rcu_pending && rb) {
-                return;
+                cond_synchronize_rcu(event->rcu_batches);
+                event->rcu_pending = 0;
+        }
+        if (rb) {
+                spin_lock_irqsave(&rb->event_lock, flags);
+                list_add_rcu(&event->rb_entry, &rb->event_list);
+                spin_unlock_irqrestore(&rb->event_lock, flags);
+        }
+        rcu_assign_pointer(event->rb, rb);
-        spin_lock_irqsave(&rb->event_lock, flags);
+        if (old_rb) {
-        list_del_init(&event->rb_entry);
+                ring_buffer_put(old_rb);
-        wake_up_all(&event->waitq);
+                /*
-        spin_unlock_irqrestore(&rb->event_lock, flags);
+                 * Since we detached before setting the new rb, so that we
+                 * could attach the new rb, we could have missed a wakeup.
+                 * Provide it now.
+                 */
+                wake_up_all(&event->waitq);
+        }
 }
 static void ring_buffer_wakeup(struct perf_event *event)
@@ -3929,7 +3953,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 {
        struct perf_event *event = vma->vm_file->private_data;
-        struct ring_buffer *rb = event->rb;
+        struct ring_buffer *rb = ring_buffer_get(event);
        struct user_struct *mmap_user = rb->mmap_user;
        int mmap_locked = rb->mmap_locked;
        unsigned long size = perf_data_size(rb);
@@ -3937,18 +3961,14 @@ static void perf_mmap_close(struct vm_area_struct *vma)
        atomic_dec(&rb->mmap_count);
        if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
-                return;
+                goto out_put;
-        /* Detach current event from the buffer. */
+        ring_buffer_attach(event, NULL);
-        rcu_assign_pointer(event->rb, NULL);
-        ring_buffer_detach(event, rb);
        mutex_unlock(&event->mmap_mutex);
        /* If there's still other mmap()s of this buffer, we're done. */
-        if (atomic_read(&rb->mmap_count)) {
+        if (atomic_read(&rb->mmap_count))
-                ring_buffer_put(rb); /* can't be last */
+                goto out_put;
-                return;
-        }
        /*
         * No other mmap()s, detach from all other events that might redirect
@@ -3978,11 +3998,9 @@ again:
                 * still restart the iteration to make sure we're not now
                 * iterating the wrong list.
                 */
-                if (event->rb == rb) {
+                if (event->rb == rb)
-                        rcu_assign_pointer(event->rb, NULL);
+                        ring_buffer_attach(event, NULL);
-                        ring_buffer_detach(event, rb);
-                        ring_buffer_put(rb); /* can't be last, we still have one */
-                }
                mutex_unlock(&event->mmap_mutex);
                put_event(event);
@@ -4007,6 +4025,7 @@ again:
        vma->vm_mm->pinned_vm -= mmap_locked;
        free_uid(mmap_user);
+out_put:
        ring_buffer_put(rb); /* could be last */
 }
@@ -4124,7 +4143,6 @@ again:
        vma->vm_mm->pinned_vm += extra;
        ring_buffer_attach(event, rb);
-        rcu_assign_pointer(event->rb, rb);
        perf_event_init_userpage(event);
        perf_event_update_userpage(event);
@@ -5408,6 +5426,9 @@ struct swevent_htable {
        /* Recursion avoidance in each contexts */
        int                             recursion[PERF_NR_CONTEXTS];
+        /* Keeps track of cpu being initialized/exited */
+        bool                            online;
 };
 static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
@@ -5654,8 +5675,14 @@ static int perf_swevent_add(struct perf_event *event, int flags)
        hwc->state = !(flags & PERF_EF_START);
        head = find_swevent_head(swhash, event);
-        if (WARN_ON_ONCE(!head))
+        if (!head) {
+                /*
+                 * We can race with cpu hotplug code. Do not
+                 * WARN if the cpu just got unplugged.
+                 */
+                WARN_ON_ONCE(swhash->online);
                return -EINVAL;
+        }
        hlist_add_head_rcu(&event->hlist_entry, head);
@@ -6914,7 +6941,7 @@ err_size:
 static int
 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 {
-        struct ring_buffer *rb = NULL, *old_rb = NULL;
+        struct ring_buffer *rb = NULL;
        int ret = -EINVAL;
        if (!output_event)
@@ -6942,8 +6969,6 @@ set:
        if (atomic_read(&event->mmap_count))
                goto unlock;
-        old_rb = event->rb;
        if (output_event) {
                /* get the rb we want to redirect to */
                rb = ring_buffer_get(output_event);
@@ -6951,23 +6976,7 @@ set:
                        goto unlock;
        }
-        if (old_rb)
+        ring_buffer_attach(event, rb);
-                ring_buffer_detach(event, old_rb);
-        if (rb)
-                ring_buffer_attach(event, rb);
-        rcu_assign_pointer(event->rb, rb);
-        if (old_rb) {
-                ring_buffer_put(old_rb);
-                /*
-                 * Since we detached before setting the new rb, so that we
-                 * could attach the new rb, we could have missed a wakeup.
-                 * Provide it now.
-                 */
-                wake_up_all(&event->waitq);
-        }
        ret = 0;
 unlock:
@@ -7018,6 +7027,9 @@ SYSCALL_DEFINE5(perf_event_open,
        if (attr.freq) {
                if (attr.sample_freq > sysctl_perf_event_sample_rate)
                        return -EINVAL;
+        } else {
+                if (attr.sample_period & (1ULL << 63))
+                        return -EINVAL;
        }
        /*
@@ -7165,7 +7177,7 @@ SYSCALL_DEFINE5(perf_event_open,
                struct perf_event_context *gctx = group_leader->ctx;
                mutex_lock(&gctx->mutex);
-                perf_remove_from_context(group_leader);
+                perf_remove_from_context(group_leader, false);
                /*
                 * Removing from the context ends up with disabled
@@ -7175,7 +7187,7 @@ SYSCALL_DEFINE5(perf_event_open,
                perf_event__state_init(group_leader);
                list_for_each_entry(sibling, &group_leader->sibling_list,
                                    group_entry) {
-                        perf_remove_from_context(sibling);
+                        perf_remove_from_context(sibling, false);
                        perf_event__state_init(sibling);
                        put_ctx(gctx);
                }
@@ -7305,7 +7317,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
        mutex_lock(&src_ctx->mutex);
        list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
                                 event_entry) {
-                perf_remove_from_context(event);
+                perf_remove_from_context(event, false);
                unaccount_event_cpu(event, src_cpu);
                put_ctx(src_ctx);
                list_add(&event->migrate_entry, &events);
@@ -7367,13 +7379,7 @@ __perf_event_exit_task(struct perf_event *child_event,
                         struct perf_event_context *child_ctx,
                         struct task_struct *child)
 {
-        if (child_event->parent) {
+        perf_remove_from_context(child_event, !!child_event->parent);
-                raw_spin_lock_irq(&child_ctx->lock);
-                perf_group_detach(child_event);
-                raw_spin_unlock_irq(&child_ctx->lock);
-        }
-        perf_remove_from_context(child_event);
        /*
         * It can happen that the parent exits first, and has events
@@ -7724,6 +7730,8 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
         * swapped under us.
         */
        parent_ctx = perf_pin_task_context(parent, ctxn);
+        if (!parent_ctx)
+                return 0;
        /*
         * No need to check if parent_ctx != NULL here; since we saw
@@ -7835,6 +7843,7 @@ static void perf_event_init_cpu(int cpu)
        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
        mutex_lock(&swhash->hlist_mutex);
+        swhash->online = true;
        if (swhash->hlist_refcount > 0) {
                struct swevent_hlist *hlist;
@@ -7857,14 +7866,14 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)
 static void __perf_event_exit_context(void *__info)
 {
+        struct remove_event re = { .detach_group = false };
        struct perf_event_context *ctx = __info;
-        struct perf_event *event;
        perf_pmu_rotate_stop(ctx->pmu);
        rcu_read_lock();
-        list_for_each_entry_rcu(event, &ctx->event_list, event_entry)
+        list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
-                __perf_remove_from_context(event);
+                __perf_remove_from_context(&re);
        rcu_read_unlock();
 }
@@ -7892,6 +7901,7 @@ static void perf_event_exit_cpu(int cpu)
        perf_event_exit_cpu_context(cpu);
        mutex_lock(&swhash->hlist_mutex);
+        swhash->online = false;
        swevent_hlist_release(swhash);
        mutex_unlock(&swhash->hlist_mutex);
 }
diff --git a/kernel/futex.c b/kernel/futex.c
index 5f589279e462..81dbe773ce4c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -745,7 +745,8 @@ void exit_pi_state_list(struct task_struct *curr)
 static int
 lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
-                union futex_key *key, struct futex_pi_state **ps)
+                union futex_key *key, struct futex_pi_state **ps,
+                struct task_struct *task)
 {
        struct futex_pi_state *pi_state = NULL;
        struct futex_q *this, *next;
@@ -786,6 +787,16 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
                                        return -EINVAL;
                        }
+                        /*
+                         * Protect against a corrupted uval. If uval
+                         * is 0x80000000 then pid is 0 and the waiter
+                         * bit is set. So the deadlock check in the
+                         * calling code has failed and we did not fall
+                         * into the check above due to !pid.
+                         */
+                        if (task && pi_state->owner == task)
+                                return -EDEADLK;
                        atomic_inc(&pi_state->refcount);
                        *ps = pi_state;
@@ -803,6 +814,11 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
        if (!p)
                return -ESRCH;
+        if (!p->mm) {
+                put_task_struct(p);
+                return -EPERM;
+        }
        /*
         * We need to look at the task state flags to figure out,
         * whether the task is exiting. To protect against the do_exit
@@ -935,7 +951,7 @@ retry:
         * We dont have the lock. Look up the PI state (or create it if
         * we are the first waiter):
         */
-        ret = lookup_pi_state(uval, hb, key, ps);
+        ret = lookup_pi_state(uval, hb, key, ps, task);
        if (unlikely(ret)) {
                switch (ret) {
@@ -1347,7 +1363,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
 *
 * Return:
 *  0 - failed to acquire the lock atomically;
- *  1 - acquired the lock;
+ * >0 - acquired the lock, return value is vpid of the top_waiter
 * <0 - error
 */
 static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1358,7 +1374,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 {
        struct futex_q *top_waiter = NULL;
        u32 curval;
-        int ret;
+        int ret, vpid;
        if (get_futex_value_locked(&curval, pifutex))
                return -EFAULT;
@@ -1386,11 +1402,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
         * the contended case or if set_waiters is 1.  The pi_state is returned
         * in ps in contended cases.
         */
+        vpid = task_pid_vnr(top_waiter->task);
        ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
                                   set_waiters);
-        if (ret == 1)
+        if (ret == 1) {
                requeue_pi_wake_futex(top_waiter, key2, hb2);
+                return vpid;
+        }
        return ret;
 }
@@ -1421,7 +1439,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
        struct futex_pi_state *pi_state = NULL;
        struct futex_hash_bucket *hb1, *hb2;
        struct futex_q *this, *next;
-        u32 curval2;
        if (requeue_pi) {
                /*
@@ -1509,16 +1526,25 @@ retry_private:
                 * At this point the top_waiter has either taken uaddr2 or is
                 * waiting on it.  If the former, then the pi_state will not
                 * exist yet, look it up one more time to ensure we have a
-                 * reference to it.
+                 * reference to it. If the lock was taken, ret contains the
+                 * vpid of the top waiter task.
                 */
-                if (ret == 1) {
+                if (ret > 0) {
                        WARN_ON(pi_state);
                        drop_count++;
                        task_count++;
-                        ret = get_futex_value_locked(&curval2, uaddr2);
+                        /*
-                        if (!ret)
+                         * If we acquired the lock, then the user
-                                ret = lookup_pi_state(curval2, hb2, &key2,
+                         * space value of uaddr2 should be vpid. It
-                                                      &pi_state);
+                         * cannot be changed by the top waiter as it
+                         * is blocked on hb2 lock if it tries to do
+                         * so. If something fiddled with it behind our
+                         * back the pi state lookup might unearth
+                         * it. So we rather use the known value than
+                         * rereading and handing potential crap to
+                         * lookup_pi_state.
+                         */
+                        ret = lookup_pi_state(ret, hb2, &key2, &pi_state, NULL);
                }
                switch (ret) {
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index d55092ceee29..e0501fe7140d 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -234,6 +234,11 @@ again:
                        goto again;
                }
                timer->base = new_base;
+        } else {
+                if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
+                        cpu = this_cpu;
+                        goto again;
+                }
        }
        return new_base;
 }
@@ -569,6 +574,23 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
        cpu_base->expires_next.tv64 = expires_next.tv64;
+        /*
+         * If a hang was detected in the last timer interrupt then we
+         * leave the hang delay active in the hardware. We want the
+         * system to make progress. That also prevents the following
+         * scenario:
+         * T1 expires 50ms from now
+         * T2 expires 5s from now
+         *
+         * T1 is removed, so this code is called and would reprogram
+         * the hardware to 5s from now. Any hrtimer_start after that
+         * will not reprogram the hardware due to hang_detected being
+         * set. So we'd effectively block all timers until the T2 event
+         * fires.
+         */
+        if (cpu_base->hang_detected)
+                return;
        if (cpu_base->expires_next.tv64 != KTIME_MAX)
                tick_program_event(cpu_base->expires_next, 1);
 }
@@ -968,11 +990,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
        /* Remove an active timer from the queue: */
        ret = remove_hrtimer(timer, base);
-        /* Switch the timer base, if necessary: */
-        new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
        if (mode & HRTIMER_MODE_REL) {
-                tim = ktime_add_safe(tim, new_base->get_time());
+                tim = ktime_add_safe(tim, base->get_time());
                /*
                 * CONFIG_TIME_LOW_RES is a temporary way for architectures
                 * to signal that they simply return xtime in
@@ -987,6 +1006,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
        hrtimer_set_expires_range_ns(timer, tim, delta_ns);
+        /* Switch the timer base, if necessary: */
+        new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
        timer_stats_hrtimer_set_start_info(timer);
        leftmost = enqueue_hrtimer(timer, new_base);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index a7174617616b..bb07f2928f4b 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -363,6 +363,13 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
                if (from > irq)
                        return -EINVAL;
                from = irq;
+        } else {
+                /*
+                 * For interrupts which are freely allocated the
+                 * architecture can force a lower bound to the @from
+                 * argument. x86 uses this to exclude the GSI space.
+                 */
+                from = arch_dynirq_lower_bound(from);
        }
        mutex_lock(&sparse_irq_lock);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 2486a4c1a710..d34131ca372b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -180,7 +180,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
        struct irq_chip *chip = irq_data_get_irq_chip(data);
        int ret;
-        ret = chip->irq_set_affinity(data, mask, false);
+        ret = chip->irq_set_affinity(data, mask, force);
        switch (ret) {
        case IRQ_SET_MASK_OK:
                cpumask_copy(data->affinity, mask);
@@ -192,7 +192,8 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
        return ret;
 }
-int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
+int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
+                            bool force)
 {
        struct irq_chip *chip = irq_data_get_irq_chip(data);
        struct irq_desc *desc = irq_data_to_desc(data);
@@ -202,7 +203,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
                return -EINVAL;
        if (irq_can_move_pcntxt(data)) {
-                ret = irq_do_set_affinity(data, mask, false);
+                ret = irq_do_set_affinity(data, mask, force);
        } else {
                irqd_set_move_pending(data);
                irq_copy_pending(desc, mask);
@@ -217,13 +218,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
        return ret;
 }
-/**
+int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force)
- *      irq_set_affinity - Set the irq affinity of a given irq
- *      @irq:           Interrupt to set affinity
- *      @mask:          cpumask
- *
- */
-int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
        struct irq_desc *desc = irq_to_desc(irq);
        unsigned long flags;
@@ -233,7 +228,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
                return -EINVAL;
        raw_spin_lock_irqsave(&desc->lock, flags);
-        ret =  __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask);
+        ret = irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force);
        raw_spin_unlock_irqrestore(&desc->lock, flags);
        return ret;
 }
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c8380ad203bc..28c57069ef68 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1683,6 +1683,14 @@ int kernel_kexec(void)
                kexec_in_progress = true;
                kernel_restart_prepare(NULL);
                migrate_to_reboot_cpu();
+                /*
+                 * migrate_to_reboot_cpu() disables CPU hotplug assuming that
+                 * no further code needs to use CPU hotplug (which is true in
+                 * the reboot case). However, the kexec path depends on using
+                 * CPU hotplug again; so re-enable it here.
+                 */
+                cpu_hotplug_enable();
                printk(KERN_EMERG "Starting new kernel\n");
                machine_shutdown();
        }
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index b0e9467922e1..d24e4339b46d 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -4188,7 +4188,7 @@ void debug_show_held_locks(struct task_struct *task)
 }
 EXPORT_SYMBOL_GPL(debug_show_held_locks);
-asmlinkage void lockdep_sys_exit(void)
+asmlinkage __visible void lockdep_sys_exit(void)
 {
        struct task_struct *curr = current;
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index e1191c996c59..5cf6731b98e9 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -71,18 +71,17 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
 void debug_mutex_unlock(struct mutex *lock)
 {
-        if (unlikely(!debug_locks))
+        if (likely(debug_locks)) {
-                return;
+                DEBUG_LOCKS_WARN_ON(lock->magic != lock);
-        DEBUG_LOCKS_WARN_ON(lock->magic != lock);
+                if (!lock->owner)
+                        DEBUG_LOCKS_WARN_ON(!lock->owner);
+                else
+                        DEBUG_LOCKS_WARN_ON(lock->owner != current);
-        if (!lock->owner)
+                DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
-                DEBUG_LOCKS_WARN_ON(!lock->owner);
+                mutex_clear_owner(lock);
-        else
+        }
-                DEBUG_LOCKS_WARN_ON(lock->owner != current);
-        DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
-        mutex_clear_owner(lock);
        /*
         * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index aa4dff04b594..a620d4d08ca6 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -343,9 +343,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
         * top_waiter can be NULL, when we are in the deboosting
         * mode!
         */
-        if (top_waiter && (!task_has_pi_waiters(task) ||
+        if (top_waiter) {
-                           top_waiter != task_top_pi_waiter(task)))
+                if (!task_has_pi_waiters(task))
-                goto out_unlock_pi;
+                        goto out_unlock_pi;
+                /*
+                 * If deadlock detection is off, we stop here if we
+                 * are not the top pi waiter of the task.
+                 */
+                if (!detect_deadlock && top_waiter != task_top_pi_waiter(task))
+                        goto out_unlock_pi;
+        }
        /*
         * When deadlock detection is off then we check, if further
@@ -361,7 +368,12 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
                goto retry;
        }
-        /* Deadlock detection */
+        /*
+         * Deadlock detection. If the lock is the same as the original
+         * lock which caused us to walk the lock chain or if the
+         * current lock is owned by the task which initiated the chain
+         * walk, we detected a deadlock.
+         */
        if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
                debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
                raw_spin_unlock(&lock->wait_lock);
@@ -527,6 +539,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
        unsigned long flags;
        int chain_walk = 0, res;
+        /*
+         * Early deadlock detection. We really don't want the task to
+         * enqueue on itself just to untangle the mess later. It's not
+         * only an optimization. We drop the locks, so another waiter
+         * can come in before the chain walk detects the deadlock. So
+         * the other will detect the deadlock and return -EDEADLK,
+         * which is wrong, as the other waiter is not in a deadlock
+         * situation.
+         */
+        if (detect_deadlock && owner == task)
+                return -EDEADLK;
        raw_spin_lock_irqsave(&task->pi_lock, flags);
        __rt_mutex_adjust_prio(task);
        waiter->task = task;
diff --git a/kernel/module.c b/kernel/module.c
index 11869408f79b..079c4615607d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -815,9 +815,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
                return -EFAULT;
        name[MODULE_NAME_LEN-1] = '\0';
-        if (!(flags & O_NONBLOCK))
-                pr_warn("waiting module removal not supported: please upgrade\n");
        if (mutex_lock_interruptible(&module_mutex) != 0)
                return -EINTR;
@@ -3271,6 +3268,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
        dynamic_debug_setup(info->debug, info->num_debug);
+        /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
+        ftrace_module_init(mod);
        /* Finally it's fully formed, ready to start executing. */
        err = complete_formation(mod, info);
        if (err)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 18fb7a2fb14b..1ea328aafdc9 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1586,7 +1586,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
        return -ENOMEM;
 }
-asmlinkage int swsusp_save(void)
+asmlinkage __visible int swsusp_save(void)
 {
        unsigned int nr_pages, nr_highmem;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c3ad9cafe930..8233cd4047d7 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -14,6 +14,7 @@
 #include <linux/init.h>
 #include <linux/console.h>
 #include <linux/cpu.h>
+#include <linux/cpuidle.h>
 #include <linux/syscalls.h>
 #include <linux/gfp.h>
 #include <linux/io.h>
@@ -53,7 +54,9 @@ static void freeze_begin(void)
 static void freeze_enter(void)
 {
+        cpuidle_resume();
        wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
+        cpuidle_pause();
 }
 void freeze_wake(void)
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index a45b50962295..7228258b85ec 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1674,7 +1674,7 @@ EXPORT_SYMBOL(printk_emit);
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
-asmlinkage int printk(const char *fmt, ...)
+asmlinkage __visible int printk(const char *fmt, ...)
 {
        va_list args;
        int r;
@@ -1737,7 +1737,7 @@ void early_vprintk(const char *fmt, va_list ap)
        }
 }
-asmlinkage void early_printk(const char *fmt, ...)
+asmlinkage __visible void early_printk(const char *fmt, ...)
 {
        va_list ap;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 268a45ea238c..0a7251678982 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2192,7 +2192,7 @@ static inline void post_schedule(struct rq *rq)
 * schedule_tail - first thing a freshly forked thread must call.
 * @prev: the thread we just switched away from.
 */
-asmlinkage void schedule_tail(struct task_struct *prev)
+asmlinkage __visible void schedule_tail(struct task_struct *prev)
        __releases(rq->lock)
 {
        struct rq *rq = this_rq();
@@ -2592,8 +2592,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
        if (likely(prev->sched_class == class &&
                   rq->nr_running == rq->cfs.h_nr_running)) {
                p = fair_sched_class.pick_next_task(rq, prev);
-                if (likely(p && p != RETRY_TASK))
+                if (unlikely(p == RETRY_TASK))
-                        return p;
+                        goto again;
+                /* assumes fair_sched_class->next == idle_sched_class */
+                if (unlikely(!p))
+                        p = idle_sched_class.pick_next_task(rq, prev);
+                return p;
        }
 again:
@@ -2741,7 +2747,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
                blk_schedule_flush_plug(tsk);
 }
-asmlinkage void __sched schedule(void)
+asmlinkage __visible void __sched schedule(void)
 {
        struct task_struct *tsk = current;
@@ -2751,7 +2757,7 @@ asmlinkage void __sched schedule(void)
 EXPORT_SYMBOL(schedule);
 #ifdef CONFIG_CONTEXT_TRACKING
-asmlinkage void __sched schedule_user(void)
+asmlinkage __visible void __sched schedule_user(void)
 {
        /*
         * If we come here after a random call to set_need_resched(),
@@ -2783,7 +2789,7 @@ void __sched schedule_preempt_disabled(void)
 * off of preempt_enable. Kernel preemptions off return from interrupt
 * occur there and call schedule directly.
 */
-asmlinkage void __sched notrace preempt_schedule(void)
+asmlinkage __visible void __sched notrace preempt_schedule(void)
 {
        /*
         * If there is a non-zero preempt_count or interrupts are disabled,
@@ -2813,7 +2819,7 @@ EXPORT_SYMBOL(preempt_schedule);
 * Note, that this is called and return with irqs disabled. This will
 * protect us against recursive calling from irq.
 */
-asmlinkage void __sched preempt_schedule_irq(void)
+asmlinkage __visible void __sched preempt_schedule_irq(void)
 {
        enum ctx_state prev_state;
@@ -3124,6 +3130,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
        dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
        dl_se->dl_throttled = 0;
        dl_se->dl_new = 1;
+        dl_se->dl_yielded = 0;
 }
 static void __setscheduler_params(struct task_struct *p,
@@ -3188,17 +3195,40 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr)
 * We ask for the deadline not being zero, and greater or equal
 * than the runtime, as well as the period of being zero or
 * greater than deadline. Furthermore, we have to be sure that
- * user parameters are above the internal resolution (1us); we
+ * user parameters are above the internal resolution of 1us (we
- * check sched_runtime only since it is always the smaller one.
+ * check sched_runtime only since it is always the smaller one) and
+ * below 2^63 ns (we have to check both sched_deadline and
+ * sched_period, as the latter can be zero).
 */
 static bool
 __checkparam_dl(const struct sched_attr *attr)
 {
-        return attr && attr->sched_deadline != 0 &&
+        /* deadline != 0 */
-                (attr->sched_period == 0 ||
+        if (attr->sched_deadline == 0)
-                (s64)(attr->sched_period   - attr->sched_deadline) >= 0) &&
+                return false;
-                (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0  &&
-                attr->sched_runtime >= (2 << (DL_SCALE - 1));
+        /*
+         * Since we truncate DL_SCALE bits, make sure we're at least
+         * that big.
+         */
+        if (attr->sched_runtime < (1ULL << DL_SCALE))
+                return false;
+        /*
+         * Since we use the MSB for wrap-around and sign issues, make
+         * sure it's not set (mind that period can be equal to zero).
+         */
+        if (attr->sched_deadline & (1ULL << 63) ||
+            attr->sched_period & (1ULL << 63))
+                return false;
+        /* runtime <= deadline <= period (if period != 0) */
+        if ((attr->sched_period != 0 &&
+             attr->sched_period < attr->sched_deadline) ||
+            attr->sched_deadline < attr->sched_runtime)
+                return false;
+        return true;
 }
 /*
@@ -3639,6 +3669,7 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
 * sys_sched_setattr - same as above, but with extended sched_attr
 * @pid: the pid in question.
 * @uattr: structure containing the extended parameters.
+ * @flags: for future extension.
 */
 SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
                               unsigned int, flags)
@@ -3650,8 +3681,12 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
        if (!uattr || pid < 0 || flags)
                return -EINVAL;
-        if (sched_copy_attr(uattr, &attr))
+        retval = sched_copy_attr(uattr, &attr);
-                return -EFAULT;
+        if (retval)
+                return retval;
+        if (attr.sched_policy < 0)
+                return -EINVAL;
        rcu_read_lock();
        retval = -ESRCH;
@@ -3701,7 +3736,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
 */
 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
 {
-        struct sched_param lp;
+        struct sched_param lp = { .sched_priority = 0 };
        struct task_struct *p;
        int retval;
@@ -3718,11 +3753,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
        if (retval)
                goto out_unlock;
-        if (task_has_dl_policy(p)) {
+        if (task_has_rt_policy(p))
-                retval = -EINVAL;
+                lp.sched_priority = p->rt_priority;
-                goto out_unlock;
-        }
-        lp.sched_priority = p->rt_priority;
        rcu_read_unlock();
        /*
@@ -3783,6 +3815,7 @@ err_size:
 * @pid: the pid in question.
 * @uattr: structure containing the extended parameters.
 * @size: sizeof(attr) for fwd/bwd comp.
+ * @flags: for future extension.
 */
 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
                unsigned int, size, unsigned int, flags)
@@ -5043,7 +5076,6 @@ static int sched_cpu_active(struct notifier_block *nfb,
                                      unsigned long action, void *hcpu)
 {
        switch (action & ~CPU_TASKS_FROZEN) {
-        case CPU_STARTING:
        case CPU_DOWN_FAILED:
                set_cpu_active((long)hcpu, true);
                return NOTIFY_OK;
@@ -6017,6 +6049,8 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
                                        ,
                .last_balance           = jiffies,
                .balance_interval       = sd_weight,
+                .max_newidle_lb_cost    = 0,
+                .next_decay_max_lb_cost = jiffies,
        };
        SD_INIT_NAME(sd, NUMA);
        sd->private = &tl->data;
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5b9bb42b2d47..bd95963dae80 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -13,6 +13,7 @@
 #include <linux/gfp.h>
 #include <linux/kernel.h>
+#include <linux/slab.h>
 #include "cpudeadline.h"
 static inline int parent(int i)
@@ -39,8 +40,10 @@ static void cpudl_exchange(struct cpudl *cp, int a, int b)
 {
        int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
-        swap(cp->elements[a], cp->elements[b]);
+        swap(cp->elements[a].cpu, cp->elements[b].cpu);
-        swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]);
+        swap(cp->elements[a].dl , cp->elements[b].dl );
+        swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx);
 }
 static void cpudl_heapify(struct cpudl *cp, int idx)
@@ -140,7 +143,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
        WARN_ON(!cpu_present(cpu));
        raw_spin_lock_irqsave(&cp->lock, flags);
-        old_idx = cp->cpu_to_idx[cpu];
+        old_idx = cp->elements[cpu].idx;
        if (!is_valid) {
                /* remove item */
                if (old_idx == IDX_INVALID) {
@@ -155,8 +158,8 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
                cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
                cp->elements[old_idx].cpu = new_cpu;
                cp->size--;
-                cp->cpu_to_idx[new_cpu] = old_idx;
+                cp->elements[new_cpu].idx = old_idx;
-                cp->cpu_to_idx[cpu] = IDX_INVALID;
+                cp->elements[cpu].idx = IDX_INVALID;
                while (old_idx > 0 && dl_time_before(
                                cp->elements[parent(old_idx)].dl,
                                cp->elements[old_idx].dl)) {
@@ -173,7 +176,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
                cp->size++;
                cp->elements[cp->size - 1].dl = 0;
                cp->elements[cp->size - 1].cpu = cpu;
-                cp->cpu_to_idx[cpu] = cp->size - 1;
+                cp->elements[cpu].idx = cp->size - 1;
                cpudl_change_key(cp, cp->size - 1, dl);
                cpumask_clear_cpu(cpu, cp->free_cpus);
        } else {
@@ -195,10 +198,21 @@ int cpudl_init(struct cpudl *cp)
        memset(cp, 0, sizeof(*cp));
        raw_spin_lock_init(&cp->lock);
        cp->size = 0;
-        for (i = 0; i < NR_CPUS; i++)
-                cp->cpu_to_idx[i] = IDX_INVALID;
+        cp->elements = kcalloc(nr_cpu_ids,
-        if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL))
+                               sizeof(struct cpudl_item),
+                               GFP_KERNEL);
+        if (!cp->elements)
+                return -ENOMEM;
+        if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
+                kfree(cp->elements);
                return -ENOMEM;
+        }
+        for_each_possible_cpu(i)
+                cp->elements[i].idx = IDX_INVALID;
        cpumask_setall(cp->free_cpus);
        return 0;
@@ -210,7 +224,6 @@ int cpudl_init(struct cpudl *cp)
 */
 void cpudl_cleanup(struct cpudl *cp)
 {
-        /*
+        free_cpumask_var(cp->free_cpus);
-         * nothing to do for the moment
+        kfree(cp->elements);
-         */
 }
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index a202789a412c..538c9796ad4a 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -5,17 +5,17 @@
 #define IDX_INVALID     -1
-struct array_item {
+struct cpudl_item {
        u64 dl;
        int cpu;
+        int idx;
 };
 struct cpudl {
        raw_spinlock_t lock;
        int size;
-        int cpu_to_idx[NR_CPUS];
-        struct array_item elements[NR_CPUS];
        cpumask_var_t free_cpus;
+        struct cpudl_item *elements;
 };
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 8b836b376d91..8834243abee2 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -30,6 +30,7 @@
 #include <linux/gfp.h>
 #include <linux/sched.h>
 #include <linux/sched/rt.h>
+#include <linux/slab.h>
 #include "cpupri.h"
 /* Convert between a 140 based task->prio, and our 102 based cpupri */
@@ -70,8 +71,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
        int idx = 0;
        int task_pri = convert_prio(p->prio);
-        if (task_pri >= MAX_RT_PRIO)
+        BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
-                return 0;
        for (idx = 0; idx < task_pri; idx++) {
                struct cpupri_vec *vec  = &cp->pri_to_cpu[idx];
@@ -219,8 +219,13 @@ int cpupri_init(struct cpupri *cp)
                        goto cleanup;
        }
+        cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
+        if (!cp->cpu_to_pri)
+                goto cleanup;
        for_each_possible_cpu(i)
                cp->cpu_to_pri[i] = CPUPRI_INVALID;
        return 0;
 cleanup:
@@ -237,6 +242,7 @@ void cpupri_cleanup(struct cpupri *cp)
 {
        int i;
+        kfree(cp->cpu_to_pri);
        for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
                free_cpumask_var(cp->pri_to_cpu[i].mask);
 }
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index f6d756173491..6b033347fdfd 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -17,7 +17,7 @@ struct cpupri_vec {
 struct cpupri {
        struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
-        int               cpu_to_pri[NR_CPUS];
+        int *cpu_to_pri;
 };
 #ifdef CONFIG_SMP
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a95097cb4591..72fdf06ef865 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -332,50 +332,50 @@ out:
 * softirq as those do not count in task exec_runtime any more.
 */
 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
-                                                struct rq *rq)
+                                         struct rq *rq, int ticks)
 {
-        cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+        cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
+        u64 cputime = (__force u64) cputime_one_jiffy;
        u64 *cpustat = kcpustat_this_cpu->cpustat;
        if (steal_account_process_tick())
                return;
+        cputime *= ticks;
+        scaled *= ticks;
        if (irqtime_account_hi_update()) {
-                cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
+                cpustat[CPUTIME_IRQ] += cputime;
        } else if (irqtime_account_si_update()) {
-                cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
+                cpustat[CPUTIME_SOFTIRQ] += cputime;
        } else if (this_cpu_ksoftirqd() == p) {
                /*
                 * ksoftirqd time do not get accounted in cpu_softirq_time.
                 * So, we have to handle it separately here.
                 * Also, p->stime needs to be updated for ksoftirqd.
                 */
-                __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+                __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
-                                        CPUTIME_SOFTIRQ);
        } else if (user_tick) {
-                account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+                account_user_time(p, cputime, scaled);
        } else if (p == rq->idle) {
-                account_idle_time(cputime_one_jiffy);
+                account_idle_time(cputime);
        } else if (p->flags & PF_VCPU) { /* System time or guest time */
-                account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+                account_guest_time(p, cputime, scaled);
        } else {
-                __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+                __account_system_time(p, cputime, scaled,       CPUTIME_SYSTEM);
-                                        CPUTIME_SYSTEM);
        }
 }
 static void irqtime_account_idle_ticks(int ticks)
 {
-        int i;
        struct rq *rq = this_rq();
-        for (i = 0; i < ticks; i++)
+        irqtime_account_process_tick(current, 0, rq, ticks);
-                irqtime_account_process_tick(current, 0, rq);
 }
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 static inline void irqtime_account_idle_ticks(int ticks) {}
 static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
-                                                struct rq *rq) {}
+                                                struct rq *rq, int nr_ticks) {}
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 /*
@@ -464,7 +464,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
                return;
        if (sched_clock_irqtime) {
-                irqtime_account_process_tick(p, user_tick, rq);
+                irqtime_account_process_tick(p, user_tick, rq, 1);
                return;
        }
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 27ef40925525..800e99b99075 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -528,6 +528,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
        sched_clock_tick();
        update_rq_clock(rq);
        dl_se->dl_throttled = 0;
+        dl_se->dl_yielded = 0;
        if (p->on_rq) {
                enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
                if (task_has_dl_policy(rq->curr))
@@ -893,10 +894,10 @@ static void yield_task_dl(struct rq *rq)
         * We make the task go to sleep until its current deadline by
         * forcing its runtime to zero. This way, update_curr_dl() stops
         * it and the bandwidth timer will wake it up and will give it
-         * new scheduling parameters (thanks to dl_new=1).
+         * new scheduling parameters (thanks to dl_yielded=1).
         */
        if (p->dl.runtime > 0) {
-                rq->curr->dl.dl_new = 1;
+                rq->curr->dl.dl_yielded = 1;
                p->dl.runtime = 0;
        }
        update_curr_dl(rq);
@@ -1021,8 +1022,17 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
        dl_rq = &rq->dl;
-        if (need_pull_dl_task(rq, prev))
+        if (need_pull_dl_task(rq, prev)) {
                pull_dl_task(rq);
+                /*
+                 * pull_dl_task() can drop (and re-acquire) rq->lock; this
+                 * means a stop task can slip in, in which case we need to
+                 * re-start task selection.
+                 */
+                if (rq->stop && rq->stop->on_rq)
+                        return RETRY_TASK;
+        }
        /*
         * When prev is DL, we may throttle it in put_prev_task().
         * So, we update time before we check for dl_nr_running.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7e9bd0b1fa9e..0fdb96de81a5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1497,7 +1497,7 @@ static void task_numa_placement(struct task_struct *p)
        /* If the task is part of a group prevent parallel updates to group stats */
        if (p->numa_group) {
                group_lock = &p->numa_group->lock;
-                spin_lock(group_lock);
+                spin_lock_irq(group_lock);
        }
        /* Find the node with the highest number of faults */
@@ -1572,7 +1572,7 @@ static void task_numa_placement(struct task_struct *p)
                        }
                }
-                spin_unlock(group_lock);
+                spin_unlock_irq(group_lock);
        }
        /* Preferred node as the node with the most faults */
@@ -1677,7 +1677,8 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
        if (!join)
                return;
-        double_lock(&my_grp->lock, &grp->lock);
+        BUG_ON(irqs_disabled());
+        double_lock_irq(&my_grp->lock, &grp->lock);
        for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
                my_grp->faults[i] -= p->numa_faults_memory[i];
@@ -1691,7 +1692,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
        grp->nr_tasks++;
        spin_unlock(&my_grp->lock);
-        spin_unlock(&grp->lock);
+        spin_unlock_irq(&grp->lock);
        rcu_assign_pointer(p->numa_group, grp);
@@ -1710,14 +1711,14 @@ void task_numa_free(struct task_struct *p)
        void *numa_faults = p->numa_faults_memory;
        if (grp) {
-                spin_lock(&grp->lock);
+                spin_lock_irq(&grp->lock);
                for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
                        grp->faults[i] -= p->numa_faults_memory[i];
                grp->total_faults -= p->total_numa_faults;
                list_del(&p->numa_entry);
                grp->nr_tasks--;
-                spin_unlock(&grp->lock);
+                spin_unlock_irq(&grp->lock);
                rcu_assign_pointer(p->numa_group, NULL);
                put_numa_group(grp);
        }
@@ -6652,6 +6653,7 @@ static int idle_balance(struct rq *this_rq)
        int this_cpu = this_rq->cpu;
        idle_enter_fair(this_rq);
        /*
         * We must set idle_stamp _before_ calling idle_balance(), such that we
         * measure the duration of idle_balance() as idle time.
@@ -6704,14 +6706,16 @@ static int idle_balance(struct rq *this_rq)
        raw_spin_lock(&this_rq->lock);
+        if (curr_cost > this_rq->max_idle_balance_cost)
+                this_rq->max_idle_balance_cost = curr_cost;
        /*
-         * While browsing the domains, we released the rq lock.
+         * While browsing the domains, we released the rq lock, a task could
-         * A task could have be enqueued in the meantime
+         * have been enqueued in the meantime. Since we're not going idle,
+         * pretend we pulled a task.
         */
-        if (this_rq->cfs.h_nr_running && !pulled_task) {
+        if (this_rq->cfs.h_nr_running && !pulled_task)
                pulled_task = 1;
-                goto out;
-        }
        if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
                /*
@@ -6721,13 +6725,11 @@ static int idle_balance(struct rq *this_rq)
                this_rq->next_balance = next_balance;
        }
-        if (curr_cost > this_rq->max_idle_balance_cost)
-                this_rq->max_idle_balance_cost = curr_cost;
 out:
        /* Is there a task of a high priority class? */
        if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
-            (this_rq->dl.dl_nr_running ||
+            ((this_rq->stop && this_rq->stop->on_rq) ||
+             this_rq->dl.dl_nr_running ||
             (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
                pulled_task = -1;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d8cdf1618551..bd2267ad404f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1362,10 +1362,11 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
                pull_rt_task(rq);
                /*
                 * pull_rt_task() can drop (and re-acquire) rq->lock; this
-                 * means a dl task can slip in, in which case we need to
+                 * means a dl or stop task can slip in, in which case we need
-                 * re-start task selection.
+                 * to re-start task selection.
                 */
-                if (unlikely(rq->dl.dl_nr_running))
+                if (unlikely((rq->stop && rq->stop->on_rq) ||
+                             rq->dl.dl_nr_running))
                        return RETRY_TASK;
        }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c9007f28d3a2..456e492a3dca 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1385,6 +1385,15 @@ static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
        spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
 }
+static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2)
+{
+        if (l1 > l2)
+                swap(l1, l2);
+        spin_lock_irq(l1);
+        spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+}
 static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
 {
        if (l1 > l2)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index d8d046c0726a..b35c21503a36 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -69,18 +69,17 @@ static void populate_seccomp_data(struct seccomp_data *sd)
 {
        struct task_struct *task = current;
        struct pt_regs *regs = task_pt_regs(task);
+        unsigned long args[6];
        sd->nr = syscall_get_nr(task, regs);
        sd->arch = syscall_get_arch();
+        syscall_get_arguments(task, regs, 0, 6, args);
-        /* Unroll syscall_get_args to help gcc on arm. */
+        sd->args[0] = args[0];
-        syscall_get_arguments(task, regs, 0, 1, (unsigned long *) &sd->args[0]);
+        sd->args[1] = args[1];
-        syscall_get_arguments(task, regs, 1, 1, (unsigned long *) &sd->args[1]);
+        sd->args[2] = args[2];
-        syscall_get_arguments(task, regs, 2, 1, (unsigned long *) &sd->args[2]);
+        sd->args[3] = args[3];
-        syscall_get_arguments(task, regs, 3, 1, (unsigned long *) &sd->args[3]);
+        sd->args[4] = args[4];
-        syscall_get_arguments(task, regs, 4, 1, (unsigned long *) &sd->args[4]);
+        sd->args[5] = args[5];
-        syscall_get_arguments(task, regs, 5, 1, (unsigned long *) &sd->args[5]);
        sd->instruction_pointer = KSTK_EIP(task);
 }
@@ -256,6 +255,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
                goto free_prog;
        /* Allocate a new seccomp_filter */
+        ret = -ENOMEM;
        filter = kzalloc(sizeof(struct seccomp_filter) +
                         sizeof(struct sock_filter_int) * new_len,
                         GFP_KERNEL|__GFP_NOWARN);
@@ -265,6 +265,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
        ret = sk_convert_filter(fp, fprog->len, filter->insnsi, &new_len);
        if (ret)
                goto free_filter;
+        kfree(fp);
        atomic_set(&filter->usage, 1);
        filter->len = new_len;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b50990a5bea0..92f24f5e8d52 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -223,7 +223,7 @@ static inline bool lockdep_softirq_start(void) { return false; }
 static inline void lockdep_softirq_end(bool in_hardirq) { }
 #endif
-asmlinkage void __do_softirq(void)
+asmlinkage __visible void __do_softirq(void)
 {
        unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
        unsigned long old_flags = current->flags;
@@ -299,7 +299,7 @@ restart:
        tsk_restore_flags(current, old_flags, PF_MEMALLOC);
 }
-asmlinkage void do_softirq(void)
+asmlinkage __visible void do_softirq(void)
 {
        __u32 pending;
        unsigned long flags;
@@ -779,3 +779,8 @@ int __init __weak arch_early_irq_init(void)
 {
        return 0;
 }
+unsigned int __weak arch_dynirq_lower_bound(unsigned int from)
+{
+        return from;
+}
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 015661279b68..0a0608edeb26 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -276,7 +276,7 @@ static bool tick_check_preferred(struct clock_event_device *curdev,
 bool tick_check_replacement(struct clock_event_device *curdev,
                            struct clock_event_device *newdev)
 {
-        if (tick_check_percpu(curdev, newdev, smp_processor_id()))
+        if (!tick_check_percpu(curdev, newdev, smp_processor_id()))
                return false;
        return tick_check_preferred(curdev, newdev);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 9f8af69c67ec..6558b7ac112d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -84,6 +84,9 @@ static void tick_do_update_jiffies64(ktime_t now)
                /* Keep the tick_next_period variable up to date */
                tick_next_period = ktime_add(last_jiffies_update, tick_period);
+        } else {
+                write_sequnlock(&jiffies_lock);
+                return;
        }
        write_sequnlock(&jiffies_lock);
        update_wall_time();
@@ -967,7 +970,7 @@ static void tick_nohz_switch_to_nohz(void)
        struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
        ktime_t next;
-        if (!tick_nohz_active)
+        if (!tick_nohz_enabled)
                return;
        local_irq_disable();
diff --git a/kernel/timer.c b/kernel/timer.c
index 87bd529879c2..3bb01a323b2a 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -838,7 +838,7 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
        bit = find_last_bit(&mask, BITS_PER_LONG);
-        mask = (1 << bit) - 1;
+        mask = (1UL << bit) - 1;
        expires_limit = expires_limit & ~(mask);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1fd4b9479210..4a54a25afa2f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -4330,16 +4330,11 @@ static void ftrace_init_module(struct module *mod,
        ftrace_process_locs(mod, start, end);
 }
-static int ftrace_module_notify_enter(struct notifier_block *self,
+void ftrace_module_init(struct module *mod)
-                                      unsigned long val, void *data)
 {
-        struct module *mod = data;
+        ftrace_init_module(mod, mod->ftrace_callsites,
+                           mod->ftrace_callsites +
-        if (val == MODULE_STATE_COMING)
+                           mod->num_ftrace_callsites);
-                ftrace_init_module(mod, mod->ftrace_callsites,
-                                   mod->ftrace_callsites +
-                                   mod->num_ftrace_callsites);
-        return 0;
 }
 static int ftrace_module_notify_exit(struct notifier_block *self,
@@ -4353,11 +4348,6 @@ static int ftrace_module_notify_exit(struct notifier_block *self,
        return 0;
 }
 #else
-static int ftrace_module_notify_enter(struct notifier_block *self,
-                                      unsigned long val, void *data)
-{
-        return 0;
-}
 static int ftrace_module_notify_exit(struct notifier_block *self,
                                     unsigned long val, void *data)
 {
@@ -4365,11 +4355,6 @@ static int ftrace_module_notify_exit(struct notifier_block *self,
 }
 #endif /* CONFIG_MODULES */
-struct notifier_block ftrace_module_enter_nb = {
-        .notifier_call = ftrace_module_notify_enter,
-        .priority = INT_MAX,    /* Run before anything that can use kprobes */
-};
 struct notifier_block ftrace_module_exit_nb = {
        .notifier_call = ftrace_module_notify_exit,
        .priority = INT_MIN,    /* Run after anything that can remove kprobes */
@@ -4403,10 +4388,6 @@ void __init ftrace_init(void)
                                  __start_mcount_loc,
                                  __stop_mcount_loc);
-        ret = register_module_notifier(&ftrace_module_enter_nb);
-        if (ret)
-                pr_warning("Failed to register trace ftrace module enter notifier\n");
        ret = register_module_notifier(&ftrace_module_exit_nb);
        if (ret)
                pr_warning("Failed to register trace ftrace module exit notifier\n");
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 925f537f07d1..4747b476a030 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -77,7 +77,7 @@ event_triggers_call(struct ftrace_event_file *file, void *rec)
                        data->ops->func(data);
                        continue;
                }
-                filter = rcu_dereference(data->filter);
+                filter = rcu_dereference_sched(data->filter);
                if (filter && !filter_match_preds(filter, rec))
                        continue;
                if (data->cmd_ops->post_trigger) {
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 5b781d2be383..ffd56351b521 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -58,12 +58,16 @@ int ftrace_create_function_files(struct trace_array *tr,
 {
        int ret;
-        /* The top level array uses the "global_ops". */
+        /*
-        if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) {
+         * The top level array uses the "global_ops", and the files are
-                ret = allocate_ftrace_ops(tr);
+         * created on boot up.
-                if (ret)
+         */
-                        return ret;
+        if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
-        }
+                return 0;
+        ret = allocate_ftrace_ops(tr);
+        if (ret)
+                return ret;
        ftrace_create_filter_files(tr->ops, parent);
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 930e51462dc8..c082a7441345 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -732,9 +732,15 @@ static int uprobe_buffer_enable(void)
 static void uprobe_buffer_disable(void)
 {
+        int cpu;
        BUG_ON(!mutex_is_locked(&event_mutex));
        if (--uprobe_buffer_refcnt == 0) {
+                for_each_possible_cpu(cpu)
+                        free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer,
+                                                             cpu)->buf);
                free_percpu(uprobe_cpu_buffer);
                uprobe_cpu_buffer = NULL;
        }
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index ac5b23cf7212..6620e5837ce2 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -188,7 +188,6 @@ static int tracepoint_add_func(struct tracepoint *tp,
                WARN_ON_ONCE(1);
                return PTR_ERR(old);
        }
-        release_probes(old);
        /*
         * rcu_assign_pointer has a smp_wmb() which makes sure that the new
@@ -200,6 +199,7 @@ static int tracepoint_add_func(struct tracepoint *tp,
        rcu_assign_pointer(tp->funcs, tp_funcs);
        if (!static_key_enabled(&tp->key))
                static_key_slow_inc(&tp->key);
+        release_probes(old);
        return 0;
 }
@@ -221,7 +221,6 @@ static int tracepoint_remove_func(struct tracepoint *tp,
                WARN_ON_ONCE(1);
                return PTR_ERR(old);
        }
-        release_probes(old);
        if (!tp_funcs) {
                /* Removed last function */
@@ -232,6 +231,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
                        static_key_slow_dec(&tp->key);
        }
        rcu_assign_pointer(tp->funcs, tp_funcs);
+        release_probes(old);
        return 0;
 }
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 0d8f6023fd8d..bf71b4b2d632 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -152,7 +152,7 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
        /* Find the matching extent */
        extents = map->nr_extents;
-        smp_read_barrier_depends();
+        smp_rmb();
        for (idx = 0; idx < extents; idx++) {
                first = map->extent[idx].first;
                last = first + map->extent[idx].count - 1;
@@ -176,7 +176,7 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id)
        /* Find the matching extent */
        extents = map->nr_extents;
-        smp_read_barrier_depends();
+        smp_rmb();
        for (idx = 0; idx < extents; idx++) {
                first = map->extent[idx].first;
                last = first + map->extent[idx].count - 1;
@@ -199,7 +199,7 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id)
        /* Find the matching extent */
        extents = map->nr_extents;
-        smp_read_barrier_depends();
+        smp_rmb();
        for (idx = 0; idx < extents; idx++) {
                first = map->extent[idx].lower_first;
                last = first + map->extent[idx].count - 1;
@@ -615,9 +615,8 @@ static ssize_t map_write(struct file *file, const char __user *buf,
         * were written before the count of the extents.
         *
         * To achieve this smp_wmb() is used on guarantee the write
-         * order and smp_read_barrier_depends() is guaranteed that we
+         * order and smp_rmb() guarantees that we don't have crazy
-         * don't have crazy architectures returning stale data.
+         * architectures returning stale data.
-         *
         */
        mutex_lock(&id_map_mutex);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index e90089fd78e0..516203e665fc 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -138,7 +138,11 @@ static void __touch_watchdog(void)
 void touch_softlockup_watchdog(void)
 {
-        __this_cpu_write(watchdog_touch_ts, 0);
+        /*
+         * Preemption can be enabled.  It doesn't matter which CPU's timestamp
+         * gets zeroed here, so use the raw_ operation.
+         */
+        raw_cpu_write(watchdog_touch_ts, 0);
 }
 EXPORT_SYMBOL(touch_softlockup_watchdog);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0ee63af30bd1..8edc87185427 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1916,6 +1916,12 @@ static void send_mayday(struct work_struct *work)
        /* mayday mayday mayday */
        if (list_empty(&pwq->mayday_node)) {
+                /*
+                 * If @pwq is for an unbound wq, its base ref may be put at
+                 * any time due to an attribute change.  Pin @pwq until the
+                 * rescuer is done with it.
+                 */
+                get_pwq(pwq);
                list_add_tail(&pwq->mayday_node, &wq->maydays);
                wake_up_process(wq->rescuer->task);
        }
@@ -2398,6 +2404,7 @@ static int rescuer_thread(void *__rescuer)
        struct worker *rescuer = __rescuer;
        struct workqueue_struct *wq = rescuer->rescue_wq;
        struct list_head *scheduled = &rescuer->scheduled;
+        bool should_stop;
        set_user_nice(current, RESCUER_NICE_LEVEL);
@@ -2409,11 +2416,15 @@ static int rescuer_thread(void *__rescuer)
 repeat:
        set_current_state(TASK_INTERRUPTIBLE);
-        if (kthread_should_stop()) {
+        /*
-                __set_current_state(TASK_RUNNING);
+         * By the time the rescuer is requested to stop, the workqueue
-                rescuer->task->flags &= ~PF_WQ_WORKER;
+         * shouldn't have any work pending, but @wq->maydays may still have
-                return 0;
+         * pwq(s) queued.  This can happen by non-rescuer workers consuming
-        }
+         * all the work items before the rescuer got to them.  Go through
+         * @wq->maydays processing before acting on should_stop so that the
+         * list is always empty on exit.
+         */
+        should_stop = kthread_should_stop();
        /* see whether any pwq is asking for help */
        spin_lock_irq(&wq_mayday_lock);
@@ -2445,6 +2456,12 @@ repeat:
                process_scheduled_works(rescuer);
                /*
+                 * Put the reference grabbed by send_mayday().  @pool won't
+                 * go away while we're holding its lock.
+                 */
+                put_pwq(pwq);
+                /*
                 * Leave this pool.  If keep_working() is %true, notify a
                 * regular worker; otherwise, we end up with 0 concurrency
                 * and stalling the execution.
@@ -2459,6 +2476,12 @@ repeat:
        spin_unlock_irq(&wq_mayday_lock);
+        if (should_stop) {
+                __set_current_state(TASK_RUNNING);
+                rescuer->task->flags &= ~PF_WQ_WORKER;
+                return 0;
+        }
        /* rescuers should never participate in concurrency management */
        WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
        schedule();
@@ -4100,7 +4123,8 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
        if (!pwq) {
                pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
                           wq->name);
-                goto out_unlock;
+                mutex_lock(&wq->mutex);
+                goto use_dfl_pwq;
        }
        /*