Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c            | 110
-rw-r--r--  kernel/events/core.c       | 114
-rw-r--r--  kernel/fork.c              |   4
-rw-r--r--  kernel/irq/chip.c          |  33
-rw-r--r--  kernel/irq/handle.c        |   6
-rw-r--r--  kernel/irq/internals.h     |   2
-rw-r--r--  kernel/irq/irqdesc.c       |  20
-rw-r--r--  kernel/irq/irqdomain.c     |   1
-rw-r--r--  kernel/irq/manage.c        |  12
-rw-r--r--  kernel/irq/msi.c           |   6
-rw-r--r--  kernel/irq/proc.c          |  21
-rw-r--r--  kernel/irq/resend.c        |   2
-rw-r--r--  kernel/locking/lockdep.c   |  10
-rw-r--r--  kernel/locking/qspinlock.c |   2
-rw-r--r--  kernel/rcu/tree.c          |   5
-rw-r--r--  kernel/sched/core.c        |  75
-rw-r--r--  kernel/sched/sched.h       |   5
-rw-r--r--  kernel/sched/wait.c        |   7
-rw-r--r--  kernel/time/clockevents.c  |  42
-rw-r--r--  kernel/time/clocksource.c  |   2
-rw-r--r--  kernel/time/tick-common.c  |   1
-rw-r--r--  kernel/time/tick-sched.c   |  15
-rw-r--r--  kernel/time/timekeeping.c  |   4
-rw-r--r--  kernel/time/timer_list.c   |  54
-rw-r--r--  kernel/workqueue.c         |   8
25 files changed, 341 insertions(+), 220 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2cf0f79f1fc9..2c9eae6ad970 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -46,7 +46,6 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/rwsem.h>
-#include <linux/percpu-rwsem.h>
 #include <linux/string.h>
 #include <linux/sort.h>
 #include <linux/kmod.h>
@@ -104,8 +103,6 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
  */
 static DEFINE_SPINLOCK(release_agent_path_lock);
 
-struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
-
 #define cgroup_assert_mutex_or_rcu_locked()				\
 	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
 			   !lockdep_is_held(&cgroup_mutex),		\
@@ -874,6 +871,48 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 	return cset;
 }
 
+void cgroup_threadgroup_change_begin(struct task_struct *tsk)
+{
+	down_read(&tsk->signal->group_rwsem);
+}
+
+void cgroup_threadgroup_change_end(struct task_struct *tsk)
+{
+	up_read(&tsk->signal->group_rwsem);
+}
+
+/**
+ * threadgroup_lock - lock threadgroup
+ * @tsk: member task of the threadgroup to lock
+ *
+ * Lock the threadgroup @tsk belongs to. No new task is allowed to enter
+ * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or
+ * change ->group_leader/pid. This is useful for cases where the threadgroup
+ * needs to stay stable across blockable operations.
+ *
+ * fork and exit explicitly call threadgroup_change_{begin|end}() for
+ * synchronization. While held, no new task will be added to threadgroup
+ * and no existing live task will have its PF_EXITING set.
+ *
+ * de_thread() does threadgroup_change_{begin|end}() when a non-leader
+ * sub-thread becomes a new leader.
+ */
+static void threadgroup_lock(struct task_struct *tsk)
+{
+	down_write(&tsk->signal->group_rwsem);
+}
+
+/**
+ * threadgroup_unlock - unlock threadgroup
+ * @tsk: member task of the threadgroup to unlock
+ *
+ * Reverse threadgroup_lock().
+ */
+static inline void threadgroup_unlock(struct task_struct *tsk)
+{
+	up_write(&tsk->signal->group_rwsem);
+}
+
 static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
 {
 	struct cgroup *root_cgrp = kf_root->kn->priv;
@@ -2074,9 +2113,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
 	lockdep_assert_held(&css_set_rwsem);
 
 	/*
-	 * We are synchronized through cgroup_threadgroup_rwsem against
-	 * PF_EXITING setting such that we can't race against cgroup_exit()
-	 * changing the css_set to init_css_set and dropping the old one.
+	 * We are synchronized through threadgroup_lock() against PF_EXITING
+	 * setting such that we can't race against cgroup_exit() changing the
+	 * css_set to init_css_set and dropping the old one.
 	 */
 	WARN_ON_ONCE(tsk->flags & PF_EXITING);
 	old_cset = task_css_set(tsk);
@@ -2133,11 +2172,10 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
  * @src_cset and add it to @preloaded_csets, which should later be cleaned
  * up by cgroup_migrate_finish().
  *
- * This function may be called without holding cgroup_threadgroup_rwsem
- * even if the target is a process. Threads may be created and destroyed
- * but as long as cgroup_mutex is not dropped, no new css_set can be put
- * into play and the preloaded css_sets are guaranteed to cover all
- * migrations.
+ * This function may be called without holding threadgroup_lock even if the
+ * target is a process. Threads may be created and destroyed but as long
+ * as cgroup_mutex is not dropped, no new css_set can be put into play and
+ * the preloaded css_sets are guaranteed to cover all migrations.
  */
 static void cgroup_migrate_add_src(struct css_set *src_cset,
 				   struct cgroup *dst_cgrp,
@@ -2240,7 +2278,7 @@ err:
 * @threadgroup: whether @leader points to the whole process or a single task
 *
 * Migrate a process or task denoted by @leader to @cgrp. If migrating a
-* process, the caller must be holding cgroup_threadgroup_rwsem. The
+* process, the caller must be holding threadgroup_lock of @leader. The
 * caller is also responsible for invoking cgroup_migrate_add_src() and
 * cgroup_migrate_prepare_dst() on the targets before invoking this
 * function and following up with cgroup_migrate_finish().
@@ -2368,7 +2406,7 @@ out_release_tset:
 * @leader: the task or the leader of the threadgroup to be attached
 * @threadgroup: attach the whole threadgroup?
 *
-* Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
+* Call holding cgroup_mutex and threadgroup_lock of @leader.
 */
 static int cgroup_attach_task(struct cgroup *dst_cgrp,
 			      struct task_struct *leader, bool threadgroup)
@@ -2460,13 +2498,14 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 	if (!cgrp)
 		return -ENODEV;
 
-	percpu_down_write(&cgroup_threadgroup_rwsem);
+retry_find_task:
 	rcu_read_lock();
 	if (pid) {
 		tsk = find_task_by_vpid(pid);
 		if (!tsk) {
+			rcu_read_unlock();
 			ret = -ESRCH;
-			goto out_unlock_rcu;
+			goto out_unlock_cgroup;
 		}
 	} else {
 		tsk = current;
@@ -2482,23 +2521,37 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 	 */
 	if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
 		ret = -EINVAL;
-		goto out_unlock_rcu;
+		rcu_read_unlock();
+		goto out_unlock_cgroup;
 	}
 
 	get_task_struct(tsk);
 	rcu_read_unlock();
 
+	threadgroup_lock(tsk);
+	if (threadgroup) {
+		if (!thread_group_leader(tsk)) {
+			/*
+			 * a race with de_thread from another thread's exec()
+			 * may strip us of our leadership, if this happens,
+			 * there is no choice but to throw this task away and
+			 * try again; this is
+			 * "double-double-toil-and-trouble-check locking".
+			 */
+			threadgroup_unlock(tsk);
+			put_task_struct(tsk);
+			goto retry_find_task;
+		}
+	}
+
 	ret = cgroup_procs_write_permission(tsk, cgrp, of);
 	if (!ret)
 		ret = cgroup_attach_task(cgrp, tsk, threadgroup);
 
-	put_task_struct(tsk);
-	goto out_unlock_threadgroup;
+	threadgroup_unlock(tsk);
 
-out_unlock_rcu:
-	rcu_read_unlock();
-out_unlock_threadgroup:
-	percpu_up_write(&cgroup_threadgroup_rwsem);
+	put_task_struct(tsk);
+out_unlock_cgroup:
 	cgroup_kn_unlock(of->kn);
 	return ret ?: nbytes;
 }
@@ -2643,8 +2696,6 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 
 	lockdep_assert_held(&cgroup_mutex);
 
-	percpu_down_write(&cgroup_threadgroup_rwsem);
-
 	/* look up all csses currently attached to @cgrp's subtree */
 	down_read(&css_set_rwsem);
 	css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
@@ -2700,8 +2751,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 			goto out_finish;
 		last_task = task;
 
+		threadgroup_lock(task);
+		/* raced against de_thread() from another thread? */
+		if (!thread_group_leader(task)) {
+			threadgroup_unlock(task);
+			put_task_struct(task);
+			continue;
+		}
+
 		ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
 
+		threadgroup_unlock(task);
 		put_task_struct(task);
 
 		if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
@@ -2711,7 +2771,6 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 
 out_finish:
 	cgroup_migrate_finish(&preloaded_csets);
-	percpu_up_write(&cgroup_threadgroup_rwsem);
 	return ret;
 }
 
@@ -5024,7 +5083,6 @@ int __init cgroup_init(void)
 	unsigned long key;
 	int ssid, err;
 
-	BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f548f69c4299..b11756f9b6dc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1243,11 +1243,7 @@ static inline void perf_event__state_init(struct perf_event *event)
 		PERF_EVENT_STATE_INACTIVE;
 }
 
-/*
- * Called at perf_event creation and when events are attached/detached from a
- * group.
- */
-static void perf_event__read_size(struct perf_event *event)
+static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
 {
 	int entry = sizeof(u64); /* value */
 	int size = 0;
@@ -1263,7 +1259,7 @@ static void perf_event__read_size(struct perf_event *event)
 		entry += sizeof(u64);
 
 	if (event->attr.read_format & PERF_FORMAT_GROUP) {
-		nr += event->group_leader->nr_siblings;
+		nr += nr_siblings;
 		size += sizeof(u64);
 	}
 
@@ -1271,14 +1267,11 @@ static void perf_event__read_size(struct perf_event *event)
 	event->read_size = size;
 }
 
-static void perf_event__header_size(struct perf_event *event)
+static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
 {
 	struct perf_sample_data *data;
-	u64 sample_type = event->attr.sample_type;
 	u16 size = 0;
 
-	perf_event__read_size(event);
-
 	if (sample_type & PERF_SAMPLE_IP)
 		size += sizeof(data->ip);
 
@@ -1303,6 +1296,17 @@ static void perf_event__header_size(struct perf_event *event)
 	event->header_size = size;
 }
 
+/*
+ * Called at perf_event creation and when events are attached/detached from a
+ * group.
+ */
+static void perf_event__header_size(struct perf_event *event)
+{
+	__perf_event_read_size(event,
+			       event->group_leader->nr_siblings);
+	__perf_event_header_size(event, event->attr.sample_type);
+}
+
 static void perf_event__id_header_size(struct perf_event *event)
 {
 	struct perf_sample_data *data;
@@ -1330,6 +1334,27 @@ static void perf_event__id_header_size(struct perf_event *event)
 	event->id_header_size = size;
 }
 
+static bool perf_event_validate_size(struct perf_event *event)
+{
+	/*
+	 * The values computed here will be over-written when we actually
+	 * attach the event.
+	 */
+	__perf_event_read_size(event, event->group_leader->nr_siblings + 1);
+	__perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
+	perf_event__id_header_size(event);
+
+	/*
+	 * Sum the lot; should not exceed the 64k limit we have on records.
+	 * Conservative limit to allow for callchains and other variable fields.
+	 */
+	if (event->read_size + event->header_size +
+	    event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
+		return false;
+
+	return true;
+}
+
 static void perf_group_attach(struct perf_event *event)
 {
 	struct perf_event *group_leader = event->group_leader, *pos;
@@ -8297,13 +8322,35 @@ SYSCALL_DEFINE5(perf_event_open,
 
 	if (move_group) {
 		gctx = group_leader->ctx;
+		mutex_lock_double(&gctx->mutex, &ctx->mutex);
+	} else {
+		mutex_lock(&ctx->mutex);
+	}
 
+	if (!perf_event_validate_size(event)) {
+		err = -E2BIG;
+		goto err_locked;
+	}
+
+	/*
+	 * Must be under the same ctx::mutex as perf_install_in_context(),
+	 * because we need to serialize with concurrent event creation.
+	 */
+	if (!exclusive_event_installable(event, ctx)) {
+		/* exclusive and group stuff are assumed mutually exclusive */
+		WARN_ON_ONCE(move_group);
+
+		err = -EBUSY;
+		goto err_locked;
+	}
+
+	WARN_ON_ONCE(ctx->parent_ctx);
+
+	if (move_group) {
 		/*
 		 * See perf_event_ctx_lock() for comments on the details
 		 * of swizzling perf_event::ctx.
 		 */
-		mutex_lock_double(&gctx->mutex, &ctx->mutex);
-
 		perf_remove_from_context(group_leader, false);
 
 		list_for_each_entry(sibling, &group_leader->sibling_list,
@@ -8311,13 +8358,7 @@ SYSCALL_DEFINE5(perf_event_open,
 			perf_remove_from_context(sibling, false);
 			put_ctx(gctx);
 		}
-	} else {
-		mutex_lock(&ctx->mutex);
-	}
 
-	WARN_ON_ONCE(ctx->parent_ctx);
-
-	if (move_group) {
 		/*
 		 * Wait for everybody to stop referencing the events through
 		 * the old lists, before installing it on new lists.
@@ -8349,22 +8390,29 @@ SYSCALL_DEFINE5(perf_event_open,
 		perf_event__state_init(group_leader);
 		perf_install_in_context(ctx, group_leader, group_leader->cpu);
 		get_ctx(ctx);
-	}
 
-	if (!exclusive_event_installable(event, ctx)) {
-		err = -EBUSY;
-		mutex_unlock(&ctx->mutex);
-		fput(event_file);
-		goto err_context;
+		/*
+		 * Now that all events are installed in @ctx, nothing
+		 * references @gctx anymore, so drop the last reference we have
+		 * on it.
+		 */
+		put_ctx(gctx);
 	}
 
+	/*
+	 * Precalculate sample_data sizes; do while holding ctx::mutex such
+	 * that we're serialized against further additions and before
+	 * perf_install_in_context() which is the point the event is active and
+	 * can use these values.
+	 */
+	perf_event__header_size(event);
+	perf_event__id_header_size(event);
+
 	perf_install_in_context(ctx, event, event->cpu);
 	perf_unpin_context(ctx);
 
-	if (move_group) {
+	if (move_group)
 		mutex_unlock(&gctx->mutex);
-		put_ctx(gctx);
-	}
 	mutex_unlock(&ctx->mutex);
 
 	put_online_cpus();
@@ -8376,12 +8424,6 @@ SYSCALL_DEFINE5(perf_event_open,
 	mutex_unlock(&current->perf_event_mutex);
 
 	/*
-	 * Precalculate sample_data sizes
-	 */
-	perf_event__header_size(event);
-	perf_event__id_header_size(event);
-
-	/*
 	 * Drop the reference on the group_event after placing the
 	 * new event on the sibling_list. This ensures destruction
 	 * of the group leader will find the pointer to itself in
@@ -8391,6 +8433,12 @@ SYSCALL_DEFINE5(perf_event_open,
 	fd_install(event_fd, event_file);
 	return event_fd;
 
+err_locked:
+	if (move_group)
+		mutex_unlock(&gctx->mutex);
+	mutex_unlock(&ctx->mutex);
+/* err_file: */
+	fput(event_file);
 err_context:
 	perf_unpin_context(ctx);
 	put_ctx(ctx);
diff --git a/kernel/fork.c b/kernel/fork.c
index 7d5f0f118a63..2845623fb582 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1149,6 +1149,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	tty_audit_fork(sig);
 	sched_autogroup_fork(sig);
 
+#ifdef CONFIG_CGROUPS
+	init_rwsem(&sig->group_rwsem);
+#endif
+
 	sig->oom_score_adj = current->signal->oom_score_adj;
 	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
 
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6e40a9539763..e28169dd1c36 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -83,7 +83,7 @@ int irq_set_handler_data(unsigned int irq, void *data)
 
 	if (!desc)
 		return -EINVAL;
-	desc->irq_data.handler_data = data;
+	desc->irq_common_data.handler_data = data;
 	irq_put_desc_unlock(desc, flags);
 	return 0;
 }
@@ -105,7 +105,7 @@ int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
 
 	if (!desc)
 		return -EINVAL;
-	desc->irq_data.msi_desc = entry;
+	desc->irq_common_data.msi_desc = entry;
 	if (entry && !irq_offset)
 		entry->irq = irq_base;
 	irq_put_desc_unlock(desc, flags);
@@ -372,7 +372,6 @@ static bool irq_may_run(struct irq_desc *desc)
 
 /**
  * handle_simple_irq - Simple and software-decoded IRQs.
- * @irq: the interrupt number
  * @desc: the interrupt description structure for this irq
  *
  * Simple interrupts are either sent from a demultiplexing interrupt
@@ -382,8 +381,7 @@ static bool irq_may_run(struct irq_desc *desc)
  * Note: The caller is expected to handle the ack, clear, mask and
  * unmask issues if necessary.
  */
-void
-handle_simple_irq(unsigned int irq, struct irq_desc *desc)
+void handle_simple_irq(struct irq_desc *desc)
 {
 	raw_spin_lock(&desc->lock);
 
@@ -425,7 +423,6 @@ static void cond_unmask_irq(struct irq_desc *desc)
 
 /**
  * handle_level_irq - Level type irq handler
- * @irq: the interrupt number
  * @desc: the interrupt description structure for this irq
  *
  * Level type interrupts are active as long as the hardware line has
@@ -433,8 +430,7 @@ static void cond_unmask_irq(struct irq_desc *desc)
  * it after the associated handler has acknowledged the device, so the
  * interrupt line is back to inactive.
  */
-void
-handle_level_irq(unsigned int irq, struct irq_desc *desc)
+void handle_level_irq(struct irq_desc *desc)
 {
 	raw_spin_lock(&desc->lock);
 	mask_ack_irq(desc);
@@ -496,7 +492,6 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip)
 
 /**
  * handle_fasteoi_irq - irq handler for transparent controllers
- * @irq: the interrupt number
  * @desc: the interrupt description structure for this irq
  *
  * Only a single callback will be issued to the chip: an ->eoi()
@@ -504,8 +499,7 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip)
  * for modern forms of interrupt handlers, which handle the flow
 * details in hardware, transparently.
  */
-void
-handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
+void handle_fasteoi_irq(struct irq_desc *desc)
 {
 	struct irq_chip *chip = desc->irq_data.chip;
 
@@ -546,7 +540,6 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_irq);
 
 /**
  * handle_edge_irq - edge type IRQ handler
- * @irq: the interrupt number
  * @desc: the interrupt description structure for this irq
  *
  * Interrupt occures on the falling and/or rising edge of a hardware
@@ -560,8 +553,7 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_irq);
  * the handler was running. If all pending interrupts are handled, the
 * loop is left.
  */
-void
-handle_edge_irq(unsigned int irq, struct irq_desc *desc)
+void handle_edge_irq(struct irq_desc *desc)
 {
 	raw_spin_lock(&desc->lock);
 
@@ -618,13 +610,12 @@ EXPORT_SYMBOL(handle_edge_irq);
 #ifdef CONFIG_IRQ_EDGE_EOI_HANDLER
 /**
  * handle_edge_eoi_irq - edge eoi type IRQ handler
- * @irq: the interrupt number
  * @desc: the interrupt description structure for this irq
  *
  * Similar as the above handle_edge_irq, but using eoi and w/o the
 * mask/unmask logic.
  */
-void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc)
+void handle_edge_eoi_irq(struct irq_desc *desc)
 {
 	struct irq_chip *chip = irq_desc_get_chip(desc);
 
@@ -665,13 +656,11 @@ out_eoi:
 
 /**
  * handle_percpu_irq - Per CPU local irq handler
- * @irq: the interrupt number
  * @desc: the interrupt description structure for this irq
 *
 * Per CPU interrupts on SMP machines without locking requirements
  */
-void
-handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
+void handle_percpu_irq(struct irq_desc *desc)
 {
 	struct irq_chip *chip = irq_desc_get_chip(desc);
 
@@ -688,7 +677,6 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
 
 /**
  * handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids
- * @irq: the interrupt number
  * @desc: the interrupt description structure for this irq
 *
 * Per CPU interrupts on SMP machines without locking requirements. Same as
@@ -698,11 +686,12 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
 * contain the real device id for the cpu on which this handler is
 * called
  */
-void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
+void handle_percpu_devid_irq(struct irq_desc *desc)
 {
 	struct irq_chip *chip = irq_desc_get_chip(desc);
 	struct irqaction *action = desc->action;
 	void *dev_id = raw_cpu_ptr(action->percpu_dev_id);
+	unsigned int irq = irq_desc_get_irq(desc);
 	irqreturn_t res;
 
 	kstat_incr_irqs_this_cpu(desc);
@@ -796,7 +785,7 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
 		return;
 
 	__irq_do_set_handler(desc, handle, 1, NULL);
-	desc->irq_data.handler_data = data;
+	desc->irq_common_data.handler_data = data;
 
 	irq_put_desc_busunlock(desc, flags);
 }
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index b6eeea8a80c5..e25a83b67cce 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -22,17 +22,19 @@
 
 /**
  * handle_bad_irq - handle spurious and unhandled irqs
- * @irq: the interrupt number
  * @desc: description of the interrupt
 *
 * Handles spurious and unhandled IRQ's. It also prints a debugmessage.
  */
-void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
+void handle_bad_irq(struct irq_desc *desc)
 {
+	unsigned int irq = irq_desc_get_irq(desc);
+
 	print_irq_desc(irq, desc);
 	kstat_incr_irqs_this_cpu(desc);
 	ack_bad_irq(irq);
 }
+EXPORT_SYMBOL_GPL(handle_bad_irq);
 
 /*
 * Special, empty irq handler:
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index eee4b385cffb..5ef0c2dbe930 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -194,7 +194,7 @@ static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
 
 static inline int irq_desc_get_node(struct irq_desc *desc)
 {
-	return irq_data_get_node(&desc->irq_data);
+	return irq_common_data_get_node(&desc->irq_common_data);
 }
 
 #ifdef CONFIG_PM_SLEEP
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 0a2a4b697bcb..239e2ae2c947 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -38,12 +38,13 @@ static void __init init_irq_default_affinity(void)
 #ifdef CONFIG_SMP
 static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
 {
-	if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node))
+	if (!zalloc_cpumask_var_node(&desc->irq_common_data.affinity,
+				     gfp, node))
 		return -ENOMEM;
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
-		free_cpumask_var(desc->irq_data.affinity);
+		free_cpumask_var(desc->irq_common_data.affinity);
 		return -ENOMEM;
 	}
 #endif
@@ -52,11 +53,13 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
 
 static void desc_smp_init(struct irq_desc *desc, int node)
 {
-	desc->irq_data.node = node;
-	cpumask_copy(desc->irq_data.affinity, irq_default_affinity);
+	cpumask_copy(desc->irq_common_data.affinity, irq_default_affinity);
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	cpumask_clear(desc->pending_mask);
 #endif
+#ifdef CONFIG_NUMA
+	desc->irq_common_data.node = node;
+#endif
 }
 
 #else
@@ -70,12 +73,13 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
 {
 	int cpu;
 
+	desc->irq_common_data.handler_data = NULL;
+	desc->irq_common_data.msi_desc = NULL;
+
 	desc->irq_data.common = &desc->irq_common_data;
 	desc->irq_data.irq = irq;
 	desc->irq_data.chip = &no_irq_chip;
 	desc->irq_data.chip_data = NULL;
-	desc->irq_data.handler_data = NULL;
-	desc->irq_data.msi_desc = NULL;
 	irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
 	irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
 	desc->handle_irq = handle_bad_irq;
@@ -121,7 +125,7 @@ static void free_masks(struct irq_desc *desc)
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	free_cpumask_var(desc->pending_mask);
 #endif
-	free_cpumask_var(desc->irq_data.affinity);
+	free_cpumask_var(desc->irq_common_data.affinity);
 }
 #else
 static inline void free_masks(struct irq_desc *desc) { }
@@ -343,7 +347,7 @@ int generic_handle_irq(unsigned int irq)
 
 	if (!desc)
 		return -EINVAL;
-	generic_handle_irq_desc(irq, desc);
+	generic_handle_irq_desc(desc);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(generic_handle_irq);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 79baaf8a7813..dc9d27c0c158 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -844,7 +844,6 @@ static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain,
 		child->parent_data = irq_data;
 		irq_data->irq = child->irq;
 		irq_data->common = child->common;
-		irq_data->node = child->node;
 		irq_data->domain = domain;
 	}
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ad1b064f94fe..f9a59f6cabd2 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -192,7 +192,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
 	switch (ret) {
 	case IRQ_SET_MASK_OK:
 	case IRQ_SET_MASK_OK_DONE:
-		cpumask_copy(data->affinity, mask);
+		cpumask_copy(desc->irq_common_data.affinity, mask);
 	case IRQ_SET_MASK_OK_NOCOPY:
 		irq_set_thread_affinity(desc);
 		ret = 0;
@@ -304,7 +304,7 @@ static void irq_affinity_notify(struct work_struct *work)
 	if (irq_move_pending(&desc->irq_data))
 		irq_get_pending(cpumask, desc);
 	else
-		cpumask_copy(cpumask, desc->irq_data.affinity);
+		cpumask_copy(cpumask, desc->irq_common_data.affinity);
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 
 	notify->notify(notify, cpumask);
@@ -375,9 +375,9 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)
 	 * one of the targets is online.
 	 */
 	if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
-		if (cpumask_intersects(desc->irq_data.affinity,
+		if (cpumask_intersects(desc->irq_common_data.affinity,
				       cpu_online_mask))
-			set = desc->irq_data.affinity;
+			set = desc->irq_common_data.affinity;
 		else
 			irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET);
 	}
@@ -829,8 +829,8 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
 	 * This code is triggered unconditionally. Check the affinity
 	 * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
 	 */
-	if (desc->irq_data.affinity)
-		cpumask_copy(mask, desc->irq_data.affinity);
+	if (desc->irq_common_data.affinity)
+		cpumask_copy(mask, desc->irq_common_data.affinity);
 	else
 		valid = false;
 	raw_spin_unlock_irq(&desc->lock);
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 7e6512b9dc1f..be9149f62eb8 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -228,11 +228,7 @@ static void msi_domain_update_chip_ops(struct msi_domain_info *info)
 {
 	struct irq_chip *chip = info->chip;
 
-	BUG_ON(!chip);
-	if (!chip->irq_mask)
-		chip->irq_mask = pci_msi_mask_irq;
-	if (!chip->irq_unmask)
-		chip->irq_unmask = pci_msi_unmask_irq;
+	BUG_ON(!chip || !chip->irq_mask || !chip->irq_unmask);
 	if (!chip->irq_set_affinity)
 		chip->irq_set_affinity = msi_domain_set_affinity;
 }
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 0e97c142ce40..a50ddc9417ff 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -12,6 +12,7 @@
 #include <linux/seq_file.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
+#include <linux/mutex.h>
 
 #include "internals.h"
 
@@ -39,7 +40,7 @@ static struct proc_dir_entry *root_irq_dir;
 static int show_irq_affinity(int type, struct seq_file *m, void *v)
 {
 	struct irq_desc *desc = irq_to_desc((long)m->private);
-	const struct cpumask *mask = desc->irq_data.affinity;
+	const struct cpumask *mask = desc->irq_common_data.affinity;
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	if (irqd_is_setaffinity_pending(&desc->irq_data))
@@ -323,18 +324,29 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
 
 void register_irq_proc(unsigned int irq, struct irq_desc *desc)
 {
+	static DEFINE_MUTEX(register_lock);
 	char name [MAX_NAMELEN];
 
-	if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir)
+	if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip))
 		return;
 
+	/*
+	 * irq directories are registered only when a handler is
+	 * added, not when the descriptor is created, so multiple
+	 * tasks might try to register at the same time.
+	 */
+	mutex_lock(&register_lock);
+
+	if (desc->dir)
+		goto out_unlock;
+
 	memset(name, 0, MAX_NAMELEN);
 	sprintf(name, "%d", irq);
 
 	/* create /proc/irq/1234 */
 	desc->dir = proc_mkdir(name, root_irq_dir);
 	if (!desc->dir)
-		return;
+		goto out_unlock;
 
 #ifdef CONFIG_SMP
 	/* create /proc/irq/<irq>/smp_affinity */
@@ -355,6 +367,9 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
 
 	proc_create_data("spurious", 0444, desc->dir,
 			 &irq_spurious_proc_fops, (void *)(long)irq);
+
+out_unlock:
+	mutex_unlock(&register_lock);
 }
 
 void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index dd95f44f99b2..b86886beee4f 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -38,7 +38,7 @@ static void resend_irqs(unsigned long arg)
 		clear_bit(irq, irqs_resend);
 		desc = irq_to_desc(irq);
 		local_irq_disable();
-		desc->handle_irq(irq, desc);
+		desc->handle_irq(desc);
 		local_irq_enable();
 	}
 }
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 8acfbf773e06..4e49cc4c9952 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3068,7 +3068,7 @@ static int __lock_is_held(struct lockdep_map *lock);
 static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 			  int trylock, int read, int check, int hardirqs_off,
 			  struct lockdep_map *nest_lock, unsigned long ip,
-			  int references)
+			  int references, int pin_count)
 {
 	struct task_struct *curr = current;
 	struct lock_class *class = NULL;
@@ -3157,7 +3157,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	hlock->waittime_stamp = 0;
 	hlock->holdtime_stamp = lockstat_clock();
 #endif
-	hlock->pin_count = 0;
+	hlock->pin_count = pin_count;
 
 	if (check && !mark_irqflags(curr, hlock))
 		return 0;
@@ -3343,7 +3343,7 @@ found_it:
 				hlock_class(hlock)->subclass, hlock->trylock,
 				hlock->read, hlock->check, hlock->hardirqs_off,
 				hlock->nest_lock, hlock->acquire_ip,
-				hlock->references))
+				hlock->references, hlock->pin_count))
 			return 0;
 	}
 
@@ -3433,7 +3433,7 @@ found_it:
 				hlock_class(hlock)->subclass, hlock->trylock,
 				hlock->read, hlock->check, hlock->hardirqs_off,
 				hlock->nest_lock, hlock->acquire_ip,
-				hlock->references))
+				hlock->references, hlock->pin_count))
 			return 0;
 	}
 
@@ -3583,7 +3583,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	current->lockdep_recursion = 1;
 	trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
 	__lock_acquire(lock, subclass, trylock, read, check,
-		       irqs_disabled_flags(flags), nest_lock, ip, 0);
+		       irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
 }
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 337c8818541d..87e9ce6a63c5 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -289,7 +289,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 	if (pv_enabled())
 		goto queue;
 
-	if (virt_queued_spin_lock(lock))
+	if (virt_spin_lock(lock))
 		return;
 
 	/*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9f75f25cc5d9..775d36cc0050 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3868,6 +3868,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
 static void __init
 rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
 {
+	static struct lock_class_key rcu_exp_sched_rdp_class;
 	unsigned long flags;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rcu_get_root(rsp);
@@ -3883,6 +3884,10 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
 	mutex_init(&rdp->exp_funnel_mutex);
 	rcu_boot_init_nocb_percpu_data(rdp);
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
+	if (rsp == &rcu_sched_state)
+		lockdep_set_class_and_name(&rdp->exp_funnel_mutex,
+					   &rcu_exp_sched_rdp_class,
+					   "rcu_data_exp_sched");
 }
 
 /*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3595403921bd..10a8faa1b0d4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -621,18 +621,21 @@ int get_nohz_timer_target(void)
 	int i, cpu = smp_processor_id();
 	struct sched_domain *sd;
 
-	if (!idle_cpu(cpu))
+	if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
 		return cpu;
 
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
 		for_each_cpu(i, sched_domain_span(sd)) {
-			if (!idle_cpu(i)) {
+			if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) {
 				cpu = i;
 				goto unlock;
 			}
 		}
 	}
+
+	if (!is_housekeeping_cpu(cpu))
+		cpu = housekeeping_any_cpu();
 unlock:
 	rcu_read_unlock();
 	return cpu;
@@ -2514,11 +2517,11 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
 	 * schedule one last time. The schedule call will never return, and
 	 * the scheduled task must drop that reference.
-	 * The test for TASK_DEAD must occur while the runqueue locks are
-	 * still held, otherwise prev could be scheduled on another cpu, die
-	 * there before we look at prev->state, and then the reference would
-	 * be dropped twice.
-	 *	 Manfred Spraul <manfred@colorfullife.com>
+	 *
+	 * We must observe prev->state before clearing prev->on_cpu (in
+	 * finish_lock_switch), otherwise a concurrent wakeup can get prev
+	 * running on another CPU and we could rave with its RUNNING -> DEAD
+	 * transition, resulting in a double drop.
 	 */
 	prev_state = prev->state;
 	vtime_task_switch(prev);
@@ -2666,13 +2669,20 @@ unsigned long nr_running(void)
 
 /*
 * Check if only the current task is running on the cpu.
+ *
+ * Caution: this function does not check that the caller has disabled
+ * preemption, thus the result might have a time-of-check-to-time-of-use
+ * race. The caller is responsible to use it correctly, for example:
+ *
+ * - from a non-preemptable section (of course)
+ *
+ * - from a thread that is bound to a single CPU
+ *
+ * - in a loop with very short iterations (e.g. a polling loop)
 */
 bool single_task_running(void)
 {
-	if (cpu_rq(smp_processor_id())->nr_running == 1)
-		return true;
-	else
-		return false;
+	return raw_rq()->nr_running == 1;
 }
 EXPORT_SYMBOL(single_task_running);
 
@@ -4924,7 +4934,15 @@ void init_idle(struct task_struct *idle, int cpu)
 	idle->state = TASK_RUNNING;
 	idle->se.exec_start = sched_clock();
 
-	do_set_cpus_allowed(idle, cpumask_of(cpu));
+#ifdef CONFIG_SMP
+	/*
+	 * Its possible that init_idle() gets called multiple times on a task,
+	 * in that case do_set_cpus_allowed() will not do the right thing.
+	 *
+	 * And since this is boot we can forgo the serialization.
+	 */
+	set_cpus_allowed_common(idle, cpumask_of(cpu));
+#endif
 	/*
 	 * We're having a chicken and egg problem, even though we are
 	 * holding rq->lock, the cpu isn't yet set to this cpu so the
@@ -4941,7 +4959,7 @@ void init_idle(struct task_struct *idle, int cpu)
 
 	rq->curr = rq->idle = idle;
 	idle->on_rq = TASK_ON_RQ_QUEUED;
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
 	idle->on_cpu = 1;
 #endif
 	raw_spin_unlock(&rq->lock);
@@ -4956,7 +4974,7 @@ void init_idle(struct task_struct *idle, int cpu)
 	idle->sched_class = &idle_sched_class;
 	ftrace_graph_init_idle_task(idle, cpu);
 	vtime_init_idle(idle, cpu);
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
 	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
 #endif
 }
@@ -5178,24 +5196,47 @@ static void migrate_tasks(struct rq *dead_rq)
 			break;
 
 		/*
-		 * Ensure rq->lock covers the entire task selection
-		 * until the migration.
+		 * pick_next_task assumes pinned rq->lock.
 		 */
 		lockdep_pin_lock(&rq->lock);
 		next = pick_next_task(rq, &fake_task);
 		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
 
+		/*
+		 * Rules for changing task_struct::cpus_allowed are holding
+		 * both pi_lock and rq->lock, such that holding either
+		 * stabilizes the mask.
+		 *
+		 * Drop rq->lock is not quite as disastrous as it usually is
+		 * because !cpu_active at this point, which means load-balance
+		 * will not interfere. Also, stop-machine.
+		 */
+		lockdep_unpin_lock(&rq->lock);
+		raw_spin_unlock(&rq->lock);
+		raw_spin_lock(&next->pi_lock);
+		raw_spin_lock(&rq->lock);
+
+		/*
+		 * Since we're inside stop-machine, _nothing_ should have
+		 * changed the task, WARN if weird stuff happened, because in
+		 * that case the above rq->lock drop is a fail too.
+		 */
+		if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
+			raw_spin_unlock(&next->pi_lock);
+			continue;
+		}
+
 		/* Find suitable destination for @next, with force if needed. */
 		dest_cpu = select_fallback_rq(dead_rq->cpu, next);
 
-		lockdep_unpin_lock(&rq->lock);
 		rq = __migrate_task(rq, next, dest_cpu);
 		if (rq != dead_rq) {
 			raw_spin_unlock(&rq->lock);
 			rq = dead_rq;
 			raw_spin_lock(&rq->lock);
 		}
+		raw_spin_unlock(&next->pi_lock);
 	}
 
 	rq->stop = stop;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 68cda117574c..6d2a119c7ad9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1078,9 +1078,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
 	 * We must ensure this doesn't happen until the switch is completely
 	 * finished.
+	 *
+	 * Pairs with the control dependency and rmb in try_to_wake_up().
 	 */
-	smp_wmb();
-	prev->on_cpu = 0;
+	smp_store_release(&prev->on_cpu, 0);
 #endif
 #ifdef CONFIG_DEBUG_SPINLOCK
 	/* this is a valid case when another task releases the spinlock */
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 272d9322bc5d..052e02672d12 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -106,10 +106,9 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked);
 
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr,
-			  void *key)
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
 {
-	__wake_up_common(q, mode, nr, 0, key);
+	__wake_up_common(q, mode, 1, 0, key);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 
@@ -284,7 +283,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
 	if (!list_empty(&wait->task_list))
 		list_del_init(&wait->task_list);
 	else if (waitqueue_active(q))
-		__wake_up_locked_key(q, mode, 1, key);
+		__wake_up_locked_key(q, mode, key);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL(abort_exclusive_wait);
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 50eb107f1198..a9b76a40319e 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -97,20 +97,6 @@ EXPORT_SYMBOL_GPL(clockevent_delta2ns);
97static int __clockevents_switch_state(struct clock_event_device *dev, 97static int __clockevents_switch_state(struct clock_event_device *dev,
98 enum clock_event_state state) 98 enum clock_event_state state)
99{ 99{
100 /* Transition with legacy set_mode() callback */
101 if (dev->set_mode) {
102 /* Legacy callback doesn't support new modes */
103 if (state > CLOCK_EVT_STATE_ONESHOT)
104 return -ENOSYS;
105 /*
106 * 'clock_event_state' and 'clock_event_mode' have 1-to-1
107 * mapping until *_ONESHOT, and so a simple cast will work.
108 */
109 dev->set_mode((enum clock_event_mode)state, dev);
110 dev->mode = (enum clock_event_mode)state;
111 return 0;
112 }
113
114 if (dev->features & CLOCK_EVT_FEAT_DUMMY) 100 if (dev->features & CLOCK_EVT_FEAT_DUMMY)
115 return 0; 101 return 0;
116 102
@@ -204,12 +190,8 @@ int clockevents_tick_resume(struct clock_event_device *dev)
204{ 190{
205 int ret = 0; 191 int ret = 0;
206 192
207 if (dev->set_mode) { 193 if (dev->tick_resume)
208 dev->set_mode(CLOCK_EVT_MODE_RESUME, dev);
209 dev->mode = CLOCK_EVT_MODE_RESUME;
210 } else if (dev->tick_resume) {
211 ret = dev->tick_resume(dev); 194 ret = dev->tick_resume(dev);
212 }
213 195
214 return ret; 196 return ret;
215} 197}
@@ -460,26 +442,6 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu)
460} 442}
461EXPORT_SYMBOL_GPL(clockevents_unbind_device); 443EXPORT_SYMBOL_GPL(clockevents_unbind_device);
462 444
463/* Sanity check of state transition callbacks */
464static int clockevents_sanity_check(struct clock_event_device *dev)
465{
466 /* Legacy set_mode() callback */
467 if (dev->set_mode) {
468 /* We shouldn't be supporting new modes now */
469 WARN_ON(dev->set_state_periodic || dev->set_state_oneshot ||
470 dev->set_state_shutdown || dev->tick_resume ||
471 dev->set_state_oneshot_stopped);
472
473 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
474 return 0;
475 }
476
477 if (dev->features & CLOCK_EVT_FEAT_DUMMY)
478 return 0;
479
480 return 0;
481}
482
483/** 445/**
484 * clockevents_register_device - register a clock event device 446 * clockevents_register_device - register a clock event device
485 * @dev: device to register 447 * @dev: device to register
@@ -488,8 +450,6 @@ void clockevents_register_device(struct clock_event_device *dev)
488{ 450{
489 unsigned long flags; 451 unsigned long flags;
490 452
491 BUG_ON(clockevents_sanity_check(dev));
492
493 /* Initialize state to DETACHED */ 453 /* Initialize state to DETACHED */
494 clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); 454 clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
495 455
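With the legacy set_mode() path and its sanity check removed, every clock event driver is expected to provide the per-state callbacks. A hedged driver-side sketch of the post-conversion shape (the foo_* names are hypothetical; the struct fields and clockevents_config_and_register() are real):

	#include <linux/clockchips.h>

	static int foo_shutdown(struct clock_event_device *evt)
	{
		/* stop the hardware timer */
		return 0;
	}

	static int foo_set_periodic(struct clock_event_device *evt)
	{
		/* program a fixed reload value for HZ ticks */
		return 0;
	}

	static int foo_set_oneshot(struct clock_event_device *evt)
	{
		/* switch the hardware to one-shot mode */
		return 0;
	}

	static int foo_next_event(unsigned long delta, struct clock_event_device *evt)
	{
		/* arm the hardware to fire after 'delta' cycles */
		return 0;
	}

	static struct clock_event_device foo_clockevent = {
		.name			= "foo",
		.features		= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
		.set_state_shutdown	= foo_shutdown,
		.set_state_periodic	= foo_set_periodic,
		.set_state_oneshot	= foo_set_oneshot,
		.set_next_event		= foo_next_event,
		/* no .set_mode: mixing it with the callbacks above is exactly
		 * what the removed clockevents_sanity_check() used to reject */
	};

	/* e.g. clockevents_config_and_register(&foo_clockevent, rate, 0xf, 0x7fffffff); */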
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 841b72f720e8..3a38775b50c2 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -217,7 +217,7 @@ static void clocksource_watchdog(unsigned long data)
217 continue; 217 continue;
218 218
219 /* Check the deviation from the watchdog clocksource. */ 219 /* Check the deviation from the watchdog clocksource. */
220 if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { 220 if (abs64(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
221 pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n", 221 pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n",
222 cs->name); 222 cs->name);
223 pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", 223 pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
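The abs() -> abs64() switch matters because cs_nsec - wd_nsec is a signed 64-bit delta and the abs() macro of this era truncated it through an int on 32-bit builds, so a large skew could wrap into a small value and slip under WATCHDOG_THRESHOLD; the same conversion appears again in timekeeping_freqadjust() further down. A userspace sketch of the failure mode (illustrative; these are not the kernel's macro definitions):

	#include <stdint.h>
	#include <stdio.h>

	/* roughly what a 32-bit abs() did to a 64-bit argument: truncate first */
	static int32_t abs_truncating(int64_t v) { int32_t x = (int32_t)v; return x < 0 ? -x : x; }
	static int64_t abs_64(int64_t v)         { return v < 0 ? -v : v; }

	int main(void)
	{
		int64_t skew = 4300000000LL;	/* 4.3 s of skew, in nanoseconds */

		/* truncated: ~5 ms, which would sneak under a ~62 ms watchdog
		 * threshold; 64-bit: the real 4.3 s, correctly flagged as unstable */
		printf("%d vs %lld\n", abs_truncating(skew), (long long)abs_64(skew));
		return 0;
	}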
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index d11c55b6ab7d..4fcd99e12aa0 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -398,7 +398,6 @@ void tick_shutdown(unsigned int cpu)
398 * the set mode function! 398 * the set mode function!
399 */ 399 */
400 clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); 400 clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
401 dev->mode = CLOCK_EVT_MODE_UNUSED;
402 clockevents_exchange_device(dev, NULL); 401 clockevents_exchange_device(dev, NULL);
403 dev->event_handler = clockevents_handle_noop; 402 dev->event_handler = clockevents_handle_noop;
404 td->evtdev = NULL; 403 td->evtdev = NULL;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3319e16f31e5..7c7ec4515983 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -290,16 +290,17 @@ static int __init tick_nohz_full_setup(char *str)
290__setup("nohz_full=", tick_nohz_full_setup); 290__setup("nohz_full=", tick_nohz_full_setup);
291 291
292static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, 292static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
293 unsigned long action, 293 unsigned long action,
294 void *hcpu) 294 void *hcpu)
295{ 295{
296 unsigned int cpu = (unsigned long)hcpu; 296 unsigned int cpu = (unsigned long)hcpu;
297 297
298 switch (action & ~CPU_TASKS_FROZEN) { 298 switch (action & ~CPU_TASKS_FROZEN) {
299 case CPU_DOWN_PREPARE: 299 case CPU_DOWN_PREPARE:
300 /* 300 /*
301 * If we handle the timekeeping duty for full dynticks CPUs, 301 * The boot CPU handles housekeeping duty (unbound timers,
302 * we can't safely shutdown that CPU. 302 * workqueues, timekeeping, ...) on behalf of full dynticks
303 * CPUs. It must remain online when nohz full is enabled.
303 */ 304 */
304 if (tick_nohz_full_running && tick_do_timer_cpu == cpu) 305 if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
305 return NOTIFY_BAD; 306 return NOTIFY_BAD;
@@ -370,6 +371,12 @@ void __init tick_nohz_init(void)
370 cpu_notifier(tick_nohz_cpu_down_callback, 0); 371 cpu_notifier(tick_nohz_cpu_down_callback, 0);
371 pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", 372 pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
372 cpumask_pr_args(tick_nohz_full_mask)); 373 cpumask_pr_args(tick_nohz_full_mask));
374
375 /*
376 * We need at least one CPU to handle housekeeping work such
377 * as timekeeping, unbound timers, workqueues, ...
378 */
379 WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
373} 380}
374#endif 381#endif
375 382
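The WARN_ON_ONCE() added above encodes the rule that nohz_full must never cover every CPU: at least one housekeeping CPU has to stay behind for timekeeping, unbound timers and workqueues, which is also why the CPU-down callback refuses to offline the boot CPU. Under that assumption, booting an 8-CPU box with nohz_full=1-7 leaves CPU 0 as the housekeeper. A sketch of the relationship (the helper name is made up; the cpumask API is real):

	#include <linux/cpumask.h>
	#include <linux/kernel.h>

	/* housekeeping = possible CPUs that are not in the nohz_full set */
	static void derive_housekeeping(struct cpumask *housekeeping,
					const struct cpumask *nohz_full)
	{
		cpumask_andnot(housekeeping, cpu_possible_mask, nohz_full);
		WARN_ON_ONCE(cpumask_empty(housekeeping));	/* must never be empty */
	}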
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index f6ee2e6b6f5d..44d2cc0436f4 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1251,7 +1251,7 @@ void __init timekeeping_init(void)
1251 set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec); 1251 set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec);
1252 tk_set_wall_to_mono(tk, tmp); 1252 tk_set_wall_to_mono(tk, tmp);
1253 1253
1254 timekeeping_update(tk, TK_MIRROR); 1254 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
1255 1255
1256 write_seqcount_end(&tk_core.seq); 1256 write_seqcount_end(&tk_core.seq);
1257 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1257 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -1614,7 +1614,7 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk,
1614 negative = (tick_error < 0); 1614 negative = (tick_error < 0);
1615 1615
1616 /* Sort out the magnitude of the correction */ 1616 /* Sort out the magnitude of the correction */
1617 tick_error = abs(tick_error); 1617 tick_error = abs64(tick_error);
1618 for (adj = 0; tick_error > interval; adj++) 1618 for (adj = 0; tick_error > interval; adj++)
1619 tick_error >>= 1; 1619 tick_error >>= 1;
1620 1620
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 129c96033e46..f75e35b60149 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -225,7 +225,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
225 (unsigned long long) dev->min_delta_ns); 225 (unsigned long long) dev->min_delta_ns);
226 SEQ_printf(m, " mult: %u\n", dev->mult); 226 SEQ_printf(m, " mult: %u\n", dev->mult);
227 SEQ_printf(m, " shift: %u\n", dev->shift); 227 SEQ_printf(m, " shift: %u\n", dev->shift);
228 SEQ_printf(m, " mode: %d\n", dev->mode); 228 SEQ_printf(m, " mode: %d\n", clockevent_get_state(dev));
229 SEQ_printf(m, " next_event: %Ld nsecs\n", 229 SEQ_printf(m, " next_event: %Ld nsecs\n",
230 (unsigned long long) ktime_to_ns(dev->next_event)); 230 (unsigned long long) ktime_to_ns(dev->next_event));
231 231
@@ -233,40 +233,34 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
233 print_name_offset(m, dev->set_next_event); 233 print_name_offset(m, dev->set_next_event);
234 SEQ_printf(m, "\n"); 234 SEQ_printf(m, "\n");
235 235
236 if (dev->set_mode) { 236 if (dev->set_state_shutdown) {
237 SEQ_printf(m, " set_mode: "); 237 SEQ_printf(m, " shutdown: ");
238 print_name_offset(m, dev->set_mode); 238 print_name_offset(m, dev->set_state_shutdown);
239 SEQ_printf(m, "\n"); 239 SEQ_printf(m, "\n");
240 } else { 240 }
241 if (dev->set_state_shutdown) {
242 SEQ_printf(m, " shutdown: ");
243 print_name_offset(m, dev->set_state_shutdown);
244 SEQ_printf(m, "\n");
245 }
246 241
247 if (dev->set_state_periodic) { 242 if (dev->set_state_periodic) {
248 SEQ_printf(m, " periodic: "); 243 SEQ_printf(m, " periodic: ");
249 print_name_offset(m, dev->set_state_periodic); 244 print_name_offset(m, dev->set_state_periodic);
250 SEQ_printf(m, "\n"); 245 SEQ_printf(m, "\n");
251 } 246 }
252 247
253 if (dev->set_state_oneshot) { 248 if (dev->set_state_oneshot) {
254 SEQ_printf(m, " oneshot: "); 249 SEQ_printf(m, " oneshot: ");
255 print_name_offset(m, dev->set_state_oneshot); 250 print_name_offset(m, dev->set_state_oneshot);
256 SEQ_printf(m, "\n"); 251 SEQ_printf(m, "\n");
257 } 252 }
258 253
259 if (dev->set_state_oneshot_stopped) { 254 if (dev->set_state_oneshot_stopped) {
260 SEQ_printf(m, " oneshot stopped: "); 255 SEQ_printf(m, " oneshot stopped: ");
261 print_name_offset(m, dev->set_state_oneshot_stopped); 256 print_name_offset(m, dev->set_state_oneshot_stopped);
262 SEQ_printf(m, "\n"); 257 SEQ_printf(m, "\n");
263 } 258 }
264 259
265 if (dev->tick_resume) { 260 if (dev->tick_resume) {
266 SEQ_printf(m, " resume: "); 261 SEQ_printf(m, " resume: ");
267 print_name_offset(m, dev->tick_resume); 262 print_name_offset(m, dev->tick_resume);
268 SEQ_printf(m, "\n"); 263 SEQ_printf(m, "\n");
269 }
270 } 264 }
271 265
272 SEQ_printf(m, " event_handler: "); 266 SEQ_printf(m, " event_handler: ");
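Since dev->mode is on its way out, the "mode:" line now prints clockevent_get_state(); for readers of /proc/timer_list, the numeric value is the clock_event_state enumeration, roughly as below (a reference sketch, worth checking against include/linux/clockchips.h of this kernel), so e.g. "mode: 3" means one-shot:

	enum clock_event_state {
		CLOCK_EVT_STATE_DETACHED,		/* 0: not used by the tick code   */
		CLOCK_EVT_STATE_SHUTDOWN,		/* 1: stopped                     */
		CLOCK_EVT_STATE_PERIODIC,		/* 2: periodic tick               */
		CLOCK_EVT_STATE_ONESHOT,		/* 3: one-shot / highres mode     */
		CLOCK_EVT_STATE_ONESHOT_STOPPED,	/* 4: one-shot, currently stopped */
	};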
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ca71582fcfab..bcb14cafe007 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1458,13 +1458,13 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
1458 timer_stats_timer_set_start_info(&dwork->timer); 1458 timer_stats_timer_set_start_info(&dwork->timer);
1459 1459
1460 dwork->wq = wq; 1460 dwork->wq = wq;
1461	/* timer isn't guaranteed to run on this CPU, record it earlier */
1462 if (cpu == WORK_CPU_UNBOUND)
1463 cpu = raw_smp_processor_id();
1461 dwork->cpu = cpu; 1464 dwork->cpu = cpu;
1462 timer->expires = jiffies + delay; 1465 timer->expires = jiffies + delay;
1463 1466
1464 if (unlikely(cpu != WORK_CPU_UNBOUND)) 1467 add_timer_on(timer, cpu);
1465 add_timer_on(timer, cpu);
1466 else
1467 add_timer(timer);
1468} 1468}
1469 1469
1470/** 1470/**
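The hunk above resolves WORK_CPU_UNBOUND to the submitting CPU before arming the timer, so dwork->cpu always names a real CPU and add_timer_on() keeps the timer (and therefore the later queueing of the work) on that CPU. A hedged caller-side sketch (the foo_* names are made up; the workqueue API is the kernel's):

	#include <linux/workqueue.h>
	#include <linux/jiffies.h>

	static void foo_work_fn(struct work_struct *work)
	{
		/* with the change above, an unbound delayed work's timer fires on
		 * the CPU that queued it, so the work is queued from there too */
	}

	static DECLARE_DELAYED_WORK(foo_dwork, foo_work_fn);

	static void foo_kick(void)
	{
		/* the caller does not pick a CPU; __queue_delayed_work() now records
		 * the local CPU instead of leaving dwork->cpu as WORK_CPU_UNBOUND */
		queue_delayed_work(system_wq, &foo_dwork, msecs_to_jiffies(100));
	}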