Diffstat (limited to 'kernel')
63 files changed, 3351 insertions, 1727 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 29a7b2cc593e..a220fdb66568 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3806,10 +3806,7 @@ static void *pidlist_allocate(int count)
 
 static void pidlist_free(void *p)
 {
-	if (is_vmalloc_addr(p))
-		vfree(p);
-	else
-		kfree(p);
+	kvfree(p);
 }
 
 /*
@@ -5040,6 +5037,9 @@ int __init cgroup_init(void)
 			WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
 			WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
 		}
+
+		if (ss->bind)
+			ss->bind(init_css_set.subsys[ssid]);
 	}
 
 	cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
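A note on the first cgroup.c hunk above: kvfree() is safe there because the buffer may come from either kmalloc() or vmalloc(). A minimal sketch of that allocation/free pairing, for illustration only (the real pidlist_allocate() in cgroup.c uses a different size threshold; the "example_" names and PAGE_SIZE cutoff are assumptions):

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/vmalloc.h>

/* Illustrative allocator: small lists from the slab, large ones from vmalloc.
 * kvfree() releases either kind, which is what lets pidlist_free() shrink to
 * a single call. */
static void *example_pidlist_allocate(int count)
{
	if (count * sizeof(pid_t) > PAGE_SIZE)
		return vmalloc(count * sizeof(pid_t));
	else
		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
}

static void example_pidlist_free(void *p)
{
	kvfree(p);	/* works for both kmalloc()ed and vmalloc()ed memory */
}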
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 57858cebd6b5..94bbe4695232 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -20,6 +20,7 @@
 #include <linux/gfp.h>
 #include <linux/suspend.h>
 #include <linux/lockdep.h>
+#include <linux/tick.h>
 #include <trace/events/power.h>
 
 #include "smpboot.h"
@@ -338,6 +339,8 @@ static int __ref take_cpu_down(void *_param)
 		return err;
 
 	cpu_notify(CPU_DYING | param->mod, param->hcpu);
+	/* Give up timekeeping duties */
+	tick_handover_do_timer();
 	/* Park the stopper thread */
 	kthread_park(current);
 	return 0;
@@ -413,10 +416,12 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */
 	per_cpu(cpu_dead_idle, cpu) = false;
 
+	hotplug_cpu__broadcast_tick_pull(cpu);
 	/* This actually kills the CPU. */
 	__cpu_die(cpu);
 
 	/* CPU is completely dead: tell everyone. Too late to complain. */
+	tick_cleanup_dead_cpu(cpu);
 	cpu_notify_nofail(CPU_DEAD | mod, hcpu);
 
 	check_for_tasks(cpu);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index fc7f4748d34a..c68f0721df10 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -622,6 +622,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	int csn;		/* how many cpuset ptrs in csa so far */
 	int i, j, k;		/* indices for partition finding loops */
 	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
+	cpumask_var_t non_isolated_cpus;  /* load balanced CPUs */
 	struct sched_domain_attr *dattr;  /* attributes for custom domains */
 	int ndoms = 0;		/* number of sched domains in result */
 	int nslot;		/* next empty doms[] struct cpumask slot */
@@ -631,6 +632,10 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	dattr = NULL;
 	csa = NULL;
 
+	if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
+		goto done;
+	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+
 	/* Special case for the 99% of systems with one, full, sched domain */
 	if (is_sched_load_balance(&top_cpuset)) {
 		ndoms = 1;
@@ -643,7 +648,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
 			*dattr = SD_ATTR_INIT;
 			update_domain_attr_tree(dattr, &top_cpuset);
 		}
-		cpumask_copy(doms[0], top_cpuset.effective_cpus);
+		cpumask_and(doms[0], top_cpuset.effective_cpus,
+				     non_isolated_cpus);
 
 		goto done;
 	}
@@ -666,7 +672,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
 		 * the corresponding sched domain.
 		 */
 		if (!cpumask_empty(cp->cpus_allowed) &&
-		    !is_sched_load_balance(cp))
+		    !(is_sched_load_balance(cp) &&
+		      cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
 			continue;
 
 		if (is_sched_load_balance(cp))
@@ -748,6 +755,7 @@ restart:
 
 			if (apn == b->pn) {
 				cpumask_or(dp, dp, b->effective_cpus);
+				cpumask_and(dp, dp, non_isolated_cpus);
 				if (dattr)
 					update_domain_attr_tree(dattr + nslot, b);
 
@@ -760,6 +768,7 @@ restart:
 	BUG_ON(nslot != ndoms);
 
 done:
+	free_cpumask_var(non_isolated_cpus);
 	kfree(csa);
 
 	/*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 453ef61311d4..2fabc0627165 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4574,6 +4574,13 @@ static void perf_pending_event(struct irq_work *entry)
 {
 	struct perf_event *event = container_of(entry,
 			struct perf_event, pending);
+	int rctx;
+
+	rctx = perf_swevent_get_recursion_context();
+	/*
+	 * If we 'fail' here, that's OK, it means recursion is already disabled
+	 * and we won't recurse 'further'.
+	 */
 
 	if (event->pending_disable) {
 		event->pending_disable = 0;
@@ -4584,6 +4591,9 @@ static void perf_pending_event(struct irq_work *entry)
 		event->pending_wakeup = 0;
 		perf_event_wakeup(event);
 	}
+
+	if (rctx >= 0)
+		perf_swevent_put_recursion_context(rctx);
 }
 
 /*
diff --git a/kernel/futex.c b/kernel/futex.c
index 2a5e3830e953..2579e407ff67 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -900,7 +900,7 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
 	if (!p)
 		return -ESRCH;
 
-	if (!p->mm) {
+	if (unlikely(p->flags & PF_KTHREAD)) {
 		put_task_struct(p);
 		return -EPERM;
 	}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6f1c7a566b95..eb9a4ea394ab 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -948,6 +948,22 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data)
 
 	return -ENOSYS;
 }
+
+/**
+ * irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt
+ * @data:	Pointer to interrupt specific data
+ * @on:		Whether to set or reset the wake-up capability of this irq
+ *
+ * Conditional, as the underlying parent chip might not implement it.
+ */
+int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on)
+{
+	data = data->parent_data;
+	if (data->chip->irq_set_wake)
+		return data->chip->irq_set_wake(data, on);
+
+	return -ENOSYS;
+}
 #endif
 
 /**
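A hedged usage sketch for the new irq_chip_set_wake_parent() helper (illustrative, not taken from this patch): a child irqchip in an IRQ domain hierarchy that has no wake-up logic of its own can simply forward .irq_set_wake to its parent, the same way the existing *_parent helpers forward mask/unmask/eoi. The "example" names below are assumptions.

#include <linux/irq.h>

/* Illustrative child chip in a hierarchical IRQ domain; only a few callbacks
 * are shown. The *_parent helpers, including the new
 * irq_chip_set_wake_parent(), delegate the operation to the parent chip. */
static struct irq_chip example_child_irq_chip = {
	.name		= "example-child",
	.irq_mask	= irq_chip_mask_parent,
	.irq_unmask	= irq_chip_unmask_parent,
	.irq_eoi	= irq_chip_eoi_parent,
	.irq_set_wake	= irq_chip_set_wake_parent,
};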
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 886d09e691d5..e68932bb308e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -68,14 +68,20 @@ static void __synchronize_hardirq(struct irq_desc *desc)
  *	Do not use this for shutdown scenarios where you must be sure
  *	that all parts (hardirq and threaded handler) have completed.
  *
+ *	Returns: false if a threaded handler is active.
+ *
  *	This function may be called - with care - from IRQ context.
  */
-void synchronize_hardirq(unsigned int irq)
+bool synchronize_hardirq(unsigned int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	if (desc)
+	if (desc) {
 		__synchronize_hardirq(desc);
+		return !atomic_read(&desc->threads_active);
+	}
+
+	return true;
 }
 EXPORT_SYMBOL(synchronize_hardirq);
 
@@ -440,6 +446,32 @@ void disable_irq(unsigned int irq)
 }
 EXPORT_SYMBOL(disable_irq);
 
+/**
+ *	disable_hardirq - disables an irq and waits for hardirq completion
+ *	@irq: Interrupt to disable
+ *
+ *	Disable the selected interrupt line. Enables and Disables are
+ *	nested.
+ *	This function waits for any pending hard IRQ handlers for this
+ *	interrupt to complete before returning. If you use this function while
+ *	holding a resource the hard IRQ handler may need you will deadlock.
+ *
+ *	When used to optimistically disable an interrupt from atomic context
+ *	the return value must be checked.
+ *
+ *	Returns: false if a threaded handler is active.
+ *
+ *	This function may be called - with care - from IRQ context.
+ */
+bool disable_hardirq(unsigned int irq)
+{
+	if (!__disable_irq_nosync(irq))
+		return synchronize_hardirq(irq);
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(disable_hardirq);
+
 void __enable_irq(struct irq_desc *desc, unsigned int irq)
 {
 	switch (desc->depth) {
@@ -1766,3 +1798,94 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
 
 	return retval;
 }
+
+/**
+ *	irq_get_irqchip_state - returns the irqchip state of a interrupt.
+ *	@irq: Interrupt line that is forwarded to a VM
+ *	@which: One of IRQCHIP_STATE_* the caller wants to know about
+ *	@state: a pointer to a boolean where the state is to be storeed
+ *
+ *	This call snapshots the internal irqchip state of an
+ *	interrupt, returning into @state the bit corresponding to
+ *	stage @which
+ *
+ *	This function should be called with preemption disabled if the
+ *	interrupt controller has per-cpu registers.
+ */
+int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
+			  bool *state)
+{
+	struct irq_desc *desc;
+	struct irq_data *data;
+	struct irq_chip *chip;
+	unsigned long flags;
+	int err = -EINVAL;
+
+	desc = irq_get_desc_buslock(irq, &flags, 0);
+	if (!desc)
+		return err;
+
+	data = irq_desc_get_irq_data(desc);
+
+	do {
+		chip = irq_data_get_irq_chip(data);
+		if (chip->irq_get_irqchip_state)
+			break;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+		data = data->parent_data;
+#else
+		data = NULL;
+#endif
+	} while (data);
+
+	if (data)
+		err = chip->irq_get_irqchip_state(data, which, state);
+
+	irq_put_desc_busunlock(desc, flags);
+	return err;
+}
+
+/**
+ *	irq_set_irqchip_state - set the state of a forwarded interrupt.
+ *	@irq: Interrupt line that is forwarded to a VM
+ *	@which: State to be restored (one of IRQCHIP_STATE_*)
+ *	@val: Value corresponding to @which
+ *
+ *	This call sets the internal irqchip state of an interrupt,
+ *	depending on the value of @which.
+ *
+ *	This function should be called with preemption disabled if the
+ *	interrupt controller has per-cpu registers.
+ */
+int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
+			  bool val)
+{
+	struct irq_desc *desc;
+	struct irq_data *data;
+	struct irq_chip *chip;
+	unsigned long flags;
+	int err = -EINVAL;
+
+	desc = irq_get_desc_buslock(irq, &flags, 0);
+	if (!desc)
+		return err;
+
+	data = irq_desc_get_irq_data(desc);
+
+	do {
+		chip = irq_data_get_irq_chip(data);
+		if (chip->irq_set_irqchip_state)
+			break;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+		data = data->parent_data;
+#else
+		data = NULL;
+#endif
+	} while (data);
+
+	if (data)
+		err = chip->irq_set_irqchip_state(data, which, val);
+
+	irq_put_desc_busunlock(desc, flags);
+	return err;
+}
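For illustration only (not part of the patch): the new state accessors are aimed at hypervisor/VFIO-style users that forward an interrupt line to a guest and need to save and later restore its pending state. The function name and flow below are assumptions; IRQCHIP_STATE_PENDING comes from the enum irqchip_irq_state introduced alongside this series.

#include <linux/interrupt.h>
#include <linux/irq.h>

/* Illustrative sketch: snapshot and re-inject the PENDING state of a
 * forwarded interrupt. Error handling is minimal. */
static int example_save_and_restore_pending(unsigned int irq)
{
	bool pending;
	int err;

	/* Snapshot the PENDING bit from the underlying irqchip. */
	err = irq_get_irqchip_state(irq, IRQCHIP_STATE_PENDING, &pending);
	if (err)
		return err;

	/* ... hand the line to the guest, later take it back ... */

	/* Write the saved state back into the irqchip. */
	return irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, pending);
}

The other addition in this file, disable_hardirq(), returns false while a threaded handler is still running, so a caller that optimistically disables an interrupt from atomic context can check the result instead of blocking on the threaded part.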
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 3e18163f336f..474de5cb394d 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -310,8 +310,15 @@ void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
 	struct msi_desc *desc;
 
 	for_each_msi_entry(desc, dev) {
-		irq_domain_free_irqs(desc->irq, desc->nvec_used);
-		desc->irq = 0;
+		/*
+		 * We might have failed to allocate an MSI early
+		 * enough that there is no IRQ associated to this
+		 * entry. If that's the case, don't do anything.
+		 */
+		if (desc->irq) {
+			irq_domain_free_irqs(desc->irq, desc->nvec_used);
+			desc->irq = 0;
+		}
 	}
 }
 
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 3f9f1d6b4c2e..284e2691e380 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -335,32 +335,20 @@ unlock:
 	rcu_read_unlock();
 }
 
-static int klp_disable_func(struct klp_func *func)
+static void klp_disable_func(struct klp_func *func)
 {
 	struct klp_ops *ops;
-	int ret;
-
-	if (WARN_ON(func->state != KLP_ENABLED))
-		return -EINVAL;
 
-	if (WARN_ON(!func->old_addr))
-		return -EINVAL;
+	WARN_ON(func->state != KLP_ENABLED);
+	WARN_ON(!func->old_addr);
 
 	ops = klp_find_ops(func->old_addr);
 	if (WARN_ON(!ops))
-		return -EINVAL;
+		return;
 
 	if (list_is_singular(&ops->func_stack)) {
-		ret = unregister_ftrace_function(&ops->fops);
-		if (ret) {
-			pr_err("failed to unregister ftrace handler for function '%s' (%d)\n",
-			       func->old_name, ret);
-			return ret;
-		}
-
-		ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0);
-		if (ret)
-			pr_warn("function unregister succeeded but failed to clear the filter\n");
+		WARN_ON(unregister_ftrace_function(&ops->fops));
+		WARN_ON(ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0));
 
 		list_del_rcu(&func->stack_node);
 		list_del(&ops->node);
@@ -370,8 +358,6 @@ static int klp_disable_func(struct klp_func *func)
 	}
 
 	func->state = KLP_DISABLED;
-
-	return 0;
 }
 
 static int klp_enable_func(struct klp_func *func)
@@ -432,23 +418,15 @@ err:
 	return ret;
 }
 
-static int klp_disable_object(struct klp_object *obj)
+static void klp_disable_object(struct klp_object *obj)
 {
 	struct klp_func *func;
-	int ret;
 
-	for (func = obj->funcs; func->old_name; func++) {
-		if (func->state != KLP_ENABLED)
-			continue;
-
-		ret = klp_disable_func(func);
-		if (ret)
-			return ret;
-	}
+	for (func = obj->funcs; func->old_name; func++)
+		if (func->state == KLP_ENABLED)
+			klp_disable_func(func);
 
 	obj->state = KLP_DISABLED;
-
-	return 0;
 }
 
 static int klp_enable_object(struct klp_object *obj)
@@ -464,22 +442,19 @@ static int klp_enable_object(struct klp_object *obj)
 
 	for (func = obj->funcs; func->old_name; func++) {
 		ret = klp_enable_func(func);
-		if (ret)
-			goto unregister;
+		if (ret) {
+			klp_disable_object(obj);
+			return ret;
+		}
 	}
 	obj->state = KLP_ENABLED;
 
 	return 0;
-
-unregister:
-	WARN_ON(klp_disable_object(obj));
-	return ret;
 }
 
 static int __klp_disable_patch(struct klp_patch *patch)
 {
 	struct klp_object *obj;
-	int ret;
 
 	/* enforce stacking: only the last enabled patch can be disabled */
 	if (!list_is_last(&patch->list, &klp_patches) &&
@@ -489,12 +464,8 @@ static int __klp_disable_patch(struct klp_patch *patch)
 	pr_notice("disabling patch '%s'\n", patch->mod->name);
 
 	for (obj = patch->objs; obj->funcs; obj++) {
-		if (obj->state != KLP_ENABLED)
-			continue;
-
-		ret = klp_disable_object(obj);
-		if (ret)
-			return ret;
+		if (obj->state == KLP_ENABLED)
+			klp_disable_object(obj);
 	}
 
 	patch->state = KLP_DISABLED;
@@ -553,8 +524,6 @@ static int __klp_enable_patch(struct klp_patch *patch)
 	pr_notice("enabling patch '%s'\n", patch->mod->name);
 
 	for (obj = patch->objs; obj->funcs; obj++) {
-		klp_find_object_module(obj);
-
 		if (!klp_is_object_loaded(obj))
 			continue;
 
@@ -945,7 +914,6 @@ static void klp_module_notify_going(struct klp_patch *patch,
 {
 	struct module *pmod = patch->mod;
 	struct module *mod = obj->mod;
-	int ret;
 
 	if (patch->state == KLP_DISABLED)
 		goto disabled;
@@ -953,10 +921,7 @@ static void klp_module_notify_going(struct klp_patch *patch,
 	pr_notice("reverting patch '%s' on unloading module '%s'\n",
 		  pmod->name, mod->name);
 
-	ret = klp_disable_object(obj);
-	if (ret)
-		pr_warn("failed to revert patch '%s' on module '%s' (%d)\n",
-			pmod->name, mod->name, ret);
+	klp_disable_object(obj);
 
 disabled:
 	klp_free_object_loaded(obj);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 88d0d4420ad2..ba77ab5f64dd 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -633,7 +633,7 @@ static int count_matching_names(struct lock_class *new_class)
 	if (!new_class->name)
 		return 0;
 
-	list_for_each_entry(class, &all_lock_classes, lock_entry) {
+	list_for_each_entry_rcu(class, &all_lock_classes, lock_entry) {
 		if (new_class->key - new_class->subclass == class->key)
 			return class->name_version;
 		if (class->name && !strcmp(class->name, new_class->name))
@@ -700,10 +700,12 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 	hash_head = classhashentry(key);
 
 	/*
-	 * We can walk the hash lockfree, because the hash only
-	 * grows, and we are careful when adding entries to the end:
+	 * We do an RCU walk of the hash, see lockdep_free_key_range().
 	 */
-	list_for_each_entry(class, hash_head, hash_entry) {
+	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+		return NULL;
+
+	list_for_each_entry_rcu(class, hash_head, hash_entry) {
 		if (class->key == key) {
 			/*
 			 * Huh! same key, different name? Did someone trample
@@ -728,7 +730,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	struct lockdep_subclass_key *key;
 	struct list_head *hash_head;
 	struct lock_class *class;
-	unsigned long flags;
+
+	DEBUG_LOCKS_WARN_ON(!irqs_disabled());
 
 	class = look_up_lock_class(lock, subclass);
 	if (likely(class))
@@ -750,28 +753,26 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	key = lock->key->subkeys + subclass;
 	hash_head = classhashentry(key);
 
-	raw_local_irq_save(flags);
 	if (!graph_lock()) {
-		raw_local_irq_restore(flags);
 		return NULL;
 	}
 	/*
 	 * We have to do the hash-walk again, to avoid races
 	 * with another CPU:
 	 */
-	list_for_each_entry(class, hash_head, hash_entry)
+	list_for_each_entry_rcu(class, hash_head, hash_entry) {
 		if (class->key == key)
 			goto out_unlock_set;
+	}
+
 	/*
 	 * Allocate a new key from the static array, and add it to
 	 * the hash:
 	 */
 	if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
 		if (!debug_locks_off_graph_unlock()) {
-			raw_local_irq_restore(flags);
 			return NULL;
 		}
-		raw_local_irq_restore(flags);
 
 		print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");
 		dump_stack();
@@ -798,7 +799,6 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 
 	if (verbose(class)) {
 		graph_unlock();
-		raw_local_irq_restore(flags);
 
 		printk("\nnew class %p: %s", class->key, class->name);
 		if (class->name_version > 1)
@@ -806,15 +806,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 		printk("\n");
 		dump_stack();
 
-		raw_local_irq_save(flags);
 		if (!graph_lock()) {
-			raw_local_irq_restore(flags);
 			return NULL;
 		}
 	}
 out_unlock_set:
 	graph_unlock();
-	raw_local_irq_restore(flags);
 
 out_set_class_cache:
 	if (!subclass || force)
@@ -870,11 +867,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
 	entry->distance = distance;
 	entry->trace = *trace;
 	/*
-	 * Since we never remove from the dependency list, the list can
-	 * be walked lockless by other CPUs, it's only allocation
-	 * that must be protected by the spinlock. But this also means
-	 * we must make new entries visible only once writes to the
-	 * entry become visible - hence the RCU op:
+	 * Both allocation and removal are done under the graph lock; but
+	 * iteration is under RCU-sched; see look_up_lock_class() and
+	 * lockdep_free_key_range().
 	 */
 	list_add_tail_rcu(&entry->entry, head);
 
@@ -1025,7 +1020,9 @@ static int __bfs(struct lock_list *source_entry,
 		else
 			head = &lock->class->locks_before;
 
-		list_for_each_entry(entry, head, entry) {
+		DEBUG_LOCKS_WARN_ON(!irqs_disabled());
+
+		list_for_each_entry_rcu(entry, head, entry) {
 			if (!lock_accessed(entry)) {
 				unsigned int cq_depth;
 				mark_lock_accessed(entry, lock);
@@ -2022,7 +2019,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
 	 * We can walk it lock-free, because entries only get added
 	 * to the hash:
 	 */
-	list_for_each_entry(chain, hash_head, entry) {
+	list_for_each_entry_rcu(chain, hash_head, entry) {
 		if (chain->chain_key == chain_key) {
 cache_hit:
 			debug_atomic_inc(chain_lookup_hits);
@@ -2996,8 +2993,18 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
 	if (unlikely(!debug_locks))
 		return;
 
-	if (subclass)
+	if (subclass) {
+		unsigned long flags;
+
+		if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion))
+			return;
+
+		raw_local_irq_save(flags);
+		current->lockdep_recursion = 1;
 		register_lock_class(lock, subclass, 1);
+		current->lockdep_recursion = 0;
+		raw_local_irq_restore(flags);
+	}
 }
 EXPORT_SYMBOL_GPL(lockdep_init_map);
 
@@ -3887,9 +3894,17 @@ static inline int within(const void *addr, void *start, unsigned long size)
 	return addr >= start && addr < start + size;
 }
 
+/*
+ * Used in module.c to remove lock classes from memory that is going to be
+ * freed; and possibly re-used by other modules.
+ *
+ * We will have had one sync_sched() before getting here, so we're guaranteed
+ * nobody will look up these exact classes -- they're properly dead but still
+ * allocated.
+ */
 void lockdep_free_key_range(void *start, unsigned long size)
 {
-	struct lock_class *class, *next;
+	struct lock_class *class;
 	struct list_head *head;
 	unsigned long flags;
 	int i;
@@ -3905,7 +3920,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
 		head = classhash_table + i;
 		if (list_empty(head))
 			continue;
-		list_for_each_entry_safe(class, next, head, hash_entry) {
+		list_for_each_entry_rcu(class, head, hash_entry) {
 			if (within(class->key, start, size))
 				zap_class(class);
 			else if (within(class->name, start, size))
@@ -3916,11 +3931,25 @@ void lockdep_free_key_range(void *start, unsigned long size)
 	if (locked)
 		graph_unlock();
 	raw_local_irq_restore(flags);
+
+	/*
+	 * Wait for any possible iterators from look_up_lock_class() to pass
+	 * before continuing to free the memory they refer to.
+	 *
+	 * sync_sched() is sufficient because the read-side is IRQ disable.
+	 */
+	synchronize_sched();
+
+	/*
+	 * XXX at this point we could return the resources to the pool;
+	 * instead we leak them. We would need to change to bitmap allocators
+	 * instead of the linear allocators we have now.
+	 */
 }
 
 void lockdep_reset_lock(struct lockdep_map *lock)
 {
-	struct lock_class *class, *next;
+	struct lock_class *class;
 	struct list_head *head;
 	unsigned long flags;
 	int i, j;
@@ -3948,7 +3977,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
 		head = classhash_table + i;
 		if (list_empty(head))
 			continue;
-		list_for_each_entry_safe(class, next, head, hash_entry) {
+		list_for_each_entry_rcu(class, head, hash_entry) {
 			int match = 0;
 
 			for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index d1fe2ba5bac9..75e114bdf3f2 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -78,7 +78,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
 		 */
 		return;
 	}
-	ACCESS_ONCE(prev->next) = node;
+	WRITE_ONCE(prev->next, node);
 
 	/* Wait until the lock holder passes the lock down. */
 	arch_mcs_spin_lock_contended(&node->locked);
@@ -91,7 +91,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
 static inline
 void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
 {
-	struct mcs_spinlock *next = ACCESS_ONCE(node->next);
+	struct mcs_spinlock *next = READ_ONCE(node->next);
 
 	if (likely(!next)) {
 		/*
@@ -100,7 +100,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
 		if (likely(cmpxchg(lock, node, NULL) == node))
 			return;
 		/* Wait until the next pointer is set */
-		while (!(next = ACCESS_ONCE(node->next)))
+		while (!(next = READ_ONCE(node->next)))
 			cpu_relax_lowlatency();
 	}
 
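The ACCESS_ONCE() to READ_ONCE()/WRITE_ONCE() conversions in this and the following locking files are mechanical; the newer helpers make the direction of the access explicit and, unlike ACCESS_ONCE(), also handle non-scalar types. A hedged before/after sketch, assuming the struct mcs_spinlock from this header (the "example_" wrappers are made up):

#include <linux/compiler.h>

/* Old style, both directions through one macro:
 *	ACCESS_ONCE(prev->next) = node;
 *	next = ACCESS_ONCE(node->next);
 */
static void example_publish(struct mcs_spinlock *prev, struct mcs_spinlock *node)
{
	WRITE_ONCE(prev->next, node);	/* store, direction explicit */
}

static struct mcs_spinlock *example_read_next(struct mcs_spinlock *node)
{
	return READ_ONCE(node->next);	/* load, direction explicit */
}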
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 94674e5919cb..4cccea6b8934 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -25,7 +25,7 @@
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
 #include <linux/debug_locks.h>
-#include "mcs_spinlock.h"
+#include <linux/osq_lock.h>
 
 /*
  * In the DEBUG case we are using the "NULL fastpath" for mutexes,
@@ -217,44 +217,35 @@ ww_mutex_set_context_slowpath(struct ww_mutex *lock,
 }
 
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
-{
-	if (lock->owner != owner)
-		return false;
-
-	/*
-	 * Ensure we emit the owner->on_cpu, dereference _after_ checking
-	 * lock->owner still matches owner, if that fails, owner might
-	 * point to free()d memory, if it still matches, the rcu_read_lock()
-	 * ensures the memory stays valid.
-	 */
-	barrier();
-
-	return owner->on_cpu;
-}
-
 /*
  * Look out! "owner" is an entirely speculative pointer
  * access and not reliable.
  */
 static noinline
-int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
 {
+	bool ret = true;
+
 	rcu_read_lock();
-	while (owner_running(lock, owner)) {
-		if (need_resched())
+	while (lock->owner == owner) {
+		/*
+		 * Ensure we emit the owner->on_cpu, dereference _after_
+		 * checking lock->owner still matches owner. If that fails,
+		 * owner might point to freed memory. If it still matches,
+		 * the rcu_read_lock() ensures the memory stays valid.
+		 */
+		barrier();
+
+		if (!owner->on_cpu || need_resched()) {
+			ret = false;
 			break;
+		}
 
 		cpu_relax_lowlatency();
 	}
 	rcu_read_unlock();
 
-	/*
-	 * We break out the loop above on need_resched() and when the
-	 * owner changed, which is a sign for heavy contention. Return
-	 * success only when lock->owner is NULL.
-	 */
-	return lock->owner == NULL;
+	return ret;
 }
 
 /*
@@ -269,7 +260,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
 		return 0;
 
 	rcu_read_lock();
-	owner = ACCESS_ONCE(lock->owner);
+	owner = READ_ONCE(lock->owner);
 	if (owner)
 		retval = owner->on_cpu;
 	rcu_read_unlock();
@@ -343,7 +334,7 @@ static bool mutex_optimistic_spin(struct mutex *lock,
 			 * As such, when deadlock detection needs to be
 			 * performed the optimistic spinning cannot be done.
 			 */
-			if (ACCESS_ONCE(ww->ctx))
+			if (READ_ONCE(ww->ctx))
 				break;
 		}
 
@@ -351,7 +342,7 @@ static bool mutex_optimistic_spin(struct mutex *lock,
 		 * If there's an owner, wait for it to either
 		 * release the lock or go to sleep.
 		 */
-		owner = ACCESS_ONCE(lock->owner);
+		owner = READ_ONCE(lock->owner);
 		if (owner && !mutex_spin_on_owner(lock, owner))
 			break;
 
@@ -490,7 +481,7 @@ static inline int __sched
 __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
 {
 	struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
-	struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
+	struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
 
 	if (!hold_ctx)
 		return 0;
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index c112d00341b0..dc85ee23a26f 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -98,7 +98,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 
 	prev = decode_cpu(old);
 	node->prev = prev;
-	ACCESS_ONCE(prev->next) = node;
+	WRITE_ONCE(prev->next, node);
 
 	/*
 	 * Normally @prev is untouchable after the above store; because at that
@@ -109,7 +109,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 	 * cmpxchg in an attempt to undo our queueing.
 	 */
 
-	while (!ACCESS_ONCE(node->locked)) {
+	while (!READ_ONCE(node->locked)) {
 		/*
 		 * If we need to reschedule bail... so we can block.
 		 */
@@ -148,7 +148,7 @@ unqueue:
 		 * Or we race against a concurrent unqueue()'s step-B, in which
 		 * case its step-C will write us a new @node->prev pointer.
 		 */
-		prev = ACCESS_ONCE(node->prev);
+		prev = READ_ONCE(node->prev);
 	}
 
 	/*
@@ -170,8 +170,8 @@ unqueue:
 	 * it will wait in Step-A.
 	 */
 
-	ACCESS_ONCE(next->prev) = prev;
-	ACCESS_ONCE(prev->next) = next;
+	WRITE_ONCE(next->prev, prev);
+	WRITE_ONCE(prev->next, next);
 
 	return false;
 }
@@ -193,11 +193,11 @@ void osq_unlock(struct optimistic_spin_queue *lock)
 	node = this_cpu_ptr(&osq_node);
 	next = xchg(&node->next, NULL);
 	if (next) {
-		ACCESS_ONCE(next->locked) = 1;
+		WRITE_ONCE(next->locked, 1);
 		return;
 	}
 
 	next = osq_wait_next(lock, node, NULL);
 	if (next)
-		ACCESS_ONCE(next->locked) = 1;
+		WRITE_ONCE(next->locked, 1);
 }
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 6357265a31ad..b73279367087 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -349,7 +349,7 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
  *
  * @task:	the task owning the mutex (owner) for which a chain walk is
  *		probably needed
- * @deadlock_detect: do we have to carry out deadlock detection?
+ * @chwalk:	do we have to carry out deadlock detection?
  * @orig_lock:	the mutex (can be NULL if we are walking the chain to recheck
  *		things for a task that has just got its priority adjusted, and
  *		is waiting on a mutex)
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 2555ae15ec14..3a5048572065 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -85,6 +85,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
 
 		list_del(&waiter->list);
 		tsk = waiter->task;
+		/*
+		 * Make sure we do not wakeup the next reader before
+		 * setting the nil condition to grant the next reader;
+		 * otherwise we could miss the wakeup on the other
+		 * side and end up sleeping again. See the pairing
+		 * in rwsem_down_read_failed().
+		 */
 		smp_mb();
 		waiter->task = NULL;
 		wake_up_process(tsk);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 2f7cc4076f50..3417d0172a5d 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -14,8 +14,9 @@
 #include <linux/init.h>
 #include <linux/export.h>
 #include <linux/sched/rt.h>
+#include <linux/osq_lock.h>
 
-#include "mcs_spinlock.h"
+#include "rwsem.h"
 
 /*
  * Guide to the rw_semaphore's count field for common values.
@@ -186,6 +187,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
 		waiter = list_entry(next, struct rwsem_waiter, list);
 		next = waiter->list.next;
 		tsk = waiter->task;
+		/*
+		 * Make sure we do not wakeup the next reader before
+		 * setting the nil condition to grant the next reader;
+		 * otherwise we could miss the wakeup on the other
+		 * side and end up sleeping again. See the pairing
+		 * in rwsem_down_read_failed().
+		 */
 		smp_mb();
 		waiter->task = NULL;
 		wake_up_process(tsk);
@@ -258,6 +266,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
 			    RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
 			if (!list_is_singular(&sem->wait_list))
 				rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+			rwsem_set_owner(sem);
 			return true;
 		}
 
@@ -270,15 +279,17 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
  */
 static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
 {
-	long old, count = ACCESS_ONCE(sem->count);
+	long old, count = READ_ONCE(sem->count);
 
 	while (true) {
 		if (!(count == 0 || count == RWSEM_WAITING_BIAS))
 			return false;
 
 		old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
-		if (old == count)
+		if (old == count) {
+			rwsem_set_owner(sem);
 			return true;
+		}
 
 		count = old;
 	}
@@ -287,60 +298,67 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
 static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
 {
 	struct task_struct *owner;
-	bool on_cpu = false;
+	bool ret = true;
 
 	if (need_resched())
 		return false;
 
 	rcu_read_lock();
-	owner = ACCESS_ONCE(sem->owner);
-	if (owner)
-		on_cpu = owner->on_cpu;
-	rcu_read_unlock();
-
-	/*
-	 * If sem->owner is not set, yet we have just recently entered the
-	 * slowpath, then there is a possibility reader(s) may have the lock.
-	 * To be safe, avoid spinning in these situations.
-	 */
-	return on_cpu;
-}
-
-static inline bool owner_running(struct rw_semaphore *sem,
-				 struct task_struct *owner)
-{
-	if (sem->owner != owner)
-		return false;
-
-	/*
-	 * Ensure we emit the owner->on_cpu, dereference _after_ checking
-	 * sem->owner still matches owner, if that fails, owner might
-	 * point to free()d memory, if it still matches, the rcu_read_lock()
-	 * ensures the memory stays valid.
-	 */
-	barrier();
+	owner = READ_ONCE(sem->owner);
+	if (!owner) {
+		long count = READ_ONCE(sem->count);
+		/*
+		 * If sem->owner is not set, yet we have just recently entered the
+		 * slowpath with the lock being active, then there is a possibility
+		 * reader(s) may have the lock. To be safe, bail spinning in these
+		 * situations.
+		 */
+		if (count & RWSEM_ACTIVE_MASK)
+			ret = false;
+		goto done;
+	}
 
-	return owner->on_cpu;
+	ret = owner->on_cpu;
+done:
+	rcu_read_unlock();
+	return ret;
 }
 
 static noinline
 bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
 {
+	long count;
+
 	rcu_read_lock();
-	while (owner_running(sem, owner)) {
-		if (need_resched())
-			break;
+	while (sem->owner == owner) {
+		/*
+		 * Ensure we emit the owner->on_cpu, dereference _after_
+		 * checking sem->owner still matches owner, if that fails,
+		 * owner might point to free()d memory, if it still matches,
+		 * the rcu_read_lock() ensures the memory stays valid.
+		 */
+		barrier();
+
+		/* abort spinning when need_resched or owner is not running */
+		if (!owner->on_cpu || need_resched()) {
+			rcu_read_unlock();
+			return false;
+		}
 
 		cpu_relax_lowlatency();
 	}
 	rcu_read_unlock();
 
+	if (READ_ONCE(sem->owner))
+		return true; /* new owner, continue spinning */
+
 	/*
-	 * We break out the loop above on need_resched() or when the
-	 * owner changed, which is a sign for heavy contention. Return
-	 * success only when sem->owner is NULL.
+	 * When the owner is not set, the lock could be free or
+	 * held by readers. Check the counter to verify the
+	 * state.
 	 */
-	return sem->owner == NULL;
+	count = READ_ONCE(sem->count);
+	return (count == 0 || count == RWSEM_WAITING_BIAS);
 }
 
 static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
@@ -358,7 +376,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
 		goto done;
 
 	while (true) {
-		owner = ACCESS_ONCE(sem->owner);
+		owner = READ_ONCE(sem->owner);
 		if (owner && !rwsem_spin_on_owner(sem, owner))
 			break;
 
@@ -432,7 +450,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
 
 	/* we're now waiting on the lock, but no longer actively locking */
 	if (waiting) {
-		count = ACCESS_ONCE(sem->count);
+		count = READ_ONCE(sem->count);
 
 		/*
 		 * If there were already threads queued before us and there are
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index e2d3bc7f03b4..205be0ce34de 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -9,29 +9,9 @@
 #include <linux/sched.h>
 #include <linux/export.h>
 #include <linux/rwsem.h>
-
 #include <linux/atomic.h>
 
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-static inline void rwsem_set_owner(struct rw_semaphore *sem)
-{
-	sem->owner = current;
-}
-
-static inline void rwsem_clear_owner(struct rw_semaphore *sem)
-{
-	sem->owner = NULL;
-}
-
-#else
-static inline void rwsem_set_owner(struct rw_semaphore *sem)
-{
-}
-
-static inline void rwsem_clear_owner(struct rw_semaphore *sem)
-{
-}
-#endif
+#include "rwsem.h"
 
 /*
  * lock for reading
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
new file mode 100644
index 000000000000..870ed9a5b426
--- /dev/null
+++ b/kernel/locking/rwsem.h
@@ -0,0 +1,20 @@
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+	sem->owner = current;
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+	sem->owner = NULL;
+}
+
+#else
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+}
+#endif
diff --git a/kernel/module.c b/kernel/module.c index b3d634ed06c9..650b038ae520 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -1865,7 +1865,7 @@ static void free_module(struct module *mod) | |||
1865 | kfree(mod->args); | 1865 | kfree(mod->args); |
1866 | percpu_modfree(mod); | 1866 | percpu_modfree(mod); |
1867 | 1867 | ||
1868 | /* Free lock-classes: */ | 1868 | /* Free lock-classes; relies on the preceding sync_rcu(). */ |
1869 | lockdep_free_key_range(mod->module_core, mod->core_size); | 1869 | lockdep_free_key_range(mod->module_core, mod->core_size); |
1870 | 1870 | ||
1871 | /* Finally, free the core (containing the module structure) */ | 1871 | /* Finally, free the core (containing the module structure) */ |
@@ -2479,6 +2479,23 @@ static int elf_header_check(struct load_info *info) | |||
2479 | return 0; | 2479 | return 0; |
2480 | } | 2480 | } |
2481 | 2481 | ||
2482 | #define COPY_CHUNK_SIZE (16*PAGE_SIZE) | ||
2483 | |||
2484 | static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len) | ||
2485 | { | ||
2486 | do { | ||
2487 | unsigned long n = min(len, COPY_CHUNK_SIZE); | ||
2488 | |||
2489 | if (copy_from_user(dst, usrc, n) != 0) | ||
2490 | return -EFAULT; | ||
2491 | cond_resched(); | ||
2492 | dst += n; | ||
2493 | usrc += n; | ||
2494 | len -= n; | ||
2495 | } while (len); | ||
2496 | return 0; | ||
2497 | } | ||
2498 | |||
2482 | /* Sets info->hdr and info->len. */ | 2499 | /* Sets info->hdr and info->len. */ |
2483 | static int copy_module_from_user(const void __user *umod, unsigned long len, | 2500 | static int copy_module_from_user(const void __user *umod, unsigned long len, |
2484 | struct load_info *info) | 2501 | struct load_info *info) |
@@ -2498,7 +2515,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, | |||
2498 | if (!info->hdr) | 2515 | if (!info->hdr) |
2499 | return -ENOMEM; | 2516 | return -ENOMEM; |
2500 | 2517 | ||
2501 | if (copy_from_user(info->hdr, umod, info->len) != 0) { | 2518 | if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) { |
2502 | vfree(info->hdr); | 2519 | vfree(info->hdr); |
2503 | return -EFAULT; | 2520 | return -EFAULT; |
2504 | } | 2521 | } |
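
copy_chunked_from_user() above breaks one potentially huge copy_from_user() into 16-page chunks with a cond_resched() between them, so a large module load no longer runs unpreempted for the whole copy. A userspace sketch of the same chunking loop, with memcpy() standing in for copy_from_user() and sched_yield() for cond_resched() (purely illustrative):

#include <string.h>
#include <sched.h>

#define CHUNK_SIZE (16 * 4096UL)        /* mirrors COPY_CHUNK_SIZE = 16*PAGE_SIZE */

static int copy_chunked(void *dst, const void *src, unsigned long len)
{
        do {
                unsigned long n = len < CHUNK_SIZE ? len : CHUNK_SIZE;

                memcpy(dst, src, n);    /* kernel: copy_from_user(), which may fail */
                sched_yield();          /* kernel: cond_resched() between chunks    */
                dst = (char *)dst + n;
                src = (const char *)src + n;
                len -= n;
        } while (len);
        return 0;
}

int main(void)
{
        static char src[100000] = "payload", dst[100000];

        return copy_chunked(dst, src, sizeof(src));
}
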
@@ -2753,6 +2770,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) | |||
2753 | mod->trace_events = section_objs(info, "_ftrace_events", | 2770 | mod->trace_events = section_objs(info, "_ftrace_events", |
2754 | sizeof(*mod->trace_events), | 2771 | sizeof(*mod->trace_events), |
2755 | &mod->num_trace_events); | 2772 | &mod->num_trace_events); |
2773 | mod->trace_enums = section_objs(info, "_ftrace_enum_map", | ||
2774 | sizeof(*mod->trace_enums), | ||
2775 | &mod->num_trace_enums); | ||
2756 | #endif | 2776 | #endif |
2757 | #ifdef CONFIG_TRACING | 2777 | #ifdef CONFIG_TRACING |
2758 | mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", | 2778 | mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", |
@@ -3349,9 +3369,6 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
3349 | module_bug_cleanup(mod); | 3369 | module_bug_cleanup(mod); |
3350 | mutex_unlock(&module_mutex); | 3370 | mutex_unlock(&module_mutex); |
3351 | 3371 | ||
3352 | /* Free lock-classes: */ | ||
3353 | lockdep_free_key_range(mod->module_core, mod->core_size); | ||
3354 | |||
3355 | /* we can't deallocate the module until we clear memory protection */ | 3372 | /* we can't deallocate the module until we clear memory protection */ |
3356 | unset_module_init_ro_nx(mod); | 3373 | unset_module_init_ro_nx(mod); |
3357 | unset_module_core_ro_nx(mod); | 3374 | unset_module_core_ro_nx(mod); |
@@ -3375,6 +3392,9 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
3375 | synchronize_rcu(); | 3392 | synchronize_rcu(); |
3376 | mutex_unlock(&module_mutex); | 3393 | mutex_unlock(&module_mutex); |
3377 | free_module: | 3394 | free_module: |
3395 | /* Free lock-classes; relies on the preceding sync_rcu() */ | ||
3396 | lockdep_free_key_range(mod->module_core, mod->core_size); | ||
3397 | |||
3378 | module_deallocate(mod, info); | 3398 | module_deallocate(mod, info); |
3379 | free_copy: | 3399 | free_copy: |
3380 | free_copy(info); | 3400 | free_copy(info); |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index c24d5a23bf93..5235dd4e1e2f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -955,25 +955,6 @@ static void mark_nosave_pages(struct memory_bitmap *bm) | |||
955 | } | 955 | } |
956 | } | 956 | } |
957 | 957 | ||
958 | static bool is_nosave_page(unsigned long pfn) | ||
959 | { | ||
960 | struct nosave_region *region; | ||
961 | |||
962 | list_for_each_entry(region, &nosave_regions, list) { | ||
963 | if (pfn >= region->start_pfn && pfn < region->end_pfn) { | ||
964 | pr_err("PM: %#010llx in e820 nosave region: " | ||
965 | "[mem %#010llx-%#010llx]\n", | ||
966 | (unsigned long long) pfn << PAGE_SHIFT, | ||
967 | (unsigned long long) region->start_pfn << PAGE_SHIFT, | ||
968 | ((unsigned long long) region->end_pfn << PAGE_SHIFT) | ||
969 | - 1); | ||
970 | return true; | ||
971 | } | ||
972 | } | ||
973 | |||
974 | return false; | ||
975 | } | ||
976 | |||
977 | /** | 958 | /** |
978 | * create_basic_memory_bitmaps - create bitmaps needed for marking page | 959 | * create_basic_memory_bitmaps - create bitmaps needed for marking page |
979 | * frames that should not be saved and free page frames. The pointers | 960 | * frames that should not be saved and free page frames. The pointers |
@@ -2042,7 +2023,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) | |||
2042 | do { | 2023 | do { |
2043 | pfn = memory_bm_next_pfn(bm); | 2024 | pfn = memory_bm_next_pfn(bm); |
2044 | if (likely(pfn != BM_END_OF_MAP)) { | 2025 | if (likely(pfn != BM_END_OF_MAP)) { |
2045 | if (likely(pfn_valid(pfn)) && !is_nosave_page(pfn)) | 2026 | if (likely(pfn_valid(pfn))) |
2046 | swsusp_set_page_free(pfn_to_page(pfn)); | 2027 | swsusp_set_page_free(pfn_to_page(pfn)); |
2047 | else | 2028 | else |
2048 | return -EFAULT; | 2029 | return -EFAULT; |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f0f831e8a345..2f7937ee9e3a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -306,6 +306,9 @@ __read_mostly int scheduler_running; | |||
306 | */ | 306 | */ |
307 | int sysctl_sched_rt_runtime = 950000; | 307 | int sysctl_sched_rt_runtime = 950000; |
308 | 308 | ||
309 | /* cpus with isolated domains */ | ||
310 | cpumask_var_t cpu_isolated_map; | ||
311 | |||
309 | /* | 312 | /* |
310 | * this_rq_lock - lock this runqueue and disable interrupts. | 313 | * this_rq_lock - lock this runqueue and disable interrupts. |
311 | */ | 314 | */ |
@@ -690,6 +693,23 @@ static inline bool got_nohz_idle_kick(void) | |||
690 | bool sched_can_stop_tick(void) | 693 | bool sched_can_stop_tick(void) |
691 | { | 694 | { |
692 | /* | 695 | /* |
696 | * FIFO realtime policy runs the highest priority task. Other runnable | ||
697 | * tasks are of a lower priority. The scheduler tick does nothing. | ||
698 | */ | ||
699 | if (current->policy == SCHED_FIFO) | ||
700 | return true; | ||
701 | |||
702 | /* | ||
703 | * Round-robin realtime tasks time slice with other tasks at the same | ||
704 | * realtime priority. Is this task the only one at this priority? | ||
705 | */ | ||
706 | if (current->policy == SCHED_RR) { | ||
707 | struct sched_rt_entity *rt_se = ¤t->rt; | ||
708 | |||
709 | return rt_se->run_list.prev == rt_se->run_list.next; | ||
710 | } | ||
711 | |||
712 | /* | ||
693 | * More than one running task need preemption. | 713 | * More than one running task need preemption. |
694 | * nr_running update is assumed to be visible | 714 | * nr_running update is assumed to be visible |
695 | * after IPI is sent from wakers. | 715 | * after IPI is sent from wakers. |
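
For SCHED_RR, the hunk above lets the tick stop only when the task is the sole entity queued at its priority, detected by the list node's prev and next pointing at the same place (the list head). A self-contained sketch of that check on a plain circular doubly-linked list (not the kernel's list_head API):

#include <stdio.h>

struct node {
        struct node *prev, *next;
};

/* With a head plus exactly one entry, the entry's prev and next both
 * point at the head; a second entry makes them differ. */
static int only_entry(const struct node *n)
{
        return n->prev == n->next;
}

int main(void)
{
        struct node head, a, b;

        /* head <-> a : a is alone at this priority */
        head.next = &a; head.prev = &a;
        a.prev = &head; a.next = &head;
        printf("a alone: %d\n", only_entry(&a));        /* 1 */

        /* head <-> a <-> b : a now has a round-robin peer */
        head.next = &a; head.prev = &b;
        a.prev = &head; a.next = &b;
        b.prev = &a;    b.next = &head;
        printf("a alone: %d\n", only_entry(&a));        /* 0 */
        return 0;
}
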
@@ -996,6 +1016,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
996 | rq_clock_skip_update(rq, true); | 1016 | rq_clock_skip_update(rq, true); |
997 | } | 1017 | } |
998 | 1018 | ||
1019 | static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); | ||
1020 | |||
1021 | void register_task_migration_notifier(struct notifier_block *n) | ||
1022 | { | ||
1023 | atomic_notifier_chain_register(&task_migration_notifier, n); | ||
1024 | } | ||
1025 | |||
999 | #ifdef CONFIG_SMP | 1026 | #ifdef CONFIG_SMP |
1000 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 1027 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
1001 | { | 1028 | { |
@@ -1026,10 +1053,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
1026 | trace_sched_migrate_task(p, new_cpu); | 1053 | trace_sched_migrate_task(p, new_cpu); |
1027 | 1054 | ||
1028 | if (task_cpu(p) != new_cpu) { | 1055 | if (task_cpu(p) != new_cpu) { |
1056 | struct task_migration_notifier tmn; | ||
1057 | |||
1029 | if (p->sched_class->migrate_task_rq) | 1058 | if (p->sched_class->migrate_task_rq) |
1030 | p->sched_class->migrate_task_rq(p, new_cpu); | 1059 | p->sched_class->migrate_task_rq(p, new_cpu); |
1031 | p->se.nr_migrations++; | 1060 | p->se.nr_migrations++; |
1032 | perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); | 1061 | perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); |
1062 | |||
1063 | tmn.task = p; | ||
1064 | tmn.from_cpu = task_cpu(p); | ||
1065 | tmn.to_cpu = new_cpu; | ||
1066 | |||
1067 | atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); | ||
1033 | } | 1068 | } |
1034 | 1069 | ||
1035 | __set_task_cpu(p, new_cpu); | 1070 | __set_task_cpu(p, new_cpu); |
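
set_task_cpu() now fills a task_migration_notifier and fires an atomic notifier chain so interested code can react when a task changes CPUs. A tiny single-threaded sketch of the same register-then-broadcast idea (simplified; the kernel chain is lock-free and the struct names below are mine):

#include <stdio.h>

struct migration_event { int from_cpu, to_cpu; };

struct notifier {
        int (*fn)(const struct migration_event *ev);
        struct notifier *next;
};

static struct notifier *chain;

static void notifier_register(struct notifier *n)
{
        n->next = chain;                /* the kernel does this atomically */
        chain = n;
}

static void notifier_call_chain(const struct migration_event *ev)
{
        for (struct notifier *n = chain; n; n = n->next)
                n->fn(ev);
}

static int print_migration(const struct migration_event *ev)
{
        printf("migrated: cpu%d -> cpu%d\n", ev->from_cpu, ev->to_cpu);
        return 0;
}

int main(void)
{
        struct notifier nb = { .fn = print_migration };
        struct migration_event ev = { .from_cpu = 0, .to_cpu = 3 };

        notifier_register(&nb);
        notifier_call_chain(&ev);       /* what set_task_cpu() does on a real move */
        return 0;
}
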
@@ -3034,6 +3069,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3034 | } else { | 3069 | } else { |
3035 | if (dl_prio(oldprio)) | 3070 | if (dl_prio(oldprio)) |
3036 | p->dl.dl_boosted = 0; | 3071 | p->dl.dl_boosted = 0; |
3072 | if (rt_prio(oldprio)) | ||
3073 | p->rt.timeout = 0; | ||
3037 | p->sched_class = &fair_sched_class; | 3074 | p->sched_class = &fair_sched_class; |
3038 | } | 3075 | } |
3039 | 3076 | ||
@@ -5318,36 +5355,13 @@ static int sched_cpu_active(struct notifier_block *nfb, | |||
5318 | static int sched_cpu_inactive(struct notifier_block *nfb, | 5355 | static int sched_cpu_inactive(struct notifier_block *nfb, |
5319 | unsigned long action, void *hcpu) | 5356 | unsigned long action, void *hcpu) |
5320 | { | 5357 | { |
5321 | unsigned long flags; | ||
5322 | long cpu = (long)hcpu; | ||
5323 | struct dl_bw *dl_b; | ||
5324 | |||
5325 | switch (action & ~CPU_TASKS_FROZEN) { | 5358 | switch (action & ~CPU_TASKS_FROZEN) { |
5326 | case CPU_DOWN_PREPARE: | 5359 | case CPU_DOWN_PREPARE: |
5327 | set_cpu_active(cpu, false); | 5360 | set_cpu_active((long)hcpu, false); |
5328 | |||
5329 | /* explicitly allow suspend */ | ||
5330 | if (!(action & CPU_TASKS_FROZEN)) { | ||
5331 | bool overflow; | ||
5332 | int cpus; | ||
5333 | |||
5334 | rcu_read_lock_sched(); | ||
5335 | dl_b = dl_bw_of(cpu); | ||
5336 | |||
5337 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
5338 | cpus = dl_bw_cpus(cpu); | ||
5339 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | ||
5340 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
5341 | |||
5342 | rcu_read_unlock_sched(); | ||
5343 | |||
5344 | if (overflow) | ||
5345 | return notifier_from_errno(-EBUSY); | ||
5346 | } | ||
5347 | return NOTIFY_OK; | 5361 | return NOTIFY_OK; |
5362 | default: | ||
5363 | return NOTIFY_DONE; | ||
5348 | } | 5364 | } |
5349 | |||
5350 | return NOTIFY_DONE; | ||
5351 | } | 5365 | } |
5352 | 5366 | ||
5353 | static int __init migration_init(void) | 5367 | static int __init migration_init(void) |
@@ -5428,17 +5442,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
5428 | break; | 5442 | break; |
5429 | } | 5443 | } |
5430 | 5444 | ||
5431 | /* | ||
5432 | * Even though we initialize ->capacity to something semi-sane, | ||
5433 | * we leave capacity_orig unset. This allows us to detect if | ||
5434 | * domain iteration is still funny without causing /0 traps. | ||
5435 | */ | ||
5436 | if (!group->sgc->capacity_orig) { | ||
5437 | printk(KERN_CONT "\n"); | ||
5438 | printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n"); | ||
5439 | break; | ||
5440 | } | ||
5441 | |||
5442 | if (!cpumask_weight(sched_group_cpus(group))) { | 5445 | if (!cpumask_weight(sched_group_cpus(group))) { |
5443 | printk(KERN_CONT "\n"); | 5446 | printk(KERN_CONT "\n"); |
5444 | printk(KERN_ERR "ERROR: empty group\n"); | 5447 | printk(KERN_ERR "ERROR: empty group\n"); |
@@ -5811,9 +5814,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
5811 | update_top_cache_domain(cpu); | 5814 | update_top_cache_domain(cpu); |
5812 | } | 5815 | } |
5813 | 5816 | ||
5814 | /* cpus with isolated domains */ | ||
5815 | static cpumask_var_t cpu_isolated_map; | ||
5816 | |||
5817 | /* Setup the mask of cpus configured for isolated domains */ | 5817 | /* Setup the mask of cpus configured for isolated domains */ |
5818 | static int __init isolated_cpu_setup(char *str) | 5818 | static int __init isolated_cpu_setup(char *str) |
5819 | { | 5819 | { |
@@ -5922,7 +5922,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
5922 | * die on a /0 trap. | 5922 | * die on a /0 trap. |
5923 | */ | 5923 | */ |
5924 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); | 5924 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); |
5925 | sg->sgc->capacity_orig = sg->sgc->capacity; | ||
5926 | 5925 | ||
5927 | /* | 5926 | /* |
5928 | * Make sure the first group of this domain contains the | 5927 | * Make sure the first group of this domain contains the |
@@ -6233,6 +6232,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) | |||
6233 | */ | 6232 | */ |
6234 | 6233 | ||
6235 | if (sd->flags & SD_SHARE_CPUCAPACITY) { | 6234 | if (sd->flags & SD_SHARE_CPUCAPACITY) { |
6235 | sd->flags |= SD_PREFER_SIBLING; | ||
6236 | sd->imbalance_pct = 110; | 6236 | sd->imbalance_pct = 110; |
6237 | sd->smt_gain = 1178; /* ~15% */ | 6237 | sd->smt_gain = 1178; /* ~15% */ |
6238 | 6238 | ||
@@ -6998,7 +6998,6 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, | |||
6998 | */ | 6998 | */ |
6999 | 6999 | ||
7000 | case CPU_ONLINE: | 7000 | case CPU_ONLINE: |
7001 | case CPU_DOWN_FAILED: | ||
7002 | cpuset_update_active_cpus(true); | 7001 | cpuset_update_active_cpus(true); |
7003 | break; | 7002 | break; |
7004 | default: | 7003 | default: |
@@ -7010,8 +7009,30 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, | |||
7010 | static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, | 7009 | static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, |
7011 | void *hcpu) | 7010 | void *hcpu) |
7012 | { | 7011 | { |
7013 | switch (action) { | 7012 | unsigned long flags; |
7013 | long cpu = (long)hcpu; | ||
7014 | struct dl_bw *dl_b; | ||
7015 | |||
7016 | switch (action & ~CPU_TASKS_FROZEN) { | ||
7014 | case CPU_DOWN_PREPARE: | 7017 | case CPU_DOWN_PREPARE: |
7018 | /* explicitly allow suspend */ | ||
7019 | if (!(action & CPU_TASKS_FROZEN)) { | ||
7020 | bool overflow; | ||
7021 | int cpus; | ||
7022 | |||
7023 | rcu_read_lock_sched(); | ||
7024 | dl_b = dl_bw_of(cpu); | ||
7025 | |||
7026 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
7027 | cpus = dl_bw_cpus(cpu); | ||
7028 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | ||
7029 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
7030 | |||
7031 | rcu_read_unlock_sched(); | ||
7032 | |||
7033 | if (overflow) | ||
7034 | return notifier_from_errno(-EBUSY); | ||
7035 | } | ||
7015 | cpuset_update_active_cpus(false); | 7036 | cpuset_update_active_cpus(false); |
7016 | break; | 7037 | break; |
7017 | case CPU_DOWN_PREPARE_FROZEN: | 7038 | case CPU_DOWN_PREPARE_FROZEN: |
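
The admission check that moved into cpuset_cpu_inactive() refuses to take a CPU down when the remaining CPUs cannot carry the SCHED_DEADLINE bandwidth already admitted. A hedged simplification of that test (not the exact __dl_overflow() expression; bandwidths here are parts-per-1024 rather than the kernel's 20-bit fixed point):

#include <stdio.h>
#include <stdint.h>

#define BW_UNIT 1024

/* Removing a CPU overflows when the admitted bandwidth no longer fits
 * on the CPUs that will remain. */
static int dl_overflow_on_removal(uint64_t per_cpu_limit,
                                  uint64_t total_admitted,
                                  int cpus_remaining)
{
        return per_cpu_limit * cpus_remaining < total_admitted;
}

int main(void)
{
        uint64_t limit = (BW_UNIT * 95) / 100;  /* 95% per CPU, like the rt default */
        uint64_t admitted = 3 * (BW_UNIT / 2);  /* three tasks of 50% each          */

        printf("4->3 CPUs overflows: %d\n", dl_overflow_on_removal(limit, admitted, 3)); /* 0 */
        printf("2->1 CPU  overflows: %d\n", dl_overflow_on_removal(limit, admitted, 1)); /* 1 */
        return 0;
}
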
@@ -7156,8 +7177,8 @@ void __init sched_init(void) | |||
7156 | rq->calc_load_active = 0; | 7177 | rq->calc_load_active = 0; |
7157 | rq->calc_load_update = jiffies + LOAD_FREQ; | 7178 | rq->calc_load_update = jiffies + LOAD_FREQ; |
7158 | init_cfs_rq(&rq->cfs); | 7179 | init_cfs_rq(&rq->cfs); |
7159 | init_rt_rq(&rq->rt, rq); | 7180 | init_rt_rq(&rq->rt); |
7160 | init_dl_rq(&rq->dl, rq); | 7181 | init_dl_rq(&rq->dl); |
7161 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7182 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7162 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; | 7183 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; |
7163 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 7184 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
@@ -7197,7 +7218,7 @@ void __init sched_init(void) | |||
7197 | #ifdef CONFIG_SMP | 7218 | #ifdef CONFIG_SMP |
7198 | rq->sd = NULL; | 7219 | rq->sd = NULL; |
7199 | rq->rd = NULL; | 7220 | rq->rd = NULL; |
7200 | rq->cpu_capacity = SCHED_CAPACITY_SCALE; | 7221 | rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; |
7201 | rq->post_schedule = 0; | 7222 | rq->post_schedule = 0; |
7202 | rq->active_balance = 0; | 7223 | rq->active_balance = 0; |
7203 | rq->next_balance = jiffies; | 7224 | rq->next_balance = jiffies; |
@@ -7796,7 +7817,7 @@ static int sched_rt_global_constraints(void) | |||
7796 | } | 7817 | } |
7797 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7818 | #endif /* CONFIG_RT_GROUP_SCHED */ |
7798 | 7819 | ||
7799 | static int sched_dl_global_constraints(void) | 7820 | static int sched_dl_global_validate(void) |
7800 | { | 7821 | { |
7801 | u64 runtime = global_rt_runtime(); | 7822 | u64 runtime = global_rt_runtime(); |
7802 | u64 period = global_rt_period(); | 7823 | u64 period = global_rt_period(); |
@@ -7897,11 +7918,11 @@ int sched_rt_handler(struct ctl_table *table, int write, | |||
7897 | if (ret) | 7918 | if (ret) |
7898 | goto undo; | 7919 | goto undo; |
7899 | 7920 | ||
7900 | ret = sched_rt_global_constraints(); | 7921 | ret = sched_dl_global_validate(); |
7901 | if (ret) | 7922 | if (ret) |
7902 | goto undo; | 7923 | goto undo; |
7903 | 7924 | ||
7904 | ret = sched_dl_global_constraints(); | 7925 | ret = sched_rt_global_constraints(); |
7905 | if (ret) | 7926 | if (ret) |
7906 | goto undo; | 7927 | goto undo; |
7907 | 7928 | ||
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 3fa8fa6d9403..5e95145088fd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
@@ -69,7 +69,7 @@ void init_dl_bw(struct dl_bw *dl_b) | |||
69 | dl_b->total_bw = 0; | 69 | dl_b->total_bw = 0; |
70 | } | 70 | } |
71 | 71 | ||
72 | void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) | 72 | void init_dl_rq(struct dl_rq *dl_rq) |
73 | { | 73 | { |
74 | dl_rq->rb_root = RB_ROOT; | 74 | dl_rq->rb_root = RB_ROOT; |
75 | 75 | ||
@@ -218,6 +218,52 @@ static inline void set_post_schedule(struct rq *rq) | |||
218 | rq->post_schedule = has_pushable_dl_tasks(rq); | 218 | rq->post_schedule = has_pushable_dl_tasks(rq); |
219 | } | 219 | } |
220 | 220 | ||
221 | static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); | ||
222 | |||
223 | static void dl_task_offline_migration(struct rq *rq, struct task_struct *p) | ||
224 | { | ||
225 | struct rq *later_rq = NULL; | ||
226 | bool fallback = false; | ||
227 | |||
228 | later_rq = find_lock_later_rq(p, rq); | ||
229 | |||
230 | if (!later_rq) { | ||
231 | int cpu; | ||
232 | |||
233 | /* | ||
234 | * If we cannot preempt any rq, fall back to pick any | ||
235 | * online cpu. | ||
236 | */ | ||
237 | fallback = true; | ||
238 | cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); | ||
239 | if (cpu >= nr_cpu_ids) { | ||
240 | /* | ||
241 | * Failed to find any suitable cpu. | ||
242 | * The task will never come back! | ||
243 | */ | ||
244 | BUG_ON(dl_bandwidth_enabled()); | ||
245 | |||
246 | /* | ||
247 | * If admission control is disabled we | ||
248 | * try a little harder to let the task | ||
249 | * run. | ||
250 | */ | ||
251 | cpu = cpumask_any(cpu_active_mask); | ||
252 | } | ||
253 | later_rq = cpu_rq(cpu); | ||
254 | double_lock_balance(rq, later_rq); | ||
255 | } | ||
256 | |||
257 | deactivate_task(rq, p, 0); | ||
258 | set_task_cpu(p, later_rq->cpu); | ||
259 | activate_task(later_rq, p, ENQUEUE_REPLENISH); | ||
260 | |||
261 | if (!fallback) | ||
262 | resched_curr(later_rq); | ||
263 | |||
264 | double_unlock_balance(rq, later_rq); | ||
265 | } | ||
266 | |||
221 | #else | 267 | #else |
222 | 268 | ||
223 | static inline | 269 | static inline |
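
dl_task_offline_migration() above pushes a throttled deadline task off a runqueue that has gone offline: it first tries a later_rq it can preempt, otherwise falls back to any CPU in the intersection of the active mask and the task's affinity, and only with admission control disabled to any active CPU at all. A small sketch of that fallback selection with plain bitmasks (illustrative, not the cpumask API):

#include <stdio.h>

/* Pick the lowest set bit; return -1 if the mask is empty. */
static int first_cpu(unsigned long mask)
{
        for (int cpu = 0; cpu < (int)(8 * sizeof(mask)); cpu++)
                if (mask & (1UL << cpu))
                        return cpu;
        return -1;
}

static int pick_fallback_cpu(unsigned long active, unsigned long allowed)
{
        int cpu = first_cpu(active & allowed);  /* prefer an allowed, online CPU */

        if (cpu < 0)
                cpu = first_cpu(active);        /* last resort: ignore affinity */
        return cpu;
}

int main(void)
{
        /* CPUs 1 and 2 online; the task was allowed only on CPU 0, which died. */
        printf("fallback cpu: %d\n", pick_fallback_cpu(0x6UL, 0x1UL));  /* -> 1 */
        return 0;
}
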
@@ -514,7 +560,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
514 | unsigned long flags; | 560 | unsigned long flags; |
515 | struct rq *rq; | 561 | struct rq *rq; |
516 | 562 | ||
517 | rq = task_rq_lock(current, &flags); | 563 | rq = task_rq_lock(p, &flags); |
518 | 564 | ||
519 | /* | 565 | /* |
520 | * We need to take care of several possible races here: | 566 | * We need to take care of several possible races here: |
@@ -536,6 +582,17 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
536 | sched_clock_tick(); | 582 | sched_clock_tick(); |
537 | update_rq_clock(rq); | 583 | update_rq_clock(rq); |
538 | 584 | ||
585 | #ifdef CONFIG_SMP | ||
586 | /* | ||
587 | * If we find that the rq the task was on is no longer | ||
588 | * available, we need to select a new rq. | ||
589 | */ | ||
590 | if (unlikely(!rq->online)) { | ||
591 | dl_task_offline_migration(rq, p); | ||
592 | goto unlock; | ||
593 | } | ||
594 | #endif | ||
595 | |||
539 | /* | 596 | /* |
540 | * If the throttle happened during sched-out; like: | 597 | * If the throttle happened during sched-out; like: |
541 | * | 598 | * |
@@ -569,7 +626,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
569 | push_dl_task(rq); | 626 | push_dl_task(rq); |
570 | #endif | 627 | #endif |
571 | unlock: | 628 | unlock: |
572 | task_rq_unlock(rq, current, &flags); | 629 | task_rq_unlock(rq, p, &flags); |
573 | 630 | ||
574 | return HRTIMER_NORESTART; | 631 | return HRTIMER_NORESTART; |
575 | } | 632 | } |
@@ -914,6 +971,12 @@ static void yield_task_dl(struct rq *rq) | |||
914 | } | 971 | } |
915 | update_rq_clock(rq); | 972 | update_rq_clock(rq); |
916 | update_curr_dl(rq); | 973 | update_curr_dl(rq); |
974 | /* | ||
975 | * Tell update_rq_clock() that we've just updated, | ||
976 | * so we don't do microscopic update in schedule() | ||
977 | * and double the fastpath cost. | ||
978 | */ | ||
979 | rq_clock_skip_update(rq, true); | ||
917 | } | 980 | } |
918 | 981 | ||
919 | #ifdef CONFIG_SMP | 982 | #ifdef CONFIG_SMP |
@@ -1659,14 +1722,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
1659 | { | 1722 | { |
1660 | int check_resched = 1; | 1723 | int check_resched = 1; |
1661 | 1724 | ||
1662 | /* | ||
1663 | * If p is throttled, don't consider the possibility | ||
1664 | * of preempting rq->curr, the check will be done right | ||
1665 | * after its runtime will get replenished. | ||
1666 | */ | ||
1667 | if (unlikely(p->dl.dl_throttled)) | ||
1668 | return; | ||
1669 | |||
1670 | if (task_on_rq_queued(p) && rq->curr != p) { | 1725 | if (task_on_rq_queued(p) && rq->curr != p) { |
1671 | #ifdef CONFIG_SMP | 1726 | #ifdef CONFIG_SMP |
1672 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && | 1727 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 8baaf858d25c..a245c1fc6f0a 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -71,7 +71,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
71 | if (!se) { | 71 | if (!se) { |
72 | struct sched_avg *avg = &cpu_rq(cpu)->avg; | 72 | struct sched_avg *avg = &cpu_rq(cpu)->avg; |
73 | P(avg->runnable_avg_sum); | 73 | P(avg->runnable_avg_sum); |
74 | P(avg->runnable_avg_period); | 74 | P(avg->avg_period); |
75 | return; | 75 | return; |
76 | } | 76 | } |
77 | 77 | ||
@@ -94,8 +94,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
94 | P(se->load.weight); | 94 | P(se->load.weight); |
95 | #ifdef CONFIG_SMP | 95 | #ifdef CONFIG_SMP |
96 | P(se->avg.runnable_avg_sum); | 96 | P(se->avg.runnable_avg_sum); |
97 | P(se->avg.runnable_avg_period); | 97 | P(se->avg.running_avg_sum); |
98 | P(se->avg.avg_period); | ||
98 | P(se->avg.load_avg_contrib); | 99 | P(se->avg.load_avg_contrib); |
100 | P(se->avg.utilization_avg_contrib); | ||
99 | P(se->avg.decay_count); | 101 | P(se->avg.decay_count); |
100 | #endif | 102 | #endif |
101 | #undef PN | 103 | #undef PN |
@@ -214,6 +216,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
214 | cfs_rq->runnable_load_avg); | 216 | cfs_rq->runnable_load_avg); |
215 | SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", | 217 | SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", |
216 | cfs_rq->blocked_load_avg); | 218 | cfs_rq->blocked_load_avg); |
219 | SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg", | ||
220 | cfs_rq->utilization_load_avg); | ||
217 | #ifdef CONFIG_FAIR_GROUP_SCHED | 221 | #ifdef CONFIG_FAIR_GROUP_SCHED |
218 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", | 222 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", |
219 | cfs_rq->tg_load_contrib); | 223 | cfs_rq->tg_load_contrib); |
@@ -636,8 +640,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
636 | P(se.load.weight); | 640 | P(se.load.weight); |
637 | #ifdef CONFIG_SMP | 641 | #ifdef CONFIG_SMP |
638 | P(se.avg.runnable_avg_sum); | 642 | P(se.avg.runnable_avg_sum); |
639 | P(se.avg.runnable_avg_period); | 643 | P(se.avg.running_avg_sum); |
644 | P(se.avg.avg_period); | ||
640 | P(se.avg.load_avg_contrib); | 645 | P(se.avg.load_avg_contrib); |
646 | P(se.avg.utilization_avg_contrib); | ||
641 | P(se.avg.decay_count); | 647 | P(se.avg.decay_count); |
642 | #endif | 648 | #endif |
643 | P(policy); | 649 | P(policy); |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bcfe32088b37..ffeaa4105e48 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -670,6 +670,7 @@ static int select_idle_sibling(struct task_struct *p, int cpu); | |||
670 | static unsigned long task_h_load(struct task_struct *p); | 670 | static unsigned long task_h_load(struct task_struct *p); |
671 | 671 | ||
672 | static inline void __update_task_entity_contrib(struct sched_entity *se); | 672 | static inline void __update_task_entity_contrib(struct sched_entity *se); |
673 | static inline void __update_task_entity_utilization(struct sched_entity *se); | ||
673 | 674 | ||
674 | /* Give new task start runnable values to heavy its load in infant time */ | 675 | /* Give new task start runnable values to heavy its load in infant time */ |
675 | void init_task_runnable_average(struct task_struct *p) | 676 | void init_task_runnable_average(struct task_struct *p) |
@@ -677,9 +678,10 @@ void init_task_runnable_average(struct task_struct *p) | |||
677 | u32 slice; | 678 | u32 slice; |
678 | 679 | ||
679 | slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; | 680 | slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; |
680 | p->se.avg.runnable_avg_sum = slice; | 681 | p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice; |
681 | p->se.avg.runnable_avg_period = slice; | 682 | p->se.avg.avg_period = slice; |
682 | __update_task_entity_contrib(&p->se); | 683 | __update_task_entity_contrib(&p->se); |
684 | __update_task_entity_utilization(&p->se); | ||
683 | } | 685 | } |
684 | #else | 686 | #else |
685 | void init_task_runnable_average(struct task_struct *p) | 687 | void init_task_runnable_average(struct task_struct *p) |
@@ -1196,9 +1198,11 @@ static void task_numa_assign(struct task_numa_env *env, | |||
1196 | static bool load_too_imbalanced(long src_load, long dst_load, | 1198 | static bool load_too_imbalanced(long src_load, long dst_load, |
1197 | struct task_numa_env *env) | 1199 | struct task_numa_env *env) |
1198 | { | 1200 | { |
1199 | long imb, old_imb; | ||
1200 | long orig_src_load, orig_dst_load; | ||
1201 | long src_capacity, dst_capacity; | 1201 | long src_capacity, dst_capacity; |
1202 | long orig_src_load; | ||
1203 | long load_a, load_b; | ||
1204 | long moved_load; | ||
1205 | long imb; | ||
1202 | 1206 | ||
1203 | /* | 1207 | /* |
1204 | * The load is corrected for the CPU capacity available on each node. | 1208 | * The load is corrected for the CPU capacity available on each node. |
@@ -1211,30 +1215,39 @@ static bool load_too_imbalanced(long src_load, long dst_load, | |||
1211 | dst_capacity = env->dst_stats.compute_capacity; | 1215 | dst_capacity = env->dst_stats.compute_capacity; |
1212 | 1216 | ||
1213 | /* We care about the slope of the imbalance, not the direction. */ | 1217 | /* We care about the slope of the imbalance, not the direction. */ |
1214 | if (dst_load < src_load) | 1218 | load_a = dst_load; |
1215 | swap(dst_load, src_load); | 1219 | load_b = src_load; |
1220 | if (load_a < load_b) | ||
1221 | swap(load_a, load_b); | ||
1216 | 1222 | ||
1217 | /* Is the difference below the threshold? */ | 1223 | /* Is the difference below the threshold? */ |
1218 | imb = dst_load * src_capacity * 100 - | 1224 | imb = load_a * src_capacity * 100 - |
1219 | src_load * dst_capacity * env->imbalance_pct; | 1225 | load_b * dst_capacity * env->imbalance_pct; |
1220 | if (imb <= 0) | 1226 | if (imb <= 0) |
1221 | return false; | 1227 | return false; |
1222 | 1228 | ||
1223 | /* | 1229 | /* |
1224 | * The imbalance is above the allowed threshold. | 1230 | * The imbalance is above the allowed threshold. |
1225 | * Compare it with the old imbalance. | 1231 | * Allow a move that brings us closer to a balanced situation, |
1232 | * without moving things past the point of balance. | ||
1226 | */ | 1233 | */ |
1227 | orig_src_load = env->src_stats.load; | 1234 | orig_src_load = env->src_stats.load; |
1228 | orig_dst_load = env->dst_stats.load; | ||
1229 | 1235 | ||
1230 | if (orig_dst_load < orig_src_load) | 1236 | /* |
1231 | swap(orig_dst_load, orig_src_load); | 1237 | * In a task swap, there will be one load moving from src to dst, |
1232 | 1238 | * and another moving back. This is the net sum of both moves. | |
1233 | old_imb = orig_dst_load * src_capacity * 100 - | 1239 | * A simple task move will always have a positive value. |
1234 | orig_src_load * dst_capacity * env->imbalance_pct; | 1240 | * Allow the move if it brings the system closer to a balanced |
1241 | * situation, without crossing over the balance point. | ||
1242 | */ | ||
1243 | moved_load = orig_src_load - src_load; | ||
1235 | 1244 | ||
1236 | /* Would this change make things worse? */ | 1245 | if (moved_load > 0) |
1237 | return (imb > old_imb); | 1246 | /* Moving src -> dst. Did we overshoot balance? */ |
1247 | return src_load * dst_capacity < dst_load * src_capacity; | ||
1248 | else | ||
1249 | /* Moving dst -> src. Did we overshoot balance? */ | ||
1250 | return dst_load * src_capacity < src_load * dst_capacity; | ||
1238 | } | 1251 | } |
1239 | 1252 | ||
1240 | /* | 1253 | /* |
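
The rewritten load_too_imbalanced() compares capacity-weighted loads: after ordering the two sides, the move is over the threshold when load_a * src_capacity * 100 exceeds load_b * dst_capacity * imbalance_pct, and a further check then rejects moves that overshoot the balance point. A standalone sketch of just the threshold part, with a worked example (variable names are mine):

#include <stdio.h>

/* "Too imbalanced" when the larger capacity-weighted load exceeds the
 * smaller one by more than the imbalance_pct slack (e.g. 125 = 25%). */
static int too_imbalanced(long src_load, long dst_load,
                          long src_capacity, long dst_capacity,
                          int imbalance_pct)
{
        long a = dst_load, b = src_load;

        if (a < b) { long t = a; a = b; b = t; }        /* slope only, not direction */

        return a * src_capacity * 100 > b * dst_capacity * imbalance_pct;
}

int main(void)
{
        /* Equal capacities: 1200 vs 1000 stays within 25%, 1300 vs 1000 does not. */
        printf("%d\n", too_imbalanced(1000, 1200, 1024, 1024, 125));    /* 0 */
        printf("%d\n", too_imbalanced(1000, 1300, 1024, 1024, 125));    /* 1 */
        return 0;
}
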
@@ -1675,7 +1688,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) | |||
1675 | *period = now - p->last_task_numa_placement; | 1688 | *period = now - p->last_task_numa_placement; |
1676 | } else { | 1689 | } else { |
1677 | delta = p->se.avg.runnable_avg_sum; | 1690 | delta = p->se.avg.runnable_avg_sum; |
1678 | *period = p->se.avg.runnable_avg_period; | 1691 | *period = p->se.avg.avg_period; |
1679 | } | 1692 | } |
1680 | 1693 | ||
1681 | p->last_sum_exec_runtime = runtime; | 1694 | p->last_sum_exec_runtime = runtime; |
@@ -1765,6 +1778,8 @@ static int preferred_group_nid(struct task_struct *p, int nid) | |||
1765 | } | 1778 | } |
1766 | } | 1779 | } |
1767 | /* Next round, evaluate the nodes within max_group. */ | 1780 | /* Next round, evaluate the nodes within max_group. */ |
1781 | if (!max_faults) | ||
1782 | break; | ||
1768 | nodes = max_group; | 1783 | nodes = max_group; |
1769 | } | 1784 | } |
1770 | return nid; | 1785 | return nid; |
@@ -2165,8 +2180,10 @@ void task_numa_work(struct callback_head *work) | |||
2165 | vma = mm->mmap; | 2180 | vma = mm->mmap; |
2166 | } | 2181 | } |
2167 | for (; vma; vma = vma->vm_next) { | 2182 | for (; vma; vma = vma->vm_next) { |
2168 | if (!vma_migratable(vma) || !vma_policy_mof(vma)) | 2183 | if (!vma_migratable(vma) || !vma_policy_mof(vma) || |
2184 | is_vm_hugetlb_page(vma)) { | ||
2169 | continue; | 2185 | continue; |
2186 | } | ||
2170 | 2187 | ||
2171 | /* | 2188 | /* |
2172 | * Shared library pages mapped by multiple processes are not | 2189 | * Shared library pages mapped by multiple processes are not |
@@ -2501,13 +2518,15 @@ static u32 __compute_runnable_contrib(u64 n) | |||
2501 | * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) | 2518 | * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) |
2502 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] | 2519 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] |
2503 | */ | 2520 | */ |
2504 | static __always_inline int __update_entity_runnable_avg(u64 now, | 2521 | static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, |
2505 | struct sched_avg *sa, | 2522 | struct sched_avg *sa, |
2506 | int runnable) | 2523 | int runnable, |
2524 | int running) | ||
2507 | { | 2525 | { |
2508 | u64 delta, periods; | 2526 | u64 delta, periods; |
2509 | u32 runnable_contrib; | 2527 | u32 runnable_contrib; |
2510 | int delta_w, decayed = 0; | 2528 | int delta_w, decayed = 0; |
2529 | unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); | ||
2511 | 2530 | ||
2512 | delta = now - sa->last_runnable_update; | 2531 | delta = now - sa->last_runnable_update; |
2513 | /* | 2532 | /* |
@@ -2529,7 +2548,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now, | |||
2529 | sa->last_runnable_update = now; | 2548 | sa->last_runnable_update = now; |
2530 | 2549 | ||
2531 | /* delta_w is the amount already accumulated against our next period */ | 2550 | /* delta_w is the amount already accumulated against our next period */ |
2532 | delta_w = sa->runnable_avg_period % 1024; | 2551 | delta_w = sa->avg_period % 1024; |
2533 | if (delta + delta_w >= 1024) { | 2552 | if (delta + delta_w >= 1024) { |
2534 | /* period roll-over */ | 2553 | /* period roll-over */ |
2535 | decayed = 1; | 2554 | decayed = 1; |
@@ -2542,7 +2561,10 @@ static __always_inline int __update_entity_runnable_avg(u64 now, | |||
2542 | delta_w = 1024 - delta_w; | 2561 | delta_w = 1024 - delta_w; |
2543 | if (runnable) | 2562 | if (runnable) |
2544 | sa->runnable_avg_sum += delta_w; | 2563 | sa->runnable_avg_sum += delta_w; |
2545 | sa->runnable_avg_period += delta_w; | 2564 | if (running) |
2565 | sa->running_avg_sum += delta_w * scale_freq | ||
2566 | >> SCHED_CAPACITY_SHIFT; | ||
2567 | sa->avg_period += delta_w; | ||
2546 | 2568 | ||
2547 | delta -= delta_w; | 2569 | delta -= delta_w; |
2548 | 2570 | ||
@@ -2552,20 +2574,28 @@ static __always_inline int __update_entity_runnable_avg(u64 now, | |||
2552 | 2574 | ||
2553 | sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, | 2575 | sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, |
2554 | periods + 1); | 2576 | periods + 1); |
2555 | sa->runnable_avg_period = decay_load(sa->runnable_avg_period, | 2577 | sa->running_avg_sum = decay_load(sa->running_avg_sum, |
2578 | periods + 1); | ||
2579 | sa->avg_period = decay_load(sa->avg_period, | ||
2556 | periods + 1); | 2580 | periods + 1); |
2557 | 2581 | ||
2558 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ | 2582 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ |
2559 | runnable_contrib = __compute_runnable_contrib(periods); | 2583 | runnable_contrib = __compute_runnable_contrib(periods); |
2560 | if (runnable) | 2584 | if (runnable) |
2561 | sa->runnable_avg_sum += runnable_contrib; | 2585 | sa->runnable_avg_sum += runnable_contrib; |
2562 | sa->runnable_avg_period += runnable_contrib; | 2586 | if (running) |
2587 | sa->running_avg_sum += runnable_contrib * scale_freq | ||
2588 | >> SCHED_CAPACITY_SHIFT; | ||
2589 | sa->avg_period += runnable_contrib; | ||
2563 | } | 2590 | } |
2564 | 2591 | ||
2565 | /* Remainder of delta accrued against u_0` */ | 2592 | /* Remainder of delta accrued against u_0` */ |
2566 | if (runnable) | 2593 | if (runnable) |
2567 | sa->runnable_avg_sum += delta; | 2594 | sa->runnable_avg_sum += delta; |
2568 | sa->runnable_avg_period += delta; | 2595 | if (running) |
2596 | sa->running_avg_sum += delta * scale_freq | ||
2597 | >> SCHED_CAPACITY_SHIFT; | ||
2598 | sa->avg_period += delta; | ||
2569 | 2599 | ||
2570 | return decayed; | 2600 | return decayed; |
2571 | } | 2601 | } |
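
The hunk above adds running_avg_sum next to runnable_avg_sum: it accumulates only the time the entity actually ran, scaled by the current frequency capacity (scale_freq >> SCHED_CAPACITY_SHIFT), and both sums decay geometrically with a factor y chosen so that y^32 = 1/2. A hedged floating-point sketch of that accumulation; the kernel splits deltas at 1024us period boundaries and uses fixed-point lookup tables rather than pow():

#include <stdio.h>
#include <math.h>

#define PERIOD_US 1024.0                /* one averaging period, ~1ms     */
#define CAP_SCALE 1024.0                /* SCHED_CAPACITY_SCALE           */

struct avg {
        double runnable_sum;            /* time runnable, decayed                   */
        double running_sum;             /* time running, freq-scaled and decayed    */
        double period_sum;              /* total elapsed time, decayed              */
};

/* y such that y^32 == 0.5, as encoded in the kernel's decay tables. */
static const double y = 0.97857206208770013;

static void update_avg(struct avg *a, double delta_us, int runnable,
                       int running, double scale_freq)
{
        double decay = pow(y, delta_us / PERIOD_US);

        /* decay the old sums, then accrue the new window */
        a->runnable_sum = a->runnable_sum * decay + (runnable ? delta_us : 0);
        a->running_sum  = a->running_sum  * decay +
                          (running ? delta_us * scale_freq / CAP_SCALE : 0);
        a->period_sum   = a->period_sum   * decay + delta_us;
}

int main(void)
{
        struct avg a = { 0, 0, 0 };

        /* 10ms running at full frequency, then 10ms idle at half frequency. */
        update_avg(&a, 10240, 1, 1, 1024);
        update_avg(&a, 10240, 0, 0, 512);
        printf("runnable/period = %.2f, running/period = %.2f\n",
               a.runnable_sum / a.period_sum, a.running_sum / a.period_sum);
        return 0;
}
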
@@ -2582,6 +2612,8 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se) | |||
2582 | return 0; | 2612 | return 0; |
2583 | 2613 | ||
2584 | se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); | 2614 | se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); |
2615 | se->avg.utilization_avg_contrib = | ||
2616 | decay_load(se->avg.utilization_avg_contrib, decays); | ||
2585 | 2617 | ||
2586 | return decays; | 2618 | return decays; |
2587 | } | 2619 | } |
@@ -2617,7 +2649,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa, | |||
2617 | 2649 | ||
2618 | /* The fraction of a cpu used by this cfs_rq */ | 2650 | /* The fraction of a cpu used by this cfs_rq */ |
2619 | contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, | 2651 | contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, |
2620 | sa->runnable_avg_period + 1); | 2652 | sa->avg_period + 1); |
2621 | contrib -= cfs_rq->tg_runnable_contrib; | 2653 | contrib -= cfs_rq->tg_runnable_contrib; |
2622 | 2654 | ||
2623 | if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { | 2655 | if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { |
@@ -2670,7 +2702,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) | |||
2670 | 2702 | ||
2671 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) | 2703 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) |
2672 | { | 2704 | { |
2673 | __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); | 2705 | __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg, |
2706 | runnable, runnable); | ||
2674 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); | 2707 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); |
2675 | } | 2708 | } |
2676 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 2709 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
@@ -2688,7 +2721,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se) | |||
2688 | 2721 | ||
2689 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ | 2722 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ |
2690 | contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); | 2723 | contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); |
2691 | contrib /= (se->avg.runnable_avg_period + 1); | 2724 | contrib /= (se->avg.avg_period + 1); |
2692 | se->avg.load_avg_contrib = scale_load(contrib); | 2725 | se->avg.load_avg_contrib = scale_load(contrib); |
2693 | } | 2726 | } |
2694 | 2727 | ||
@@ -2707,6 +2740,30 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se) | |||
2707 | return se->avg.load_avg_contrib - old_contrib; | 2740 | return se->avg.load_avg_contrib - old_contrib; |
2708 | } | 2741 | } |
2709 | 2742 | ||
2743 | |||
2744 | static inline void __update_task_entity_utilization(struct sched_entity *se) | ||
2745 | { | ||
2746 | u32 contrib; | ||
2747 | |||
2748 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ | ||
2749 | contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE); | ||
2750 | contrib /= (se->avg.avg_period + 1); | ||
2751 | se->avg.utilization_avg_contrib = scale_load(contrib); | ||
2752 | } | ||
2753 | |||
2754 | static long __update_entity_utilization_avg_contrib(struct sched_entity *se) | ||
2755 | { | ||
2756 | long old_contrib = se->avg.utilization_avg_contrib; | ||
2757 | |||
2758 | if (entity_is_task(se)) | ||
2759 | __update_task_entity_utilization(se); | ||
2760 | else | ||
2761 | se->avg.utilization_avg_contrib = | ||
2762 | group_cfs_rq(se)->utilization_load_avg; | ||
2763 | |||
2764 | return se->avg.utilization_avg_contrib - old_contrib; | ||
2765 | } | ||
2766 | |||
2710 | static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, | 2767 | static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, |
2711 | long load_contrib) | 2768 | long load_contrib) |
2712 | { | 2769 | { |
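
__update_task_entity_utilization() turns the frequency-scaled running sum into a contribution in load units: contrib = running_avg_sum * SCHED_LOAD_SCALE / (avg_period + 1), so a task running roughly half the time at full capacity contributes about 512. The arithmetic in isolation (the window length below is only indicative of the PELT maximum):

#include <stdio.h>
#include <stdint.h>

#define SCHED_LOAD_SCALE 1024UL

/* contrib = running_avg_sum * SCHED_LOAD_SCALE / (avg_period + 1) */
static unsigned long utilization_contrib(uint32_t running_avg_sum,
                                         uint32_t avg_period)
{
        return (unsigned long)running_avg_sum * SCHED_LOAD_SCALE /
               (avg_period + 1);
}

int main(void)
{
        /* running for half of a ~47ms averaging window -> about half the scale */
        printf("%lu\n", utilization_contrib(23500, 47000));     /* ~511 */
        return 0;
}
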
@@ -2723,7 +2780,8 @@ static inline void update_entity_load_avg(struct sched_entity *se, | |||
2723 | int update_cfs_rq) | 2780 | int update_cfs_rq) |
2724 | { | 2781 | { |
2725 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 2782 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
2726 | long contrib_delta; | 2783 | long contrib_delta, utilization_delta; |
2784 | int cpu = cpu_of(rq_of(cfs_rq)); | ||
2727 | u64 now; | 2785 | u64 now; |
2728 | 2786 | ||
2729 | /* | 2787 | /* |
@@ -2735,18 +2793,22 @@ static inline void update_entity_load_avg(struct sched_entity *se, | |||
2735 | else | 2793 | else |
2736 | now = cfs_rq_clock_task(group_cfs_rq(se)); | 2794 | now = cfs_rq_clock_task(group_cfs_rq(se)); |
2737 | 2795 | ||
2738 | if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) | 2796 | if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq, |
2797 | cfs_rq->curr == se)) | ||
2739 | return; | 2798 | return; |
2740 | 2799 | ||
2741 | contrib_delta = __update_entity_load_avg_contrib(se); | 2800 | contrib_delta = __update_entity_load_avg_contrib(se); |
2801 | utilization_delta = __update_entity_utilization_avg_contrib(se); | ||
2742 | 2802 | ||
2743 | if (!update_cfs_rq) | 2803 | if (!update_cfs_rq) |
2744 | return; | 2804 | return; |
2745 | 2805 | ||
2746 | if (se->on_rq) | 2806 | if (se->on_rq) { |
2747 | cfs_rq->runnable_load_avg += contrib_delta; | 2807 | cfs_rq->runnable_load_avg += contrib_delta; |
2748 | else | 2808 | cfs_rq->utilization_load_avg += utilization_delta; |
2809 | } else { | ||
2749 | subtract_blocked_load_contrib(cfs_rq, -contrib_delta); | 2810 | subtract_blocked_load_contrib(cfs_rq, -contrib_delta); |
2811 | } | ||
2750 | } | 2812 | } |
2751 | 2813 | ||
2752 | /* | 2814 | /* |
@@ -2821,6 +2883,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
2821 | } | 2883 | } |
2822 | 2884 | ||
2823 | cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; | 2885 | cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; |
2886 | cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib; | ||
2824 | /* we force update consideration on load-balancer moves */ | 2887 | /* we force update consideration on load-balancer moves */ |
2825 | update_cfs_rq_blocked_load(cfs_rq, !wakeup); | 2888 | update_cfs_rq_blocked_load(cfs_rq, !wakeup); |
2826 | } | 2889 | } |
@@ -2839,6 +2902,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
2839 | update_cfs_rq_blocked_load(cfs_rq, !sleep); | 2902 | update_cfs_rq_blocked_load(cfs_rq, !sleep); |
2840 | 2903 | ||
2841 | cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; | 2904 | cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; |
2905 | cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib; | ||
2842 | if (sleep) { | 2906 | if (sleep) { |
2843 | cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; | 2907 | cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; |
2844 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | 2908 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); |
@@ -3176,6 +3240,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
3176 | */ | 3240 | */ |
3177 | update_stats_wait_end(cfs_rq, se); | 3241 | update_stats_wait_end(cfs_rq, se); |
3178 | __dequeue_entity(cfs_rq, se); | 3242 | __dequeue_entity(cfs_rq, se); |
3243 | update_entity_load_avg(se, 1); | ||
3179 | } | 3244 | } |
3180 | 3245 | ||
3181 | update_stats_curr_start(cfs_rq, se); | 3246 | update_stats_curr_start(cfs_rq, se); |
@@ -4302,6 +4367,11 @@ static unsigned long capacity_of(int cpu) | |||
4302 | return cpu_rq(cpu)->cpu_capacity; | 4367 | return cpu_rq(cpu)->cpu_capacity; |
4303 | } | 4368 | } |
4304 | 4369 | ||
4370 | static unsigned long capacity_orig_of(int cpu) | ||
4371 | { | ||
4372 | return cpu_rq(cpu)->cpu_capacity_orig; | ||
4373 | } | ||
4374 | |||
4305 | static unsigned long cpu_avg_load_per_task(int cpu) | 4375 | static unsigned long cpu_avg_load_per_task(int cpu) |
4306 | { | 4376 | { |
4307 | struct rq *rq = cpu_rq(cpu); | 4377 | struct rq *rq = cpu_rq(cpu); |
@@ -4715,6 +4785,33 @@ next: | |||
4715 | done: | 4785 | done: |
4716 | return target; | 4786 | return target; |
4717 | } | 4787 | } |
4788 | /* | ||
4789 | * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS | ||
4790 | * tasks. The return value uses the same unit as capacity so we can | ||
4791 | * compare the usage with the capacity of the CPU that is available for CFS | ||
4792 | * tasks (i.e. cpu_capacity). | ||

4793 | * cfs.utilization_load_avg is the sum of running time of runnable tasks on a | ||
4794 | * CPU. It represents the amount of utilization of a CPU in the range | ||
4795 | * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full | ||
4796 | * capacity of the CPU because it's about the running time on this CPU. | ||
4797 | * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE | ||
4798 | * because of unfortunate rounding in avg_period and running_load_avg or just | ||
4799 | * after migrating tasks until the average stabilizes with the new running | ||
4800 | * time. So we need to check that the usage stays within the range | ||
4801 | * [0..cpu_capacity_orig] and cap if necessary. | ||
4802 | * Without capping the usage, a group could be seen as overloaded (CPU0 usage | ||
4803 | * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity | ||
4804 | */ | ||
4805 | static int get_cpu_usage(int cpu) | ||
4806 | { | ||
4807 | unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg; | ||
4808 | unsigned long capacity = capacity_orig_of(cpu); | ||
4809 | |||
4810 | if (usage >= SCHED_LOAD_SCALE) | ||
4811 | return capacity; | ||
4812 | |||
4813 | return (usage * capacity) >> SCHED_LOAD_SHIFT; | ||
4814 | } | ||
4718 | 4815 | ||
4719 | /* | 4816 | /* |
4720 | * select_task_rq_fair: Select target runqueue for the waking task in domains | 4817 | * select_task_rq_fair: Select target runqueue for the waking task in domains |
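
get_cpu_usage() above clamps the utilization signal so the reported usage never exceeds the CPU's original capacity, even when rounding or a fresh migration pushes utilization_load_avg past SCHED_LOAD_SCALE. The same clamping in isolation:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL
#define SCHED_LOAD_SHIFT 10

static unsigned long cpu_usage(unsigned long util, unsigned long capacity_orig)
{
        if (util >= SCHED_LOAD_SCALE)   /* transiently over 100%?        */
                return capacity_orig;   /* cap at the CPU's own capacity */

        return (util * capacity_orig) >> SCHED_LOAD_SHIFT;
}

int main(void)
{
        /* An SMT sibling with capacity 589: 50% utilization, then an overshoot. */
        printf("%lu %lu\n", cpu_usage(512, 589), cpu_usage(1240, 589));
        return 0;
}
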
@@ -5841,12 +5938,12 @@ struct sg_lb_stats { | |||
5841 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 5938 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ |
5842 | unsigned long load_per_task; | 5939 | unsigned long load_per_task; |
5843 | unsigned long group_capacity; | 5940 | unsigned long group_capacity; |
5941 | unsigned long group_usage; /* Total usage of the group */ | ||
5844 | unsigned int sum_nr_running; /* Nr tasks running in the group */ | 5942 | unsigned int sum_nr_running; /* Nr tasks running in the group */ |
5845 | unsigned int group_capacity_factor; | ||
5846 | unsigned int idle_cpus; | 5943 | unsigned int idle_cpus; |
5847 | unsigned int group_weight; | 5944 | unsigned int group_weight; |
5848 | enum group_type group_type; | 5945 | enum group_type group_type; |
5849 | int group_has_free_capacity; | 5946 | int group_no_capacity; |
5850 | #ifdef CONFIG_NUMA_BALANCING | 5947 | #ifdef CONFIG_NUMA_BALANCING |
5851 | unsigned int nr_numa_running; | 5948 | unsigned int nr_numa_running; |
5852 | unsigned int nr_preferred_running; | 5949 | unsigned int nr_preferred_running; |
@@ -5917,16 +6014,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, | |||
5917 | return load_idx; | 6014 | return load_idx; |
5918 | } | 6015 | } |
5919 | 6016 | ||
5920 | static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) | ||
5921 | { | ||
5922 | return SCHED_CAPACITY_SCALE; | ||
5923 | } | ||
5924 | |||
5925 | unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) | ||
5926 | { | ||
5927 | return default_scale_capacity(sd, cpu); | ||
5928 | } | ||
5929 | |||
5930 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) | 6017 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) |
5931 | { | 6018 | { |
5932 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) | 6019 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) |
@@ -5943,7 +6030,7 @@ unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) | |||
5943 | static unsigned long scale_rt_capacity(int cpu) | 6030 | static unsigned long scale_rt_capacity(int cpu) |
5944 | { | 6031 | { |
5945 | struct rq *rq = cpu_rq(cpu); | 6032 | struct rq *rq = cpu_rq(cpu); |
5946 | u64 total, available, age_stamp, avg; | 6033 | u64 total, used, age_stamp, avg; |
5947 | s64 delta; | 6034 | s64 delta; |
5948 | 6035 | ||
5949 | /* | 6036 | /* |
@@ -5959,19 +6046,12 @@ static unsigned long scale_rt_capacity(int cpu) | |||
5959 | 6046 | ||
5960 | total = sched_avg_period() + delta; | 6047 | total = sched_avg_period() + delta; |
5961 | 6048 | ||
5962 | if (unlikely(total < avg)) { | 6049 | used = div_u64(avg, total); |
5963 | /* Ensures that capacity won't end up being negative */ | ||
5964 | available = 0; | ||
5965 | } else { | ||
5966 | available = total - avg; | ||
5967 | } | ||
5968 | |||
5969 | if (unlikely((s64)total < SCHED_CAPACITY_SCALE)) | ||
5970 | total = SCHED_CAPACITY_SCALE; | ||
5971 | 6050 | ||
5972 | total >>= SCHED_CAPACITY_SHIFT; | 6051 | if (likely(used < SCHED_CAPACITY_SCALE)) |
6052 | return SCHED_CAPACITY_SCALE - used; | ||
5973 | 6053 | ||
5974 | return div_u64(available, total); | 6054 | return 1; |
5975 | } | 6055 | } |
5976 | 6056 | ||
5977 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) | 6057 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) |
@@ -5986,14 +6066,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) | |||
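
scale_rt_capacity() is reduced above to: used = avg / total, and the capacity left for CFS is SCHED_CAPACITY_SCALE - used, floored at 1 so later divisions cannot hit zero. A short sketch of the new formula, assuming (as sched_rt_avg_update() appears to do) that rt_avg is accumulated pre-scaled by the capacity unit:

#include <stdio.h>
#include <stdint.h>

#define SCHED_CAPACITY_SCALE 1024ULL

/* rt_avg carries the capacity scale, so rt_avg / period is directly
 * "capacity units consumed by RT/IRQ work" in the range 0..1024. */
static unsigned long scale_rt_capacity(uint64_t rt_avg, uint64_t period)
{
        uint64_t used = rt_avg / period;

        if (used < SCHED_CAPACITY_SCALE)
                return SCHED_CAPACITY_SCALE - used;

        return 1;       /* never return 0: callers divide by this */
}

int main(void)
{
        uint64_t period = 1000000;      /* arbitrary window for the sketch */

        /* 25% of the window spent in RT/IRQ leaves 768/1024 for CFS. */
        printf("%lu\n", scale_rt_capacity(period / 4 * SCHED_CAPACITY_SCALE, period));
        return 0;
}
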
5986 | 6066 | ||
5987 | capacity >>= SCHED_CAPACITY_SHIFT; | 6067 | capacity >>= SCHED_CAPACITY_SHIFT; |
5988 | 6068 | ||
5989 | sdg->sgc->capacity_orig = capacity; | 6069 | cpu_rq(cpu)->cpu_capacity_orig = capacity; |
5990 | |||
5991 | if (sched_feat(ARCH_CAPACITY)) | ||
5992 | capacity *= arch_scale_freq_capacity(sd, cpu); | ||
5993 | else | ||
5994 | capacity *= default_scale_capacity(sd, cpu); | ||
5995 | |||
5996 | capacity >>= SCHED_CAPACITY_SHIFT; | ||
5997 | 6070 | ||
5998 | capacity *= scale_rt_capacity(cpu); | 6071 | capacity *= scale_rt_capacity(cpu); |
5999 | capacity >>= SCHED_CAPACITY_SHIFT; | 6072 | capacity >>= SCHED_CAPACITY_SHIFT; |
@@ -6009,7 +6082,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
6009 | { | 6082 | { |
6010 | struct sched_domain *child = sd->child; | 6083 | struct sched_domain *child = sd->child; |
6011 | struct sched_group *group, *sdg = sd->groups; | 6084 | struct sched_group *group, *sdg = sd->groups; |
6012 | unsigned long capacity, capacity_orig; | 6085 | unsigned long capacity; |
6013 | unsigned long interval; | 6086 | unsigned long interval; |
6014 | 6087 | ||
6015 | interval = msecs_to_jiffies(sd->balance_interval); | 6088 | interval = msecs_to_jiffies(sd->balance_interval); |
@@ -6021,7 +6094,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
6021 | return; | 6094 | return; |
6022 | } | 6095 | } |
6023 | 6096 | ||
6024 | capacity_orig = capacity = 0; | 6097 | capacity = 0; |
6025 | 6098 | ||
6026 | if (child->flags & SD_OVERLAP) { | 6099 | if (child->flags & SD_OVERLAP) { |
6027 | /* | 6100 | /* |
@@ -6041,19 +6114,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
6041 | * Use capacity_of(), which is set irrespective of domains | 6114 | * Use capacity_of(), which is set irrespective of domains |
6042 | * in update_cpu_capacity(). | 6115 | * in update_cpu_capacity(). |
6043 | * | 6116 | * |
6044 | * This avoids capacity/capacity_orig from being 0 and | 6117 | * This avoids capacity from being 0 and |
6045 | * causing divide-by-zero issues on boot. | 6118 | * causing divide-by-zero issues on boot. |
6046 | * | ||
6047 | * Runtime updates will correct capacity_orig. | ||
6048 | */ | 6119 | */ |
6049 | if (unlikely(!rq->sd)) { | 6120 | if (unlikely(!rq->sd)) { |
6050 | capacity_orig += capacity_of(cpu); | ||
6051 | capacity += capacity_of(cpu); | 6121 | capacity += capacity_of(cpu); |
6052 | continue; | 6122 | continue; |
6053 | } | 6123 | } |
6054 | 6124 | ||
6055 | sgc = rq->sd->groups->sgc; | 6125 | sgc = rq->sd->groups->sgc; |
6056 | capacity_orig += sgc->capacity_orig; | ||
6057 | capacity += sgc->capacity; | 6126 | capacity += sgc->capacity; |
6058 | } | 6127 | } |
6059 | } else { | 6128 | } else { |
@@ -6064,39 +6133,24 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
6064 | 6133 | ||
6065 | group = child->groups; | 6134 | group = child->groups; |
6066 | do { | 6135 | do { |
6067 | capacity_orig += group->sgc->capacity_orig; | ||
6068 | capacity += group->sgc->capacity; | 6136 | capacity += group->sgc->capacity; |
6069 | group = group->next; | 6137 | group = group->next; |
6070 | } while (group != child->groups); | 6138 | } while (group != child->groups); |
6071 | } | 6139 | } |
6072 | 6140 | ||
6073 | sdg->sgc->capacity_orig = capacity_orig; | ||
6074 | sdg->sgc->capacity = capacity; | 6141 | sdg->sgc->capacity = capacity; |
6075 | } | 6142 | } |
6076 | 6143 | ||
6077 | /* | 6144 | /* |
6078 | * Try and fix up capacity for tiny siblings, this is needed when | 6145 | * Check whether the capacity of the rq has been noticeably reduced by side |
6079 | * things like SD_ASYM_PACKING need f_b_g to select another sibling | 6146 | * activity. The imbalance_pct is used for the threshold. |
6080 | * which on its own isn't powerful enough. | 6147 | * Return true if the capacity is reduced.
6081 | * | ||
6082 | * See update_sd_pick_busiest() and check_asym_packing(). | ||
6083 | */ | 6148 | */ |
6084 | static inline int | 6149 | static inline int |
6085 | fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | 6150 | check_cpu_capacity(struct rq *rq, struct sched_domain *sd) |
6086 | { | 6151 | { |
6087 | /* | 6152 | return ((rq->cpu_capacity * sd->imbalance_pct) < |
6088 | * Only siblings can have significantly less than SCHED_CAPACITY_SCALE | 6153 | (rq->cpu_capacity_orig * 100)); |
6089 | */ | ||
6090 | if (!(sd->flags & SD_SHARE_CPUCAPACITY)) | ||
6091 | return 0; | ||
6092 | |||
6093 | /* | ||
6094 | * If ~90% of the cpu_capacity is still there, we're good. | ||
6095 | */ | ||
6096 | if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29) | ||
6097 | return 1; | ||
6098 | |||
6099 | return 0; | ||
6100 | } | 6154 | } |
6101 | 6155 | ||
6102 | /* | 6156 | /* |
@@ -6134,37 +6188,56 @@ static inline int sg_imbalanced(struct sched_group *group) | |||
6134 | } | 6188 | } |
6135 | 6189 | ||
6136 | /* | 6190 | /* |
6137 | * Compute the group capacity factor. | 6191 | * group_has_capacity returns true if the group has spare capacity that could |
6138 | * | 6192 | * be used by some tasks. |
6139 | * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by | 6193 | * We consider that a group has spare capacity if the number of tasks is
6140 | * first dividing out the smt factor and computing the actual number of cores | 6194 | * smaller than the number of CPUs or if the usage is lower than the available |
6141 | * and limit unit capacity with that. | 6195 | * capacity for CFS tasks. |
6196 | * For the latter, we use a threshold to stabilize the state, to take into | ||
6197 | * account the variance of the tasks' load and to return true if the available | ||
6198 | * capacity in meaningful for the load balancer. | ||
6199 | * As an example, an available capacity of 1% can appear but it doesn't make | ||
6200 | * any benefit for the load balance. | ||
6142 | */ | 6201 | */ |
6143 | static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) | 6202 | static inline bool |
6203 | group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) | ||
6144 | { | 6204 | { |
6145 | unsigned int capacity_factor, smt, cpus; | 6205 | if (sgs->sum_nr_running < sgs->group_weight) |
6146 | unsigned int capacity, capacity_orig; | 6206 | return true; |
6147 | 6207 | ||
6148 | capacity = group->sgc->capacity; | 6208 | if ((sgs->group_capacity * 100) > |
6149 | capacity_orig = group->sgc->capacity_orig; | 6209 | (sgs->group_usage * env->sd->imbalance_pct)) |
6150 | cpus = group->group_weight; | 6210 | return true; |
6211 | |||
6212 | return false; | ||
6213 | } | ||
6151 | 6214 | ||
6152 | /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ | 6215 | /* |
6153 | smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); | 6216 | * group_is_overloaded returns true if the group has more tasks than it can |
6154 | capacity_factor = cpus / smt; /* cores */ | 6217 | * handle. |
6218 | * group_is_overloaded is not equals to !group_has_capacity because a group | ||
6219 | * with the exact right number of tasks, has no more spare capacity but is not | ||
6220 | * overloaded so both group_has_capacity and group_is_overloaded return | ||
6221 | * false. | ||
6222 | */ | ||
6223 | static inline bool | ||
6224 | group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) | ||
6225 | { | ||
6226 | if (sgs->sum_nr_running <= sgs->group_weight) | ||
6227 | return false; | ||
6155 | 6228 | ||
6156 | capacity_factor = min_t(unsigned, | 6229 | if ((sgs->group_capacity * 100) < |
6157 | capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); | 6230 | (sgs->group_usage * env->sd->imbalance_pct)) |
6158 | if (!capacity_factor) | 6231 | return true; |
6159 | capacity_factor = fix_small_capacity(env->sd, group); | ||
6160 | 6232 | ||
6161 | return capacity_factor; | 6233 | return false; |
6162 | } | 6234 | } |
6163 | 6235 | ||
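To make the "exact fit" remark above concrete, here is a minimal userspace rendering of the two predicates, keeping only the fields they inspect; the group size, capacity, usage and imbalance_pct are invented. With four tasks on four CPUs running right at the available capacity, both predicates return false, exactly as the comment describes.

#include <stdio.h>
#include <stdbool.h>

struct sg_stats {			/* only what the two checks look at */
	unsigned long sum_nr_running;
	unsigned long group_weight;
	unsigned long group_capacity;
	unsigned long group_usage;
};

static bool group_has_capacity(const struct sg_stats *s, unsigned int imbalance_pct)
{
	if (s->sum_nr_running < s->group_weight)
		return true;
	if (s->group_capacity * 100 > s->group_usage * imbalance_pct)
		return true;
	return false;
}

static bool group_is_overloaded(const struct sg_stats *s, unsigned int imbalance_pct)
{
	if (s->sum_nr_running <= s->group_weight)
		return false;
	if (s->group_capacity * 100 < s->group_usage * imbalance_pct)
		return true;
	return false;
}

int main(void)
{
	/* 4 CPUs, 4 busy tasks, usage right at capacity: the "exact fit" case. */
	struct sg_stats s = { .sum_nr_running = 4, .group_weight = 4,
			      .group_capacity = 4096, .group_usage = 4096 };

	printf("has_capacity=%d overloaded=%d\n",
	       group_has_capacity(&s, 125), group_is_overloaded(&s, 125));
	return 0;
}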
6164 | static enum group_type | 6236 | static enum group_type group_classify(struct lb_env *env, |
6165 | group_classify(struct sched_group *group, struct sg_lb_stats *sgs) | 6237 | struct sched_group *group, |
6238 | struct sg_lb_stats *sgs) | ||
6166 | { | 6239 | { |
6167 | if (sgs->sum_nr_running > sgs->group_capacity_factor) | 6240 | if (sgs->group_no_capacity) |
6168 | return group_overloaded; | 6241 | return group_overloaded; |
6169 | 6242 | ||
6170 | if (sg_imbalanced(group)) | 6243 | if (sg_imbalanced(group)) |
@@ -6202,6 +6275,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
6202 | load = source_load(i, load_idx); | 6275 | load = source_load(i, load_idx); |
6203 | 6276 | ||
6204 | sgs->group_load += load; | 6277 | sgs->group_load += load; |
6278 | sgs->group_usage += get_cpu_usage(i); | ||
6205 | sgs->sum_nr_running += rq->cfs.h_nr_running; | 6279 | sgs->sum_nr_running += rq->cfs.h_nr_running; |
6206 | 6280 | ||
6207 | if (rq->nr_running > 1) | 6281 | if (rq->nr_running > 1) |
@@ -6224,11 +6298,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
6224 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 6298 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
6225 | 6299 | ||
6226 | sgs->group_weight = group->group_weight; | 6300 | sgs->group_weight = group->group_weight; |
6227 | sgs->group_capacity_factor = sg_capacity_factor(env, group); | ||
6228 | sgs->group_type = group_classify(group, sgs); | ||
6229 | 6301 | ||
6230 | if (sgs->group_capacity_factor > sgs->sum_nr_running) | 6302 | sgs->group_no_capacity = group_is_overloaded(env, sgs); |
6231 | sgs->group_has_free_capacity = 1; | 6303 | sgs->group_type = group_classify(env, group, sgs); |
6232 | } | 6304 | } |
6233 | 6305 | ||
6234 | /** | 6306 | /** |
@@ -6350,18 +6422,19 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
6350 | 6422 | ||
6351 | /* | 6423 | /* |
6352 | * In case the child domain prefers tasks go to siblings | 6424 | * In case the child domain prefers tasks go to siblings |
6353 | * first, lower the sg capacity factor to one so that we'll try | 6425 | * first, lower the sg capacity so that we'll try |
6354 | * and move all the excess tasks away. We lower the capacity | 6426 | * and move all the excess tasks away. We lower the capacity |
6355 | * of a group only if the local group has the capacity to fit | 6427 | * of a group only if the local group has the capacity to fit |
6356 | * these excess tasks, i.e. nr_running < group_capacity_factor. The | 6428 | * these excess tasks. The extra check prevents the case where |
6357 | * extra check prevents the case where you always pull from the | 6429 | * you always pull from the heaviest group when it is already |
6358 | * heaviest group when it is already under-utilized (possible | 6430 | * under-utilized (possible when a large weight task outweighs 
6359 | * when a large weight task outweighs the tasks on the system). | 6431 | * the tasks on the system). 
6360 | */ | 6432 | */ |
6361 | if (prefer_sibling && sds->local && | 6433 | if (prefer_sibling && sds->local && |
6362 | sds->local_stat.group_has_free_capacity) { | 6434 | group_has_capacity(env, &sds->local_stat) && |
6363 | sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); | 6435 | (sgs->sum_nr_running > 1)) { |
6364 | sgs->group_type = group_classify(sg, sgs); | 6436 | sgs->group_no_capacity = 1; |
6437 | sgs->group_type = group_overloaded; | ||
6365 | } | 6438 | } |
6366 | 6439 | ||
6367 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { | 6440 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { |
@@ -6541,11 +6614,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
6541 | */ | 6614 | */ |
6542 | if (busiest->group_type == group_overloaded && | 6615 | if (busiest->group_type == group_overloaded && |
6543 | local->group_type == group_overloaded) { | 6616 | local->group_type == group_overloaded) { |
6544 | load_above_capacity = | 6617 | load_above_capacity = busiest->sum_nr_running * |
6545 | (busiest->sum_nr_running - busiest->group_capacity_factor); | 6618 | SCHED_LOAD_SCALE; |
6546 | 6619 | if (load_above_capacity > busiest->group_capacity) | |
6547 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); | 6620 | load_above_capacity -= busiest->group_capacity; |
6548 | load_above_capacity /= busiest->group_capacity; | 6621 | else |
6622 | load_above_capacity = ~0UL; | ||
6549 | } | 6623 | } |
6550 | 6624 | ||
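A small worked example of the new load_above_capacity arithmetic; SCHED_LOAD_SCALE is 1024 on common configurations and the task count and group capacity below are invented. Each excess task counts as one full SCHED_LOAD_SCALE of load, and the value saturates to ~0UL when there is no excess, so that a later min() against this term is effectively a no-op.

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL		/* typical value, configuration dependent */

static unsigned long load_above_capacity(unsigned long sum_nr_running,
					 unsigned long group_capacity)
{
	unsigned long load = sum_nr_running * SCHED_LOAD_SCALE;

	if (load > group_capacity)
		return load - group_capacity;

	/* No excess: saturate so this term never limits the imbalance. */
	return ~0UL;
}

int main(void)
{
	/* 4 runnable tasks on a group with about 2.5 CPUs of capacity left. */
	printf("%lu\n", load_above_capacity(4, 2560));	/* 4*1024 - 2560 = 1536 */
	return 0;
}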
6551 | /* | 6625 | /* |
@@ -6608,6 +6682,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
6608 | local = &sds.local_stat; | 6682 | local = &sds.local_stat; |
6609 | busiest = &sds.busiest_stat; | 6683 | busiest = &sds.busiest_stat; |
6610 | 6684 | ||
6685 | /* ASYM feature bypasses nice load balance check */ | ||
6611 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && | 6686 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && |
6612 | check_asym_packing(env, &sds)) | 6687 | check_asym_packing(env, &sds)) |
6613 | return sds.busiest; | 6688 | return sds.busiest; |
@@ -6628,8 +6703,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
6628 | goto force_balance; | 6703 | goto force_balance; |
6629 | 6704 | ||
6630 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 6705 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ |
6631 | if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && | 6706 | if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && |
6632 | !busiest->group_has_free_capacity) | 6707 | busiest->group_no_capacity) |
6633 | goto force_balance; | 6708 | goto force_balance; |
6634 | 6709 | ||
6635 | /* | 6710 | /* |
@@ -6688,7 +6763,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
6688 | int i; | 6763 | int i; |
6689 | 6764 | ||
6690 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 6765 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
6691 | unsigned long capacity, capacity_factor, wl; | 6766 | unsigned long capacity, wl; |
6692 | enum fbq_type rt; | 6767 | enum fbq_type rt; |
6693 | 6768 | ||
6694 | rq = cpu_rq(i); | 6769 | rq = cpu_rq(i); |
@@ -6717,9 +6792,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
6717 | continue; | 6792 | continue; |
6718 | 6793 | ||
6719 | capacity = capacity_of(i); | 6794 | capacity = capacity_of(i); |
6720 | capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); | ||
6721 | if (!capacity_factor) | ||
6722 | capacity_factor = fix_small_capacity(env->sd, group); | ||
6723 | 6795 | ||
6724 | wl = weighted_cpuload(i); | 6796 | wl = weighted_cpuload(i); |
6725 | 6797 | ||
@@ -6727,7 +6799,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
6727 | * When comparing with imbalance, use weighted_cpuload() | 6799 | * When comparing with imbalance, use weighted_cpuload() |
6728 | * which is not scaled with the cpu capacity. | 6800 | * which is not scaled with the cpu capacity. |
6729 | */ | 6801 | */ |
6730 | if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) | 6802 | |
6803 | if (rq->nr_running == 1 && wl > env->imbalance && | ||
6804 | !check_cpu_capacity(rq, env->sd)) | ||
6731 | continue; | 6805 | continue; |
6732 | 6806 | ||
6733 | /* | 6807 | /* |
@@ -6775,6 +6849,19 @@ static int need_active_balance(struct lb_env *env) | |||
6775 | return 1; | 6849 | return 1; |
6776 | } | 6850 | } |
6777 | 6851 | ||
6852 | /* | ||
6853 | * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. | ||
6854 | * It's worth migrating the task if the src_cpu's capacity is reduced | ||
6855 | * because of other sched_class or IRQs if more capacity stays | ||
6856 | * available on dst_cpu. | ||
6857 | */ | ||
6858 | if ((env->idle != CPU_NOT_IDLE) && | ||
6859 | (env->src_rq->cfs.h_nr_running == 1)) { | ||
6860 | if ((check_cpu_capacity(env->src_rq, sd)) && | ||
6861 | (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) | ||
6862 | return 1; | ||
6863 | } | ||
6864 | |||
6778 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); | 6865 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); |
6779 | } | 6866 | } |
6780 | 6867 | ||
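The new condition reads naturally as a standalone predicate. In the sketch below, the helper name worth_active_balance() and all the numbers are invented (imbalance_pct of 125, a source CPU that lost roughly 40% of its capacity to an RT hog, an untouched destination CPU):

#include <stdio.h>

static int check_cpu_capacity(unsigned long cap, unsigned long cap_orig,
			      unsigned int imbalance_pct)
{
	return (cap * imbalance_pct) < (cap_orig * 100);
}

/* dst is idle and src runs a single CFS task: migrate it if src has lost a
 * significant part of its capacity to RT/IRQ and dst clearly has more left. */
static int worth_active_balance(unsigned long src_cap, unsigned long src_cap_orig,
				unsigned long dst_cap, unsigned int imbalance_pct)
{
	return check_cpu_capacity(src_cap, src_cap_orig, imbalance_pct) &&
	       (src_cap * imbalance_pct < dst_cap * 100);
}

int main(void)
{
	/* src at 600/1024 because of an RT hog, dst untouched at 1024. */
	printf("%d\n", worth_active_balance(600, 1024, 1024, 125));	/* 1 */
	return 0;
}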
@@ -6874,6 +6961,9 @@ redo: | |||
6874 | 6961 | ||
6875 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); | 6962 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); |
6876 | 6963 | ||
6964 | env.src_cpu = busiest->cpu; | ||
6965 | env.src_rq = busiest; | ||
6966 | |||
6877 | ld_moved = 0; | 6967 | ld_moved = 0; |
6878 | if (busiest->nr_running > 1) { | 6968 | if (busiest->nr_running > 1) { |
6879 | /* | 6969 | /* |
@@ -6883,8 +6973,6 @@ redo: | |||
6883 | * correctly treated as an imbalance. | 6973 | * correctly treated as an imbalance. |
6884 | */ | 6974 | */ |
6885 | env.flags |= LBF_ALL_PINNED; | 6975 | env.flags |= LBF_ALL_PINNED; |
6886 | env.src_cpu = busiest->cpu; | ||
6887 | env.src_rq = busiest; | ||
6888 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); | 6976 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); |
6889 | 6977 | ||
6890 | more_balance: | 6978 | more_balance: |
@@ -7584,22 +7672,25 @@ end: | |||
7584 | 7672 | ||
7585 | /* | 7673 | /* |
7586 | * Current heuristic for kicking the idle load balancer in the presence | 7674 | * Current heuristic for kicking the idle load balancer in the presence |
7587 | * of an idle cpu is the system. | 7675 | * of an idle cpu in the system. |
7588 | * - This rq has more than one task. | 7676 | * - This rq has more than one task. |
7589 | * - At any scheduler domain level, this cpu's scheduler group has multiple | 7677 | * - This rq has at least one CFS task and the capacity of the CPU is |
7590 | * busy cpu's exceeding the group's capacity. | 7678 | * significantly reduced because of RT tasks or IRQs. |
7679 | * - At the parent of the LLC scheduler domain level, this cpu's scheduler | ||
7680 | * group has multiple busy cpus. | ||
7591 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler | 7681 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler |
7592 | * domain span are idle. | 7682 | * domain span are idle. |
7593 | */ | 7683 | */ |
7594 | static inline int nohz_kick_needed(struct rq *rq) | 7684 | static inline bool nohz_kick_needed(struct rq *rq) |
7595 | { | 7685 | { |
7596 | unsigned long now = jiffies; | 7686 | unsigned long now = jiffies; |
7597 | struct sched_domain *sd; | 7687 | struct sched_domain *sd; |
7598 | struct sched_group_capacity *sgc; | 7688 | struct sched_group_capacity *sgc; |
7599 | int nr_busy, cpu = rq->cpu; | 7689 | int nr_busy, cpu = rq->cpu; |
7690 | bool kick = false; | ||
7600 | 7691 | ||
7601 | if (unlikely(rq->idle_balance)) | 7692 | if (unlikely(rq->idle_balance)) |
7602 | return 0; | 7693 | return false; |
7603 | 7694 | ||
7604 | /* | 7695 | /* |
7605 | * We may be recently in ticked or tickless idle mode. At the first | 7696 | * We may be recently in ticked or tickless idle mode. At the first |
@@ -7613,38 +7704,46 @@ static inline int nohz_kick_needed(struct rq *rq) | |||
7613 | * balancing. | 7704 | * balancing. |
7614 | */ | 7705 | */ |
7615 | if (likely(!atomic_read(&nohz.nr_cpus))) | 7706 | if (likely(!atomic_read(&nohz.nr_cpus))) |
7616 | return 0; | 7707 | return false; |
7617 | 7708 | ||
7618 | if (time_before(now, nohz.next_balance)) | 7709 | if (time_before(now, nohz.next_balance)) |
7619 | return 0; | 7710 | return false; |
7620 | 7711 | ||
7621 | if (rq->nr_running >= 2) | 7712 | if (rq->nr_running >= 2) |
7622 | goto need_kick; | 7713 | return true; |
7623 | 7714 | ||
7624 | rcu_read_lock(); | 7715 | rcu_read_lock(); |
7625 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 7716 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); |
7626 | |||
7627 | if (sd) { | 7717 | if (sd) { |
7628 | sgc = sd->groups->sgc; | 7718 | sgc = sd->groups->sgc; |
7629 | nr_busy = atomic_read(&sgc->nr_busy_cpus); | 7719 | nr_busy = atomic_read(&sgc->nr_busy_cpus); |
7630 | 7720 | ||
7631 | if (nr_busy > 1) | 7721 | if (nr_busy > 1) { |
7632 | goto need_kick_unlock; | 7722 | kick = true; |
7723 | goto unlock; | ||
7724 | } | ||
7725 | |||
7633 | } | 7726 | } |
7634 | 7727 | ||
7635 | sd = rcu_dereference(per_cpu(sd_asym, cpu)); | 7728 | sd = rcu_dereference(rq->sd); |
7729 | if (sd) { | ||
7730 | if ((rq->cfs.h_nr_running >= 1) && | ||
7731 | check_cpu_capacity(rq, sd)) { | ||
7732 | kick = true; | ||
7733 | goto unlock; | ||
7734 | } | ||
7735 | } | ||
7636 | 7736 | ||
7737 | sd = rcu_dereference(per_cpu(sd_asym, cpu)); | ||
7637 | if (sd && (cpumask_first_and(nohz.idle_cpus_mask, | 7738 | if (sd && (cpumask_first_and(nohz.idle_cpus_mask, |
7638 | sched_domain_span(sd)) < cpu)) | 7739 | sched_domain_span(sd)) < cpu)) { |
7639 | goto need_kick_unlock; | 7740 | kick = true; |
7640 | 7741 | goto unlock; | |
7641 | rcu_read_unlock(); | 7742 | } |
7642 | return 0; | ||
7643 | 7743 | ||
7644 | need_kick_unlock: | 7744 | unlock: |
7645 | rcu_read_unlock(); | 7745 | rcu_read_unlock(); |
7646 | need_kick: | 7746 | return kick; |
7647 | return 1; | ||
7648 | } | 7747 | } |
7649 | #else | 7748 | #else |
7650 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } | 7749 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } |
@@ -7660,14 +7759,16 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
7660 | enum cpu_idle_type idle = this_rq->idle_balance ? | 7759 | enum cpu_idle_type idle = this_rq->idle_balance ? |
7661 | CPU_IDLE : CPU_NOT_IDLE; | 7760 | CPU_IDLE : CPU_NOT_IDLE; |
7662 | 7761 | ||
7663 | rebalance_domains(this_rq, idle); | ||
7664 | |||
7665 | /* | 7762 | /* |
7666 | * If this cpu has a pending nohz_balance_kick, then do the | 7763 | * If this cpu has a pending nohz_balance_kick, then do the |
7667 | * balancing on behalf of the other idle cpus whose ticks are | 7764 | * balancing on behalf of the other idle cpus whose ticks are |
7668 | * stopped. | 7765 | * stopped. Do nohz_idle_balance *before* rebalance_domains to |
7766 | * give the idle cpus a chance to load balance. Else we may | ||
7767 | * load balance only within the local sched_domain hierarchy | ||
7768 | * and abort nohz_idle_balance altogether if we pull some load. | ||
7669 | */ | 7769 | */ |
7670 | nohz_idle_balance(this_rq, idle); | 7770 | nohz_idle_balance(this_rq, idle); |
7771 | rebalance_domains(this_rq, idle); | ||
7671 | } | 7772 | } |
7672 | 7773 | ||
7673 | /* | 7774 | /* |
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 90284d117fe6..91e33cd485f6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
@@ -56,6 +56,19 @@ SCHED_FEAT(NONTASK_CAPACITY, true) | |||
56 | */ | 56 | */ |
57 | SCHED_FEAT(TTWU_QUEUE, true) | 57 | SCHED_FEAT(TTWU_QUEUE, true) |
58 | 58 | ||
59 | #ifdef HAVE_RT_PUSH_IPI | ||
60 | /* | ||
61 | * In order to avoid a thundering herd of CPUs all lowering | ||
62 | * their priorities at the same time while a single CPU has an | ||
63 | * RT task that can migrate and is waiting to run, the other | ||
64 | * CPUs would all try to take that CPU's rq lock and could | ||
65 | * create heavy contention. Sending an IPI to that CPU and | ||
66 | * letting that CPU push the RT task to where it should go | ||
67 | * may be the better approach. | ||
68 | */ | ||
69 | SCHED_FEAT(RT_PUSH_IPI, true) | ||
70 | #endif | ||
71 | |||
59 | SCHED_FEAT(FORCE_SD_OVERLAP, false) | 72 | SCHED_FEAT(FORCE_SD_OVERLAP, false) |
60 | SCHED_FEAT(RT_RUNTIME_SHARE, true) | 73 | SCHED_FEAT(RT_RUNTIME_SHARE, true) |
61 | SCHED_FEAT(LB_MIN, false) | 74 | SCHED_FEAT(LB_MIN, false) |
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index d27d36476dca..deef1caa94c6 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
@@ -158,8 +158,7 @@ static void cpuidle_idle_call(void) | |||
158 | * is used from another cpu as a broadcast timer, this call may | 158 | * is used from another cpu as a broadcast timer, this call may |
159 | * fail if it is not available | 159 | * fail if it is not available |
160 | */ | 160 | */ |
161 | if (broadcast && | 161 | if (broadcast && tick_broadcast_enter()) |
162 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) | ||
163 | goto use_default; | 162 | goto use_default; |
164 | 163 | ||
165 | /* Take note of the planned idle state. */ | 164 | /* Take note of the planned idle state. */ |
@@ -176,7 +175,7 @@ static void cpuidle_idle_call(void) | |||
176 | idle_set_state(this_rq(), NULL); | 175 | idle_set_state(this_rq(), NULL); |
177 | 176 | ||
178 | if (broadcast) | 177 | if (broadcast) |
179 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); | 178 | tick_broadcast_exit(); |
180 | 179 | ||
181 | /* | 180 | /* |
182 | * Give the governor an opportunity to reflect on the outcome | 181 | * Give the governor an opportunity to reflect on the outcome |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f4d4b077eba0..575da76a3874 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -6,6 +6,7 @@ | |||
6 | #include "sched.h" | 6 | #include "sched.h" |
7 | 7 | ||
8 | #include <linux/slab.h> | 8 | #include <linux/slab.h> |
9 | #include <linux/irq_work.h> | ||
9 | 10 | ||
10 | int sched_rr_timeslice = RR_TIMESLICE; | 11 | int sched_rr_timeslice = RR_TIMESLICE; |
11 | 12 | ||
@@ -59,7 +60,11 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
59 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 60 | raw_spin_unlock(&rt_b->rt_runtime_lock); |
60 | } | 61 | } |
61 | 62 | ||
62 | void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | 63 | #ifdef CONFIG_SMP |
64 | static void push_irq_work_func(struct irq_work *work); | ||
65 | #endif | ||
66 | |||
67 | void init_rt_rq(struct rt_rq *rt_rq) | ||
63 | { | 68 | { |
64 | struct rt_prio_array *array; | 69 | struct rt_prio_array *array; |
65 | int i; | 70 | int i; |
@@ -78,7 +83,14 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
78 | rt_rq->rt_nr_migratory = 0; | 83 | rt_rq->rt_nr_migratory = 0; |
79 | rt_rq->overloaded = 0; | 84 | rt_rq->overloaded = 0; |
80 | plist_head_init(&rt_rq->pushable_tasks); | 85 | plist_head_init(&rt_rq->pushable_tasks); |
86 | |||
87 | #ifdef HAVE_RT_PUSH_IPI | ||
88 | rt_rq->push_flags = 0; | ||
89 | rt_rq->push_cpu = nr_cpu_ids; | ||
90 | raw_spin_lock_init(&rt_rq->push_lock); | ||
91 | init_irq_work(&rt_rq->push_work, push_irq_work_func); | ||
81 | #endif | 92 | #endif |
93 | #endif /* CONFIG_SMP */ | ||
82 | /* We start in dequeued state, because no RT tasks are queued */ | 94 |
83 | rt_rq->rt_queued = 0; | 95 | rt_rq->rt_queued = 0; |
84 | 96 | ||
@@ -193,7 +205,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
193 | if (!rt_se) | 205 | if (!rt_se) |
194 | goto err_free_rq; | 206 | goto err_free_rq; |
195 | 207 | ||
196 | init_rt_rq(rt_rq, cpu_rq(i)); | 208 | init_rt_rq(rt_rq); |
197 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | 209 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; |
198 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); | 210 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); |
199 | } | 211 | } |
@@ -1778,6 +1790,164 @@ static void push_rt_tasks(struct rq *rq) | |||
1778 | ; | 1790 | ; |
1779 | } | 1791 | } |
1780 | 1792 | ||
1793 | #ifdef HAVE_RT_PUSH_IPI | ||
1794 | /* | ||
1795 | * The search for the next cpu always starts at rq->cpu and ends | ||
1796 | * when we reach rq->cpu again. It will never return rq->cpu. | ||
1797 | * This returns the next cpu to check, or nr_cpu_ids if the loop | ||
1798 | * is complete. | ||
1799 | * | ||
1800 | * rq->rt.push_cpu holds the last cpu returned by this function, | ||
1801 | * or if this is the first instance, it must hold rq->cpu. | ||
1802 | */ | ||
1803 | static int rto_next_cpu(struct rq *rq) | ||
1804 | { | ||
1805 | int prev_cpu = rq->rt.push_cpu; | ||
1806 | int cpu; | ||
1807 | |||
1808 | cpu = cpumask_next(prev_cpu, rq->rd->rto_mask); | ||
1809 | |||
1810 | /* | ||
1811 | * If the previous cpu is less than the rq's CPU, then it already | ||
1812 | * passed the end of the mask, and has started from the beginning. | ||
1813 | * We end if the next CPU is greater or equal to rq's CPU. | ||
1814 | */ | ||
1815 | if (prev_cpu < rq->cpu) { | ||
1816 | if (cpu >= rq->cpu) | ||
1817 | return nr_cpu_ids; | ||
1818 | |||
1819 | } else if (cpu >= nr_cpu_ids) { | ||
1820 | /* | ||
1821 | * We passed the end of the mask, start at the beginning. | ||
1822 | * If the result is greater or equal to the rq's CPU, then | ||
1823 | * the loop is finished. | ||
1824 | */ | ||
1825 | cpu = cpumask_first(rq->rd->rto_mask); | ||
1826 | if (cpu >= rq->cpu) | ||
1827 | return nr_cpu_ids; | ||
1828 | } | ||
1829 | rq->rt.push_cpu = cpu; | ||
1830 | |||
1831 | /* Return cpu to let the caller know if the loop is finished or not */ | ||
1832 | return cpu; | ||
1833 | } | ||
1834 | |||
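The wrap-around walk is easier to see outside the kernel. The sketch below replaces the cpumask with a toy bit array; NR_CPUS, the mask contents and the starting CPU are all made up. It visits every overloaded CPU exactly once, in mask order starting after the caller's CPU, and never returns the starting CPU itself.

#include <stdio.h>

#define NR_CPUS 8

/* Next set bit strictly after 'prev', or NR_CPUS if none. */
static int next_set(const int *mask, int prev)
{
	for (int cpu = prev + 1; cpu < NR_CPUS; cpu++)
		if (mask[cpu])
			return cpu;
	return NR_CPUS;
}

/* Walk the mask starting after 'start', wrapping around, and stop (return
 * NR_CPUS) once the walk would reach 'start' again. */
static int rto_next_cpu(const int *mask, int start, int prev)
{
	int cpu = next_set(mask, prev);

	if (prev < start)		/* already wrapped: stop at 'start' */
		return cpu >= start ? NR_CPUS : cpu;

	if (cpu >= NR_CPUS) {		/* hit the end: wrap to the beginning */
		cpu = next_set(mask, -1);
		if (cpu >= start)
			return NR_CPUS;
	}
	return cpu;
}

int main(void)
{
	int rto_mask[NR_CPUS] = { [1] = 1, [3] = 1, [6] = 1 };
	int start = 4, cpu = start;

	while ((cpu = rto_next_cpu(rto_mask, start, cpu)) < NR_CPUS)
		printf("visit cpu %d\n", cpu);	/* prints 6, 1, 3 */
	return 0;
}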
1835 | static int find_next_push_cpu(struct rq *rq) | ||
1836 | { | ||
1837 | struct rq *next_rq; | ||
1838 | int cpu; | ||
1839 | |||
1840 | while (1) { | ||
1841 | cpu = rto_next_cpu(rq); | ||
1842 | if (cpu >= nr_cpu_ids) | ||
1843 | break; | ||
1844 | next_rq = cpu_rq(cpu); | ||
1845 | |||
1846 | /* Make sure the next rq can push to this rq */ | ||
1847 | if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr) | ||
1848 | break; | ||
1849 | } | ||
1850 | |||
1851 | return cpu; | ||
1852 | } | ||
1853 | |||
1854 | #define RT_PUSH_IPI_EXECUTING 1 | ||
1855 | #define RT_PUSH_IPI_RESTART 2 | ||
1856 | |||
1857 | static void tell_cpu_to_push(struct rq *rq) | ||
1858 | { | ||
1859 | int cpu; | ||
1860 | |||
1861 | if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { | ||
1862 | raw_spin_lock(&rq->rt.push_lock); | ||
1863 | /* Make sure it's still executing */ | ||
1864 | if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { | ||
1865 | /* | ||
1866 | * Tell the IPI to restart the loop as things have | ||
1867 | * changed since it started. | ||
1868 | */ | ||
1869 | rq->rt.push_flags |= RT_PUSH_IPI_RESTART; | ||
1870 | raw_spin_unlock(&rq->rt.push_lock); | ||
1871 | return; | ||
1872 | } | ||
1873 | raw_spin_unlock(&rq->rt.push_lock); | ||
1874 | } | ||
1875 | |||
1876 | /* When here, there's no IPI going around */ | ||
1877 | |||
1878 | rq->rt.push_cpu = rq->cpu; | ||
1879 | cpu = find_next_push_cpu(rq); | ||
1880 | if (cpu >= nr_cpu_ids) | ||
1881 | return; | ||
1882 | |||
1883 | rq->rt.push_flags = RT_PUSH_IPI_EXECUTING; | ||
1884 | |||
1885 | irq_work_queue_on(&rq->rt.push_work, cpu); | ||
1886 | } | ||
1887 | |||
1888 | /* Called from hardirq context */ | ||
1889 | static void try_to_push_tasks(void *arg) | ||
1890 | { | ||
1891 | struct rt_rq *rt_rq = arg; | ||
1892 | struct rq *rq, *src_rq; | ||
1893 | int this_cpu; | ||
1894 | int cpu; | ||
1895 | |||
1896 | this_cpu = rt_rq->push_cpu; | ||
1897 | |||
1898 | /* Paranoid check */ | ||
1899 | BUG_ON(this_cpu != smp_processor_id()); | ||
1900 | |||
1901 | rq = cpu_rq(this_cpu); | ||
1902 | src_rq = rq_of_rt_rq(rt_rq); | ||
1903 | |||
1904 | again: | ||
1905 | if (has_pushable_tasks(rq)) { | ||
1906 | raw_spin_lock(&rq->lock); | ||
1907 | push_rt_task(rq); | ||
1908 | raw_spin_unlock(&rq->lock); | ||
1909 | } | ||
1910 | |||
1911 | /* Pass the IPI to the next rt overloaded queue */ | ||
1912 | raw_spin_lock(&rt_rq->push_lock); | ||
1913 | /* | ||
1914 | * If the source queue changed since the IPI went out, | ||
1915 | * we need to restart the search from that CPU again. | ||
1916 | */ | ||
1917 | if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) { | ||
1918 | rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART; | ||
1919 | rt_rq->push_cpu = src_rq->cpu; | ||
1920 | } | ||
1921 | |||
1922 | cpu = find_next_push_cpu(src_rq); | ||
1923 | |||
1924 | if (cpu >= nr_cpu_ids) | ||
1925 | rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING; | ||
1926 | raw_spin_unlock(&rt_rq->push_lock); | ||
1927 | |||
1928 | if (cpu >= nr_cpu_ids) | ||
1929 | return; | ||
1930 | |||
1931 | /* | ||
1932 | * It is possible that a restart caused this CPU to be | ||
1933 | * chosen again. Don't bother with an IPI, just see if we | ||
1934 | * have more to push. | ||
1935 | */ | ||
1936 | if (unlikely(cpu == rq->cpu)) | ||
1937 | goto again; | ||
1938 | |||
1939 | /* Try the next RT overloaded CPU */ | ||
1940 | irq_work_queue_on(&rt_rq->push_work, cpu); | ||
1941 | } | ||
1942 | |||
1943 | static void push_irq_work_func(struct irq_work *work) | ||
1944 | { | ||
1945 | struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work); | ||
1946 | |||
1947 | try_to_push_tasks(rt_rq); | ||
1948 | } | ||
1949 | #endif /* HAVE_RT_PUSH_IPI */ | ||
1950 | |||
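The interaction of the two push_flags bits can be walked through single-threaded. The sketch below drops the locks and the real irq_work queueing and only shows how a second overload request, arriving while an IPI chain is already in flight, turns into a restart of that chain rather than a new one.

#include <stdio.h>

#define RT_PUSH_IPI_EXECUTING	1
#define RT_PUSH_IPI_RESTART	2

static int push_flags;

/* A new RT overload condition appears on the source rq. */
static void tell_cpu_to_push(void)
{
	if (push_flags & RT_PUSH_IPI_EXECUTING) {
		/* A chain is already running: just ask it to restart. */
		push_flags |= RT_PUSH_IPI_RESTART;
		return;
	}
	push_flags = RT_PUSH_IPI_EXECUTING;
	printf("queue irq_work on the first overloaded cpu\n");
}

/* Runs on each overloaded CPU in turn (called from a plain loop here). */
static void try_to_push_tasks(int more_cpus)
{
	if (push_flags & RT_PUSH_IPI_RESTART) {
		push_flags &= ~RT_PUSH_IPI_RESTART;
		printf("restart the search from the source cpu\n");
	}
	if (more_cpus)
		printf("pass the irq_work to the next overloaded cpu\n");
	else
		push_flags &= ~RT_PUSH_IPI_EXECUTING;
}

int main(void)
{
	tell_cpu_to_push();	/* starts the chain */
	tell_cpu_to_push();	/* second request only sets RESTART */
	try_to_push_tasks(1);	/* notices RESTART, keeps going */
	try_to_push_tasks(0);	/* chain done, EXECUTING cleared */
	return 0;
}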
1781 | static int pull_rt_task(struct rq *this_rq) | 1951 | static int pull_rt_task(struct rq *this_rq) |
1782 | { | 1952 | { |
1783 | int this_cpu = this_rq->cpu, ret = 0, cpu; | 1953 | int this_cpu = this_rq->cpu, ret = 0, cpu; |
@@ -1793,6 +1963,13 @@ static int pull_rt_task(struct rq *this_rq) | |||
1793 | */ | 1963 | */ |
1794 | smp_rmb(); | 1964 | smp_rmb(); |
1795 | 1965 | ||
1966 | #ifdef HAVE_RT_PUSH_IPI | ||
1967 | if (sched_feat(RT_PUSH_IPI)) { | ||
1968 | tell_cpu_to_push(this_rq); | ||
1969 | return 0; | ||
1970 | } | ||
1971 | #endif | ||
1972 | |||
1796 | for_each_cpu(cpu, this_rq->rd->rto_mask) { | 1973 | for_each_cpu(cpu, this_rq->rd->rto_mask) { |
1797 | if (this_cpu == cpu) | 1974 | if (this_cpu == cpu) |
1798 | continue; | 1975 | continue; |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index dc0f435a2779..e0e129993958 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -6,6 +6,7 @@ | |||
6 | #include <linux/mutex.h> | 6 | #include <linux/mutex.h> |
7 | #include <linux/spinlock.h> | 7 | #include <linux/spinlock.h> |
8 | #include <linux/stop_machine.h> | 8 | #include <linux/stop_machine.h> |
9 | #include <linux/irq_work.h> | ||
9 | #include <linux/tick.h> | 10 | #include <linux/tick.h> |
10 | #include <linux/slab.h> | 11 | #include <linux/slab.h> |
11 | 12 | ||
@@ -362,8 +363,14 @@ struct cfs_rq { | |||
362 | * Under CFS, load is tracked on a per-entity basis and aggregated up. | 363 | * Under CFS, load is tracked on a per-entity basis and aggregated up. |
363 | * This allows for the description of both thread and group usage (in | 364 | * This allows for the description of both thread and group usage (in |
364 | * the FAIR_GROUP_SCHED case). | 365 | * the FAIR_GROUP_SCHED case). |
366 | * runnable_load_avg is the sum of the load_avg_contrib of the | ||
367 | * sched_entities on the rq. | ||
368 | * blocked_load_avg is similar to runnable_load_avg except that it is | ||
369 | * summed over the blocked sched_entities on the rq. | ||
370 | * utilization_load_avg is the sum of the average running time of the | ||
371 | * sched_entities on the rq. | ||
365 | */ | 372 | */ |
366 | unsigned long runnable_load_avg, blocked_load_avg; | 373 | unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg; |
367 | atomic64_t decay_counter; | 374 | atomic64_t decay_counter; |
368 | u64 last_decay; | 375 | u64 last_decay; |
369 | atomic_long_t removed_load; | 376 | atomic_long_t removed_load; |
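As a rough guide to which per-entity contribution feeds which of the three sums, here is a toy model with invented contribution values; the real update points involve PELT decay, migration and group entities and are not reproduced here.

#include <stdio.h>

struct toy_cfs_rq {
	unsigned long runnable_load_avg;	/* load_avg_contrib of queued entities */
	unsigned long blocked_load_avg;		/* same contribution, for blocked entities */
	unsigned long utilization_load_avg;	/* average running time of queued entities */
};

static void toy_enqueue(struct toy_cfs_rq *rq, unsigned long load_contrib,
			unsigned long util_contrib)
{
	rq->runnable_load_avg += load_contrib;
	rq->utilization_load_avg += util_contrib;
}

static void toy_sleep(struct toy_cfs_rq *rq, unsigned long load_contrib,
		      unsigned long util_contrib)
{
	rq->runnable_load_avg -= load_contrib;
	rq->blocked_load_avg += load_contrib;
	rq->utilization_load_avg -= util_contrib;
}

int main(void)
{
	struct toy_cfs_rq rq = { 0, 0, 0 };

	toy_enqueue(&rq, 512, 300);	/* a half-weight task, about 30% busy */
	toy_sleep(&rq, 512, 300);	/* it blocks: its load moves to the blocked sum */

	printf("%lu %lu %lu\n", rq.runnable_load_avg,
	       rq.blocked_load_avg, rq.utilization_load_avg);
	return 0;
}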
@@ -418,6 +425,11 @@ static inline int rt_bandwidth_enabled(void) | |||
418 | return sysctl_sched_rt_runtime >= 0; | 425 | return sysctl_sched_rt_runtime >= 0; |
419 | } | 426 | } |
420 | 427 | ||
428 | /* RT IPI pull logic requires IRQ_WORK */ | ||
429 | #ifdef CONFIG_IRQ_WORK | ||
430 | # define HAVE_RT_PUSH_IPI | ||
431 | #endif | ||
432 | |||
421 | /* Real-Time classes' related field in a runqueue: */ | 433 | /* Real-Time classes' related field in a runqueue: */ |
422 | struct rt_rq { | 434 | struct rt_rq { |
423 | struct rt_prio_array active; | 435 | struct rt_prio_array active; |
@@ -435,7 +447,13 @@ struct rt_rq { | |||
435 | unsigned long rt_nr_total; | 447 | unsigned long rt_nr_total; |
436 | int overloaded; | 448 | int overloaded; |
437 | struct plist_head pushable_tasks; | 449 | struct plist_head pushable_tasks; |
450 | #ifdef HAVE_RT_PUSH_IPI | ||
451 | int push_flags; | ||
452 | int push_cpu; | ||
453 | struct irq_work push_work; | ||
454 | raw_spinlock_t push_lock; | ||
438 | #endif | 455 | #endif |
456 | #endif /* CONFIG_SMP */ | ||
439 | int rt_queued; | 457 | int rt_queued; |
440 | 458 | ||
441 | int rt_throttled; | 459 | int rt_throttled; |
@@ -597,6 +615,7 @@ struct rq { | |||
597 | struct sched_domain *sd; | 615 | struct sched_domain *sd; |
598 | 616 | ||
599 | unsigned long cpu_capacity; | 617 | unsigned long cpu_capacity; |
618 | unsigned long cpu_capacity_orig; | ||
600 | 619 | ||
601 | unsigned char idle_balance; | 620 | unsigned char idle_balance; |
602 | /* For active balancing */ | 621 | /* For active balancing */ |
@@ -807,7 +826,7 @@ struct sched_group_capacity { | |||
807 | * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity | 826 | * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity |
808 | * for a single CPU. | 827 | * for a single CPU. |
809 | */ | 828 | */ |
810 | unsigned int capacity, capacity_orig; | 829 | unsigned int capacity; |
811 | unsigned long next_update; | 830 | unsigned long next_update; |
812 | int imbalance; /* XXX unrelated to capacity but shared group state */ | 831 | int imbalance; /* XXX unrelated to capacity but shared group state */ |
813 | /* | 832 | /* |
@@ -1368,9 +1387,18 @@ static inline int hrtick_enabled(struct rq *rq) | |||
1368 | 1387 | ||
1369 | #ifdef CONFIG_SMP | 1388 | #ifdef CONFIG_SMP |
1370 | extern void sched_avg_update(struct rq *rq); | 1389 | extern void sched_avg_update(struct rq *rq); |
1390 | |||
1391 | #ifndef arch_scale_freq_capacity | ||
1392 | static __always_inline | ||
1393 | unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) | ||
1394 | { | ||
1395 | return SCHED_CAPACITY_SCALE; | ||
1396 | } | ||
1397 | #endif | ||
1398 | |||
1371 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | 1399 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) |
1372 | { | 1400 | { |
1373 | rq->rt_avg += rt_delta; | 1401 | rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); |
1374 | sched_avg_update(rq); | 1402 | sched_avg_update(rq); |
1375 | } | 1403 | } |
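A minimal arithmetic sketch of the frequency-weighted accumulation: with the default arch_scale_freq_capacity() the factor is SCHED_CAPACITY_SCALE, so nothing changes, while an architecture hook reporting a lower current frequency scales the accumulated RT/IRQ time accordingly. The 614 factor (about 60% of scale) and the 2 ms delta below are invented.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024ULL

int main(void)
{
	unsigned long long rt_avg = 0;
	unsigned long long rt_delta = 2000000;	/* ns of RT/IRQ time this period */
	unsigned long long freq_cap = 614;	/* hypothetical current-frequency capacity */

	/* Weight the RT/IRQ time by the current frequency's capacity,
	 * as the patched sched_rt_avg_update() does. */
	rt_avg += rt_delta * freq_cap;

	/* Back in SCHED_CAPACITY_SCALE units for comparison purposes. */
	printf("%llu\n", rt_avg / SCHED_CAPACITY_SCALE);
	return 0;
}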
1376 | #else | 1404 | #else |
@@ -1643,8 +1671,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu); | |||
1643 | extern void print_dl_stats(struct seq_file *m, int cpu); | 1671 | extern void print_dl_stats(struct seq_file *m, int cpu); |
1644 | 1672 | ||
1645 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); | 1673 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); |
1646 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); | 1674 | extern void init_rt_rq(struct rt_rq *rt_rq); |
1647 | extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq); | 1675 | extern void init_dl_rq(struct dl_rq *dl_rq); |
1648 | 1676 | ||
1649 | extern void cfs_bandwidth_usage_inc(void); | 1677 | extern void cfs_bandwidth_usage_inc(void); |
1650 | extern void cfs_bandwidth_usage_dec(void); | 1678 | extern void cfs_bandwidth_usage_dec(void); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 88ea2d6e0031..ce410bb9f2e1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -1228,6 +1228,14 @@ static struct ctl_table vm_table[] = { | |||
1228 | .extra1 = &zero, | 1228 | .extra1 = &zero, |
1229 | }, | 1229 | }, |
1230 | { | 1230 | { |
1231 | .procname = "dirtytime_expire_seconds", | ||
1232 | .data = &dirtytime_expire_interval, | ||
1233 | .maxlen = sizeof(dirty_expire_interval), | ||
1234 | .mode = 0644, | ||
1235 | .proc_handler = dirtytime_interval_handler, | ||
1236 | .extra1 = &zero, | ||
1237 | }, | ||
1238 | { | ||
1231 | .procname = "nr_pdflush_threads", | 1239 | .procname = "nr_pdflush_threads", |
1232 | .mode = 0444 /* read-only */, | 1240 | .mode = 0444 /* read-only */, |
1233 | .proc_handler = pdflush_proc_obsolete, | 1241 | .proc_handler = pdflush_proc_obsolete, |
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index d626dc98e8df..579ce1b929af 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
@@ -33,12 +33,6 @@ config ARCH_USES_GETTIMEOFFSET | |||
33 | config GENERIC_CLOCKEVENTS | 33 | config GENERIC_CLOCKEVENTS |
34 | bool | 34 | bool |
35 | 35 | ||
36 | # Migration helper. Builds, but does not invoke | ||
37 | config GENERIC_CLOCKEVENTS_BUILD | ||
38 | bool | ||
39 | default y | ||
40 | depends on GENERIC_CLOCKEVENTS | ||
41 | |||
42 | # Architecture can handle broadcast in a driver-agnostic way | 36 | # Architecture can handle broadcast in a driver-agnostic way |
43 | config ARCH_HAS_TICK_BROADCAST | 37 | config ARCH_HAS_TICK_BROADCAST |
44 | bool | 38 | bool |
diff --git a/kernel/time/Makefile b/kernel/time/Makefile index c09c07817d7a..01f0312419b3 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile | |||
@@ -2,15 +2,13 @@ obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o | |||
2 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o | 2 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o |
3 | obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o | 3 | obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o |
4 | 4 | ||
5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o | 5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o tick-common.o |
6 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o | ||
7 | ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) | 6 | ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) |
8 | obj-y += tick-broadcast.o | 7 | obj-y += tick-broadcast.o |
9 | obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o | 8 | obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o |
10 | endif | 9 | endif |
11 | obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o | 10 | obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o |
12 | obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o | 11 | obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o |
13 | obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o | ||
14 | obj-$(CONFIG_TIMER_STATS) += timer_stats.o | 12 | obj-$(CONFIG_TIMER_STATS) += timer_stats.o |
15 | obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o | 13 | obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o |
16 | obj-$(CONFIG_TEST_UDELAY) += test_udelay.o | 14 | obj-$(CONFIG_TEST_UDELAY) += test_udelay.o |
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 55449909f114..25d942d1da27 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -94,25 +94,76 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) | |||
94 | } | 94 | } |
95 | EXPORT_SYMBOL_GPL(clockevent_delta2ns); | 95 | EXPORT_SYMBOL_GPL(clockevent_delta2ns); |
96 | 96 | ||
97 | static int __clockevents_set_state(struct clock_event_device *dev, | ||
98 | enum clock_event_state state) | ||
99 | { | ||
100 | /* Transition with legacy set_mode() callback */ | ||
101 | if (dev->set_mode) { | ||
102 | /* Legacy callback doesn't support new modes */ | ||
103 | if (state > CLOCK_EVT_STATE_ONESHOT) | ||
104 | return -ENOSYS; | ||
105 | /* | ||
106 | * 'clock_event_state' and 'clock_event_mode' have 1-to-1 | ||
107 | * mapping until *_ONESHOT, and so a simple cast will work. | ||
108 | */ | ||
109 | dev->set_mode((enum clock_event_mode)state, dev); | ||
110 | dev->mode = (enum clock_event_mode)state; | ||
111 | return 0; | ||
112 | } | ||
113 | |||
114 | if (dev->features & CLOCK_EVT_FEAT_DUMMY) | ||
115 | return 0; | ||
116 | |||
117 | /* Transition with new state-specific callbacks */ | ||
118 | switch (state) { | ||
119 | case CLOCK_EVT_STATE_DETACHED: | ||
120 | /* | ||
121 | * This is an internal state, which is guaranteed to go from | ||
122 | * SHUTDOWN to DETACHED. No driver interaction required. | ||
123 | */ | ||
124 | return 0; | ||
125 | |||
126 | case CLOCK_EVT_STATE_SHUTDOWN: | ||
127 | return dev->set_state_shutdown(dev); | ||
128 | |||
129 | case CLOCK_EVT_STATE_PERIODIC: | ||
130 | /* Core internal bug */ | ||
131 | if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC)) | ||
132 | return -ENOSYS; | ||
133 | return dev->set_state_periodic(dev); | ||
134 | |||
135 | case CLOCK_EVT_STATE_ONESHOT: | ||
136 | /* Core internal bug */ | ||
137 | if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) | ||
138 | return -ENOSYS; | ||
139 | return dev->set_state_oneshot(dev); | ||
140 | |||
141 | default: | ||
142 | return -ENOSYS; | ||
143 | } | ||
144 | } | ||
145 | |||
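The dispatch above can be mocked in userspace to show the control flow. The struct below keeps only the per-state callbacks named in this patch and replaces every kernel type with a local stand-in, so it is only a sketch of the shape of the new interface.

#include <stdio.h>
#include <errno.h>

enum toy_state { TOY_DETACHED, TOY_SHUTDOWN, TOY_PERIODIC, TOY_ONESHOT };

struct toy_dev {
	int (*set_state_shutdown)(struct toy_dev *d);
	int (*set_state_periodic)(struct toy_dev *d);
	int (*set_state_oneshot)(struct toy_dev *d);
	enum toy_state state;
};

static int toy_shutdown(struct toy_dev *d) { (void)d; puts("shutdown"); return 0; }
static int toy_periodic(struct toy_dev *d) { (void)d; puts("periodic"); return 0; }
static int toy_oneshot(struct toy_dev *d)  { (void)d; puts("oneshot");  return 0; }

static int toy_set_state(struct toy_dev *d, enum toy_state state)
{
	switch (state) {
	case TOY_DETACHED: return 0;	/* internal state, no driver callback */
	case TOY_SHUTDOWN: return d->set_state_shutdown(d);
	case TOY_PERIODIC: return d->set_state_periodic(d);
	case TOY_ONESHOT:  return d->set_state_oneshot(d);
	default:           return -ENOSYS;
	}
}

int main(void)
{
	struct toy_dev dev = {
		.set_state_shutdown = toy_shutdown,
		.set_state_periodic = toy_periodic,
		.set_state_oneshot  = toy_oneshot,
		.state = TOY_DETACHED,
	};

	if (!toy_set_state(&dev, TOY_ONESHOT))
		dev.state = TOY_ONESHOT;
	return 0;
}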
97 | /** | 146 | /** |
98 | * clockevents_set_mode - set the operating mode of a clock event device | 147 | * clockevents_set_state - set the operating state of a clock event device |
99 | * @dev: device to modify | 148 | * @dev: device to modify |
100 | * @mode: new mode | 149 | * @state: new state |
101 | * | 150 | * |
102 | * Must be called with interrupts disabled ! | 151 | * Must be called with interrupts disabled ! |
103 | */ | 152 | */ |
104 | void clockevents_set_mode(struct clock_event_device *dev, | 153 | void clockevents_set_state(struct clock_event_device *dev, |
105 | enum clock_event_mode mode) | 154 | enum clock_event_state state) |
106 | { | 155 | { |
107 | if (dev->mode != mode) { | 156 | if (dev->state != state) { |
108 | dev->set_mode(mode, dev); | 157 | if (__clockevents_set_state(dev, state)) |
109 | dev->mode = mode; | 158 | return; |
159 | |||
160 | dev->state = state; | ||
110 | 161 | ||
111 | /* | 162 | /* |
112 | * A nsec2cyc multiplicator of 0 is invalid and we'd crash | 163 | * A nsec2cyc multiplicator of 0 is invalid and we'd crash |
113 | * on it, so fix it up and emit a warning: | 164 | * on it, so fix it up and emit a warning: |
114 | */ | 165 | */ |
115 | if (mode == CLOCK_EVT_MODE_ONESHOT) { | 166 | if (state == CLOCK_EVT_STATE_ONESHOT) { |
116 | if (unlikely(!dev->mult)) { | 167 | if (unlikely(!dev->mult)) { |
117 | dev->mult = 1; | 168 | dev->mult = 1; |
118 | WARN_ON(1); | 169 | WARN_ON(1); |
@@ -127,10 +178,28 @@ void clockevents_set_mode(struct clock_event_device *dev, | |||
127 | */ | 178 | */ |
128 | void clockevents_shutdown(struct clock_event_device *dev) | 179 | void clockevents_shutdown(struct clock_event_device *dev) |
129 | { | 180 | { |
130 | clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); | 181 | clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); |
131 | dev->next_event.tv64 = KTIME_MAX; | 182 | dev->next_event.tv64 = KTIME_MAX; |
132 | } | 183 | } |
133 | 184 | ||
185 | /** | ||
186 | * clockevents_tick_resume - Resume the tick device before using it again | ||
187 | * @dev: device to resume | ||
188 | */ | ||
189 | int clockevents_tick_resume(struct clock_event_device *dev) | ||
190 | { | ||
191 | int ret = 0; | ||
192 | |||
193 | if (dev->set_mode) { | ||
194 | dev->set_mode(CLOCK_EVT_MODE_RESUME, dev); | ||
195 | dev->mode = CLOCK_EVT_MODE_RESUME; | ||
196 | } else if (dev->tick_resume) { | ||
197 | ret = dev->tick_resume(dev); | ||
198 | } | ||
199 | |||
200 | return ret; | ||
201 | } | ||
202 | |||
134 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST | 203 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST |
135 | 204 | ||
136 | /* Limit min_delta to a jiffie */ | 205 | /* Limit min_delta to a jiffie */ |
@@ -183,7 +252,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) | |||
183 | delta = dev->min_delta_ns; | 252 | delta = dev->min_delta_ns; |
184 | dev->next_event = ktime_add_ns(ktime_get(), delta); | 253 | dev->next_event = ktime_add_ns(ktime_get(), delta); |
185 | 254 | ||
186 | if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) | 255 | if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) |
187 | return 0; | 256 | return 0; |
188 | 257 | ||
189 | dev->retries++; | 258 | dev->retries++; |
@@ -220,7 +289,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) | |||
220 | delta = dev->min_delta_ns; | 289 | delta = dev->min_delta_ns; |
221 | dev->next_event = ktime_add_ns(ktime_get(), delta); | 290 | dev->next_event = ktime_add_ns(ktime_get(), delta); |
222 | 291 | ||
223 | if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) | 292 | if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) |
224 | return 0; | 293 | return 0; |
225 | 294 | ||
226 | dev->retries++; | 295 | dev->retries++; |
@@ -252,7 +321,7 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, | |||
252 | 321 | ||
253 | dev->next_event = expires; | 322 | dev->next_event = expires; |
254 | 323 | ||
255 | if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) | 324 | if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) |
256 | return 0; | 325 | return 0; |
257 | 326 | ||
258 | /* Shortcut for clockevent devices that can deal with ktime. */ | 327 | /* Shortcut for clockevent devices that can deal with ktime. */ |
@@ -297,7 +366,7 @@ static int clockevents_replace(struct clock_event_device *ced) | |||
297 | struct clock_event_device *dev, *newdev = NULL; | 366 | struct clock_event_device *dev, *newdev = NULL; |
298 | 367 | ||
299 | list_for_each_entry(dev, &clockevent_devices, list) { | 368 | list_for_each_entry(dev, &clockevent_devices, list) { |
300 | if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED) | 369 | if (dev == ced || dev->state != CLOCK_EVT_STATE_DETACHED) |
301 | continue; | 370 | continue; |
302 | 371 | ||
303 | if (!tick_check_replacement(newdev, dev)) | 372 | if (!tick_check_replacement(newdev, dev)) |
@@ -323,7 +392,7 @@ static int clockevents_replace(struct clock_event_device *ced) | |||
323 | static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) | 392 | static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) |
324 | { | 393 | { |
325 | /* Fast track. Device is unused */ | 394 | /* Fast track. Device is unused */ |
326 | if (ced->mode == CLOCK_EVT_MODE_UNUSED) { | 395 | if (ced->state == CLOCK_EVT_STATE_DETACHED) { |
327 | list_del_init(&ced->list); | 396 | list_del_init(&ced->list); |
328 | return 0; | 397 | return 0; |
329 | } | 398 | } |
@@ -373,6 +442,37 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu) | |||
373 | } | 442 | } |
374 | EXPORT_SYMBOL_GPL(clockevents_unbind); | 443 | EXPORT_SYMBOL_GPL(clockevents_unbind); |
375 | 444 | ||
445 | /* Sanity check of state transition callbacks */ | ||
446 | static int clockevents_sanity_check(struct clock_event_device *dev) | ||
447 | { | ||
448 | /* Legacy set_mode() callback */ | ||
449 | if (dev->set_mode) { | ||
450 | /* We shouldn't be supporting new modes now */ | ||
451 | WARN_ON(dev->set_state_periodic || dev->set_state_oneshot || | ||
452 | dev->set_state_shutdown || dev->tick_resume); | ||
453 | |||
454 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); | ||
455 | return 0; | ||
456 | } | ||
457 | |||
458 | if (dev->features & CLOCK_EVT_FEAT_DUMMY) | ||
459 | return 0; | ||
460 | |||
461 | /* New state-specific callbacks */ | ||
462 | if (!dev->set_state_shutdown) | ||
463 | return -EINVAL; | ||
464 | |||
465 | if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && | ||
466 | !dev->set_state_periodic) | ||
467 | return -EINVAL; | ||
468 | |||
469 | if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) && | ||
470 | !dev->set_state_oneshot) | ||
471 | return -EINVAL; | ||
472 | |||
473 | return 0; | ||
474 | } | ||
475 | |||
376 | /** | 476 | /** |
377 | * clockevents_register_device - register a clock event device | 477 | * clockevents_register_device - register a clock event device |
378 | * @dev: device to register | 478 | * @dev: device to register |
@@ -381,7 +481,11 @@ void clockevents_register_device(struct clock_event_device *dev) | |||
381 | { | 481 | { |
382 | unsigned long flags; | 482 | unsigned long flags; |
383 | 483 | ||
384 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); | 484 | BUG_ON(clockevents_sanity_check(dev)); |
485 | |||
486 | /* Initialize state to DETACHED */ | ||
487 | dev->state = CLOCK_EVT_STATE_DETACHED; | ||
488 | |||
385 | if (!dev->cpumask) { | 489 | if (!dev->cpumask) { |
386 | WARN_ON(num_possible_cpus() > 1); | 490 | WARN_ON(num_possible_cpus() > 1); |
387 | dev->cpumask = cpumask_of(smp_processor_id()); | 491 | dev->cpumask = cpumask_of(smp_processor_id()); |
@@ -445,11 +549,11 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) | |||
445 | { | 549 | { |
446 | clockevents_config(dev, freq); | 550 | clockevents_config(dev, freq); |
447 | 551 | ||
448 | if (dev->mode == CLOCK_EVT_MODE_ONESHOT) | 552 | if (dev->state == CLOCK_EVT_STATE_ONESHOT) |
449 | return clockevents_program_event(dev, dev->next_event, false); | 553 | return clockevents_program_event(dev, dev->next_event, false); |
450 | 554 | ||
451 | if (dev->mode == CLOCK_EVT_MODE_PERIODIC) | 555 | if (dev->state == CLOCK_EVT_STATE_PERIODIC) |
452 | dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev); | 556 | return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); |
453 | 557 | ||
454 | return 0; | 558 | return 0; |
455 | } | 559 | } |
@@ -491,30 +595,27 @@ void clockevents_handle_noop(struct clock_event_device *dev) | |||
491 | * @old: device to release (can be NULL) | 595 | * @old: device to release (can be NULL) |
492 | * @new: device to request (can be NULL) | 596 | * @new: device to request (can be NULL) |
493 | * | 597 | * |
494 | * Called from the notifier chain. clockevents_lock is held already | 598 | * Called from various tick functions with clockevents_lock held and |
599 | * interrupts disabled. | ||
495 | */ | 600 | */ |
496 | void clockevents_exchange_device(struct clock_event_device *old, | 601 | void clockevents_exchange_device(struct clock_event_device *old, |
497 | struct clock_event_device *new) | 602 | struct clock_event_device *new) |
498 | { | 603 | { |
499 | unsigned long flags; | ||
500 | |||
501 | local_irq_save(flags); | ||
502 | /* | 604 | /* |
503 | * Caller releases a clock event device. We queue it into the | 605 | * Caller releases a clock event device. We queue it into the |
504 | * released list and do a notify add later. | 606 | * released list and do a notify add later. |
505 | */ | 607 | */ |
506 | if (old) { | 608 | if (old) { |
507 | module_put(old->owner); | 609 | module_put(old->owner); |
508 | clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); | 610 | clockevents_set_state(old, CLOCK_EVT_STATE_DETACHED); |
509 | list_del(&old->list); | 611 | list_del(&old->list); |
510 | list_add(&old->list, &clockevents_released); | 612 | list_add(&old->list, &clockevents_released); |
511 | } | 613 | } |
512 | 614 | ||
513 | if (new) { | 615 | if (new) { |
514 | BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); | 616 | BUG_ON(new->state != CLOCK_EVT_STATE_DETACHED); |
515 | clockevents_shutdown(new); | 617 | clockevents_shutdown(new); |
516 | } | 618 | } |
517 | local_irq_restore(flags); | ||
518 | } | 619 | } |
519 | 620 | ||
520 | /** | 621 | /** |
@@ -541,74 +642,40 @@ void clockevents_resume(void) | |||
541 | dev->resume(dev); | 642 | dev->resume(dev); |
542 | } | 643 | } |
543 | 644 | ||
544 | #ifdef CONFIG_GENERIC_CLOCKEVENTS | 645 | #ifdef CONFIG_HOTPLUG_CPU |
545 | /** | 646 | /** |
546 | * clockevents_notify - notification about relevant events | 647 | * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu |
547 | * Returns 0 on success, any other value on error | ||
548 | */ | 648 | */ |
549 | int clockevents_notify(unsigned long reason, void *arg) | 649 | void tick_cleanup_dead_cpu(int cpu) |
550 | { | 650 | { |
551 | struct clock_event_device *dev, *tmp; | 651 | struct clock_event_device *dev, *tmp; |
552 | unsigned long flags; | 652 | unsigned long flags; |
553 | int cpu, ret = 0; | ||
554 | 653 | ||
555 | raw_spin_lock_irqsave(&clockevents_lock, flags); | 654 | raw_spin_lock_irqsave(&clockevents_lock, flags); |
556 | 655 | ||
557 | switch (reason) { | 656 | tick_shutdown_broadcast_oneshot(cpu); |
558 | case CLOCK_EVT_NOTIFY_BROADCAST_ON: | 657 | tick_shutdown_broadcast(cpu); |
559 | case CLOCK_EVT_NOTIFY_BROADCAST_OFF: | 658 | tick_shutdown(cpu); |
560 | case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: | 659 | /* |
561 | tick_broadcast_on_off(reason, arg); | 660 | * Unregister the clock event devices which were |
562 | break; | 661 | * released from the users in the notify chain. |
563 | 662 | */ | |
564 | case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: | 663 | list_for_each_entry_safe(dev, tmp, &clockevents_released, list) |
565 | case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: | 664 | list_del(&dev->list); |
566 | ret = tick_broadcast_oneshot_control(reason); | 665 | /* |
567 | break; | 666 | * Now check whether the CPU has left unused per cpu devices |
568 | 667 | */ | |
569 | case CLOCK_EVT_NOTIFY_CPU_DYING: | 668 | list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { |
570 | tick_handover_do_timer(arg); | 669 | if (cpumask_test_cpu(cpu, dev->cpumask) && |
571 | break; | 670 | cpumask_weight(dev->cpumask) == 1 && |
572 | 671 | !tick_is_broadcast_device(dev)) { | |
573 | case CLOCK_EVT_NOTIFY_SUSPEND: | 672 | BUG_ON(dev->state != CLOCK_EVT_STATE_DETACHED); |
574 | tick_suspend(); | ||
575 | tick_suspend_broadcast(); | ||
576 | break; | ||
577 | |||
578 | case CLOCK_EVT_NOTIFY_RESUME: | ||
579 | tick_resume(); | ||
580 | break; | ||
581 | |||
582 | case CLOCK_EVT_NOTIFY_CPU_DEAD: | ||
583 | tick_shutdown_broadcast_oneshot(arg); | ||
584 | tick_shutdown_broadcast(arg); | ||
585 | tick_shutdown(arg); | ||
586 | /* | ||
587 | * Unregister the clock event devices which were | ||
588 | * released from the users in the notify chain. | ||
589 | */ | ||
590 | list_for_each_entry_safe(dev, tmp, &clockevents_released, list) | ||
591 | list_del(&dev->list); | 673 | list_del(&dev->list); |
592 | /* | ||
593 | * Now check whether the CPU has left unused per cpu devices | ||
594 | */ | ||
595 | cpu = *((int *)arg); | ||
596 | list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { | ||
597 | if (cpumask_test_cpu(cpu, dev->cpumask) && | ||
598 | cpumask_weight(dev->cpumask) == 1 && | ||
599 | !tick_is_broadcast_device(dev)) { | ||
600 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); | ||
601 | list_del(&dev->list); | ||
602 | } | ||
603 | } | 674 | } |
604 | break; | ||
605 | default: | ||
606 | break; | ||
607 | } | 675 | } |
608 | raw_spin_unlock_irqrestore(&clockevents_lock, flags); | 676 | raw_spin_unlock_irqrestore(&clockevents_lock, flags); |
609 | return ret; | ||
610 | } | 677 | } |
611 | EXPORT_SYMBOL_GPL(clockevents_notify); | 678 | #endif |
612 | 679 | ||
613 | #ifdef CONFIG_SYSFS | 680 | #ifdef CONFIG_SYSFS |
614 | struct bus_type clockevents_subsys = { | 681 | struct bus_type clockevents_subsys = { |
@@ -727,5 +794,3 @@ static int __init clockevents_init_sysfs(void) | |||
727 | } | 794 | } |
728 | device_initcall(clockevents_init_sysfs); | 795 | device_initcall(clockevents_init_sysfs); |
729 | #endif /* SYSFS */ | 796 | #endif /* SYSFS */ |
730 | |||
731 | #endif /* GENERIC_CLOCK_EVENTS */ | ||
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 4892352f0e49..15facb1b9c60 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -142,13 +142,6 @@ static void __clocksource_unstable(struct clocksource *cs) | |||
142 | schedule_work(&watchdog_work); | 142 | schedule_work(&watchdog_work); |
143 | } | 143 | } |
144 | 144 | ||
145 | static void clocksource_unstable(struct clocksource *cs, int64_t delta) | ||
146 | { | ||
147 | printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", | ||
148 | cs->name, delta); | ||
149 | __clocksource_unstable(cs); | ||
150 | } | ||
151 | |||
152 | /** | 145 | /** |
153 | * clocksource_mark_unstable - mark clocksource unstable via watchdog | 146 | * clocksource_mark_unstable - mark clocksource unstable via watchdog |
154 | * @cs: clocksource to be marked unstable | 147 | * @cs: clocksource to be marked unstable |
@@ -174,7 +167,7 @@ void clocksource_mark_unstable(struct clocksource *cs) | |||
174 | static void clocksource_watchdog(unsigned long data) | 167 | static void clocksource_watchdog(unsigned long data) |
175 | { | 168 | { |
176 | struct clocksource *cs; | 169 | struct clocksource *cs; |
177 | cycle_t csnow, wdnow, delta; | 170 | cycle_t csnow, wdnow, cslast, wdlast, delta; |
178 | int64_t wd_nsec, cs_nsec; | 171 | int64_t wd_nsec, cs_nsec; |
179 | int next_cpu, reset_pending; | 172 | int next_cpu, reset_pending; |
180 | 173 | ||
@@ -213,6 +206,8 @@ static void clocksource_watchdog(unsigned long data) | |||
213 | 206 | ||
214 | delta = clocksource_delta(csnow, cs->cs_last, cs->mask); | 207 | delta = clocksource_delta(csnow, cs->cs_last, cs->mask); |
215 | cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); | 208 | cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); |
209 | wdlast = cs->wd_last; /* save these in case we print them */ | ||
210 | cslast = cs->cs_last; | ||
216 | cs->cs_last = csnow; | 211 | cs->cs_last = csnow; |
217 | cs->wd_last = wdnow; | 212 | cs->wd_last = wdnow; |
218 | 213 | ||
@@ -221,7 +216,12 @@ static void clocksource_watchdog(unsigned long data) | |||
221 | 216 | ||
222 | /* Check the deviation from the watchdog clocksource. */ | 217 | /* Check the deviation from the watchdog clocksource. */ |
223 | if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { | 218 | if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { |
224 | clocksource_unstable(cs, cs_nsec - wd_nsec); | 219 | pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name); |
220 | pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", | ||
221 | watchdog->name, wdnow, wdlast, watchdog->mask); | ||
222 | pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n", | ||
223 | cs->name, csnow, cslast, cs->mask); | ||
224 | __clocksource_unstable(cs); | ||
225 | continue; | 225 | continue; |
226 | } | 226 | } |
227 | 227 | ||
@@ -469,26 +469,25 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) | |||
469 | * @shift: cycle to nanosecond divisor (power of two) | 469 | * @shift: cycle to nanosecond divisor (power of two) |
470 | * @maxadj: maximum adjustment value to mult (~11%) | 470 | * @maxadj: maximum adjustment value to mult (~11%) |
471 | * @mask: bitmask for two's complement subtraction of non 64 bit counters | 471 | * @mask: bitmask for two's complement subtraction of non 64 bit counters |
472 | * @max_cyc: maximum cycle value before potential overflow (does not include | ||
473 | * any safety margin) | ||
474 | * | ||
475 | * NOTE: This function includes a safety margin of 50%, in other words, we | ||
476 | * return half the number of nanoseconds the hardware counter can technically | ||
477 | * cover. This is done so that we can potentially detect problems caused by | ||
478 | * delayed timers or bad hardware, which might result in time intervals that | ||
479 | * are larger than what the math used can handle without overflows. | ||
472 | */ | 480 | */ |
473 | u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) | 481 | u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc) |
474 | { | 482 | { |
475 | u64 max_nsecs, max_cycles; | 483 | u64 max_nsecs, max_cycles; |
476 | 484 | ||
477 | /* | 485 | /* |
478 | * Calculate the maximum number of cycles that we can pass to the | 486 | * Calculate the maximum number of cycles that we can pass to the |
479 | * cyc2ns function without overflowing a 64-bit signed result. The | 487 | * cyc2ns() function without overflowing a 64-bit result. |
480 | * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj) | ||
481 | * which is equivalent to the below. | ||
482 | * max_cycles < (2^63)/(mult + maxadj) | ||
483 | * max_cycles < 2^(log2((2^63)/(mult + maxadj))) | ||
484 | * max_cycles < 2^(log2(2^63) - log2(mult + maxadj)) | ||
485 | * max_cycles < 2^(63 - log2(mult + maxadj)) | ||
486 | * max_cycles < 1 << (63 - log2(mult + maxadj)) | ||
487 | * Please note that we add 1 to the result of the log2 to account for | ||
488 | * any rounding errors, ensure the above inequality is satisfied and | ||
489 | * no overflow will occur. | ||
490 | */ | 488 | */ |
491 | max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1)); | 489 | max_cycles = ULLONG_MAX; |
490 | do_div(max_cycles, mult+maxadj); | ||
492 | 491 | ||
493 | /* | 492 | /* |
494 | * The actual maximum number of cycles we can defer the clocksource is | 493 | * The actual maximum number of cycles we can defer the clocksource is |
@@ -499,27 +498,26 @@ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) | |||
499 | max_cycles = min(max_cycles, mask); | 498 | max_cycles = min(max_cycles, mask); |
500 | max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); | 499 | max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); |
501 | 500 | ||
501 | /* return the max_cycles value as well if requested */ | ||
502 | if (max_cyc) | ||
503 | *max_cyc = max_cycles; | ||
504 | |||
505 | /* Return 50% of the actual maximum, so we can detect bad values */ | ||
506 | max_nsecs >>= 1; | ||
507 | |||
502 | return max_nsecs; | 508 | return max_nsecs; |
503 | } | 509 | } |
504 | 510 | ||
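With the old power-of-two approximation gone, the rewritten clocks_calc_max_nsecs() computes the cycle limit exactly as ULLONG_MAX / (mult + maxadj), clamps it to the counter mask, converts it with the most pessimistic NTP-adjusted multiplier (mult - maxadj), optionally reports the cycle limit through *max_cyc, and finally returns only half of the result as the 50% safety margin described in the new kernel-doc. A stand-alone sketch of that arithmetic (user-space C; the example mult/shift/maxadj values are made up for illustration):

#include <stdint.h>
#include <stdio.h>

/* Same formula as the kernel's clocksource_cyc2ns(): ns = (cycles * mult) >> shift */
static uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
        return (cycles * mult) >> shift;
}

static uint64_t max_nsecs(uint32_t mult, uint32_t shift, uint32_t maxadj,
                          uint64_t mask, uint64_t *max_cyc)
{
        /* largest cycle count that cannot overflow 64 bits, even fully NTP-adjusted */
        uint64_t max_cycles = UINT64_MAX / (mult + maxadj);

        /* the counter itself cannot run past its mask */
        if (max_cycles > mask)
                max_cycles = mask;
        if (max_cyc)
                *max_cyc = max_cycles;

        /* convert with the slowest adjusted rate, then keep 50% headroom */
        return cyc2ns(max_cycles, mult - maxadj, shift) >> 1;
}

int main(void)
{
        uint64_t cyc;
        /* made-up example: ~24 MHz counter, shift 24, ~11% maxadj, 56-bit mask */
        uint64_t ns = max_nsecs(699050667u, 24, 76895573u, (1ULL << 56) - 1, &cyc);

        printf("max_cycles=%llu max_nsecs=%llu\n",
               (unsigned long long)cyc, (unsigned long long)ns);
        return 0;
}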
505 | /** | 511 | /** |
506 | * clocksource_max_deferment - Returns max time the clocksource can be deferred | 512 | * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles |
507 | * @cs: Pointer to clocksource | 513 | * @cs: Pointer to clocksource to be updated |
508 | * | 514 | * |
509 | */ | 515 | */ |
510 | static u64 clocksource_max_deferment(struct clocksource *cs) | 516 | static inline void clocksource_update_max_deferment(struct clocksource *cs) |
511 | { | 517 | { |
512 | u64 max_nsecs; | 518 | cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift, |
513 | 519 | cs->maxadj, cs->mask, | |
514 | max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj, | 520 | &cs->max_cycles); |
515 | cs->mask); | ||
516 | /* | ||
517 | * To ensure that the clocksource does not wrap whilst we are idle, | ||
518 | * limit the time the clocksource can be deferred by 12.5%. Please | ||
519 | * note a margin of 12.5% is used because this can be computed with | ||
520 | * a shift, versus say 10% which would require division. | ||
521 | */ | ||
522 | return max_nsecs - (max_nsecs >> 3); | ||
523 | } | 521 | } |
524 | 522 | ||
525 | #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET | 523 | #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET |
@@ -648,7 +646,7 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
648 | } | 646 | } |
649 | 647 | ||
650 | /** | 648 | /** |
651 | * __clocksource_updatefreq_scale - Used to update clocksource with new freq | 649 | * __clocksource_update_freq_scale - Used to update clocksource with new freq |
652 | * @cs: clocksource to be registered | 650 | * @cs: clocksource to be registered |
653 | * @scale: Scale factor multiplied against freq to get clocksource hz | 651 | * @scale: Scale factor multiplied against freq to get clocksource hz |
654 | * @freq: clocksource frequency (cycles per second) divided by scale | 652 | * @freq: clocksource frequency (cycles per second) divided by scale |
@@ -656,48 +654,64 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
656 | * This should only be called from the clocksource->enable() method. | 654 | * This should only be called from the clocksource->enable() method. |
657 | * | 655 | * |
658 | * This *SHOULD NOT* be called directly! Please use the | 656 | * This *SHOULD NOT* be called directly! Please use the |
659 | * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions. | 657 | * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper |
658 | * functions. | ||
660 | */ | 659 | */ |
661 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) | 660 | void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq) |
662 | { | 661 | { |
663 | u64 sec; | 662 | u64 sec; |
663 | |||
664 | /* | 664 | /* |
665 | * Calc the maximum number of seconds which we can run before | 665 | * Default clocksources are *special* and self-define their mult/shift. |
666 | * wrapping around. For clocksources which have a mask > 32bit | 666 | * But, you're not special, so you should specify a freq value. |
667 | * we need to limit the max sleep time to have a good | ||
668 | * conversion precision. 10 minutes is still a reasonable | ||
669 | * amount. That results in a shift value of 24 for a | ||
670 | * clocksource with mask >= 40bit and f >= 4GHz. That maps to | ||
671 | * ~ 0.06ppm granularity for NTP. We apply the same 12.5% | ||
672 | * margin as we do in clocksource_max_deferment() | ||
673 | */ | 667 | */ |
674 | sec = (cs->mask - (cs->mask >> 3)); | 668 | if (freq) { |
675 | do_div(sec, freq); | 669 | /* |
676 | do_div(sec, scale); | 670 | * Calc the maximum number of seconds which we can run before |
677 | if (!sec) | 671 | * wrapping around. For clocksources which have a mask > 32-bit |
678 | sec = 1; | 672 | * we need to limit the max sleep time to have a good |
679 | else if (sec > 600 && cs->mask > UINT_MAX) | 673 | * conversion precision. 10 minutes is still a reasonable |
680 | sec = 600; | 674 | * amount. That results in a shift value of 24 for a |
681 | 675 | * clocksource with mask >= 40-bit and f >= 4GHz. That maps to | |
682 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, | 676 | * ~ 0.06ppm granularity for NTP. |
683 | NSEC_PER_SEC / scale, sec * scale); | 677 | */ |
684 | 678 | sec = cs->mask; | |
679 | do_div(sec, freq); | ||
680 | do_div(sec, scale); | ||
681 | if (!sec) | ||
682 | sec = 1; | ||
683 | else if (sec > 600 && cs->mask > UINT_MAX) | ||
684 | sec = 600; | ||
685 | |||
686 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, | ||
687 | NSEC_PER_SEC / scale, sec * scale); | ||
688 | } | ||
685 | /* | 689 | /* |
686 | * for clocksources that have large mults, to avoid overflow. | 690 | * Ensure clocksources that have large 'mult' values don't overflow |
687 | * Since mult may be adjusted by ntp, add an safety extra margin | 691 | * when adjusted. |
688 | * | ||
689 | */ | 692 | */ |
690 | cs->maxadj = clocksource_max_adjustment(cs); | 693 | cs->maxadj = clocksource_max_adjustment(cs); |
691 | while ((cs->mult + cs->maxadj < cs->mult) | 694 | while (freq && ((cs->mult + cs->maxadj < cs->mult) |
692 | || (cs->mult - cs->maxadj > cs->mult)) { | 695 | || (cs->mult - cs->maxadj > cs->mult))) { |
693 | cs->mult >>= 1; | 696 | cs->mult >>= 1; |
694 | cs->shift--; | 697 | cs->shift--; |
695 | cs->maxadj = clocksource_max_adjustment(cs); | 698 | cs->maxadj = clocksource_max_adjustment(cs); |
696 | } | 699 | } |
697 | 700 | ||
698 | cs->max_idle_ns = clocksource_max_deferment(cs); | 701 | /* |
702 | * Only warn for *special* clocksources that self-define | ||
703 | * their mult/shift values and don't specify a freq. | ||
704 | */ | ||
705 | WARN_ONCE(cs->mult + cs->maxadj < cs->mult, | ||
706 | "timekeeping: Clocksource %s might overflow on 11%% adjustment\n", | ||
707 | cs->name); | ||
708 | |||
709 | clocksource_update_max_deferment(cs); | ||
710 | |||
711 | pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n", | ||
712 | cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns); | ||
699 | } | 713 | } |
700 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); | 714 | EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale); |
701 | 715 | ||
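When a frequency is supplied, __clocksource_update_freq_scale() picks mult/shift so that ns = (cycles * mult) >> shift, capping the conversion range at 600 seconds for wide counters to keep the shift (and hence NTP granularity) reasonable, and then halves mult until an ~11% adjustment can no longer overflow. The mult/shift selection itself is done by clocks_calc_mult_shift(); the sketch below is a simplified user-space rendition of that helper (rounding details may differ) showing how the maxsec cap limits the shift that can be chosen:

#include <stdint.h>
#include <stdio.h>

/*
 * Pick mult/shift so that ns = (cycles * mult) >> shift approximates
 * cycles * (to / from), while 'maxsec' seconds worth of 'from' units,
 * scaled by mult, still fit in 64 bits.
 */
static void calc_mult_shift(uint32_t *mult, uint32_t *shift,
                            uint32_t from, uint32_t to, uint32_t maxsec)
{
        uint64_t tmp;
        uint32_t sft, sftacc = 32;

        /* how many bits the accumulated product may use over 'maxsec' seconds */
        tmp = ((uint64_t)maxsec * from) >> 32;
        while (tmp) {
                tmp >>= 1;
                sftacc--;
        }

        /* largest shift whose rounded mult still fits in 'sftacc' bits */
        for (sft = 32; sft > 0; sft--) {
                tmp = (uint64_t)to << sft;
                tmp += from / 2;
                tmp /= from;
                if ((tmp >> sftacc) == 0)
                        break;
        }
        *mult = (uint32_t)tmp;
        *shift = sft;
}

int main(void)
{
        uint32_t mult, shift;

        /* e.g. a 24 MHz clocksource, conversion valid for up to 600 seconds */
        calc_mult_shift(&mult, &shift, 24000000, 1000000000, 600);
        printf("mult=%u shift=%u -> 1 cycle ~= %llu ns\n", mult, shift,
               (unsigned long long)(((uint64_t)1 * mult) >> shift));
        return 0;
}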
702 | /** | 716 | /** |
703 | * __clocksource_register_scale - Used to install new clocksources | 717 | * __clocksource_register_scale - Used to install new clocksources |
@@ -714,7 +728,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | |||
714 | { | 728 | { |
715 | 729 | ||
716 | /* Initialize mult/shift and max_idle_ns */ | 730 | /* Initialize mult/shift and max_idle_ns */ |
717 | __clocksource_updatefreq_scale(cs, scale, freq); | 731 | __clocksource_update_freq_scale(cs, scale, freq); |
718 | 732 | ||
719 | /* Add clocksource to the clocksource list */ | 733 | /* Add clocksource to the clocksource list */ |
720 | mutex_lock(&clocksource_mutex); | 734 | mutex_lock(&clocksource_mutex); |
@@ -726,33 +740,6 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | |||
726 | } | 740 | } |
727 | EXPORT_SYMBOL_GPL(__clocksource_register_scale); | 741 | EXPORT_SYMBOL_GPL(__clocksource_register_scale); |
728 | 742 | ||
729 | |||
730 | /** | ||
731 | * clocksource_register - Used to install new clocksources | ||
732 | * @cs: clocksource to be registered | ||
733 | * | ||
734 | * Returns -EBUSY if registration fails, zero otherwise. | ||
735 | */ | ||
736 | int clocksource_register(struct clocksource *cs) | ||
737 | { | ||
738 | /* calculate max adjustment for given mult/shift */ | ||
739 | cs->maxadj = clocksource_max_adjustment(cs); | ||
740 | WARN_ONCE(cs->mult + cs->maxadj < cs->mult, | ||
741 | "Clocksource %s might overflow on 11%% adjustment\n", | ||
742 | cs->name); | ||
743 | |||
744 | /* calculate max idle time permitted for this clocksource */ | ||
745 | cs->max_idle_ns = clocksource_max_deferment(cs); | ||
746 | |||
747 | mutex_lock(&clocksource_mutex); | ||
748 | clocksource_enqueue(cs); | ||
749 | clocksource_enqueue_watchdog(cs); | ||
750 | clocksource_select(); | ||
751 | mutex_unlock(&clocksource_mutex); | ||
752 | return 0; | ||
753 | } | ||
754 | EXPORT_SYMBOL(clocksource_register); | ||
755 | |||
756 | static void __clocksource_change_rating(struct clocksource *cs, int rating) | 743 | static void __clocksource_change_rating(struct clocksource *cs, int rating) |
757 | { | 744 | { |
758 | list_del(&cs->list); | 745 | list_del(&cs->list); |
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index bee0c1f78091..76d4bd962b19 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c | |||
@@ -54,7 +54,7 @@ | |||
54 | 54 | ||
55 | #include <trace/events/timer.h> | 55 | #include <trace/events/timer.h> |
56 | 56 | ||
57 | #include "timekeeping.h" | 57 | #include "tick-internal.h" |
58 | 58 | ||
59 | /* | 59 | /* |
60 | * The timer bases: | 60 | * The timer bases: |
@@ -1707,17 +1707,10 @@ static int hrtimer_cpu_notify(struct notifier_block *self, | |||
1707 | break; | 1707 | break; |
1708 | 1708 | ||
1709 | #ifdef CONFIG_HOTPLUG_CPU | 1709 | #ifdef CONFIG_HOTPLUG_CPU |
1710 | case CPU_DYING: | ||
1711 | case CPU_DYING_FROZEN: | ||
1712 | clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu); | ||
1713 | break; | ||
1714 | case CPU_DEAD: | 1710 | case CPU_DEAD: |
1715 | case CPU_DEAD_FROZEN: | 1711 | case CPU_DEAD_FROZEN: |
1716 | { | ||
1717 | clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); | ||
1718 | migrate_hrtimers(scpu); | 1712 | migrate_hrtimers(scpu); |
1719 | break; | 1713 | break; |
1720 | } | ||
1721 | #endif | 1714 | #endif |
1722 | 1715 | ||
1723 | default: | 1716 | default: |
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a6a5bf53e86d..347fecf86a3f 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c | |||
@@ -25,7 +25,7 @@ | |||
25 | #include <linux/module.h> | 25 | #include <linux/module.h> |
26 | #include <linux/init.h> | 26 | #include <linux/init.h> |
27 | 27 | ||
28 | #include "tick-internal.h" | 28 | #include "timekeeping.h" |
29 | 29 | ||
30 | /* The Jiffies based clocksource is the lowest common | 30 | /* The Jiffies based clocksource is the lowest common |
31 | * denominator clock source which should function on | 31 | * denominator clock source which should function on |
@@ -71,6 +71,7 @@ static struct clocksource clocksource_jiffies = { | |||
71 | .mask = 0xffffffff, /*32bits*/ | 71 | .mask = 0xffffffff, /*32bits*/ |
72 | .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ | 72 | .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ |
73 | .shift = JIFFIES_SHIFT, | 73 | .shift = JIFFIES_SHIFT, |
74 | .max_cycles = 10, | ||
74 | }; | 75 | }; |
75 | 76 | ||
76 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); | 77 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); |
@@ -94,7 +95,7 @@ EXPORT_SYMBOL(jiffies); | |||
94 | 95 | ||
95 | static int __init init_jiffies_clocksource(void) | 96 | static int __init init_jiffies_clocksource(void) |
96 | { | 97 | { |
97 | return clocksource_register(&clocksource_jiffies); | 98 | return __clocksource_register(&clocksource_jiffies); |
98 | } | 99 | } |
99 | 100 | ||
100 | core_initcall(init_jiffies_clocksource); | 101 | core_initcall(init_jiffies_clocksource); |
@@ -130,6 +131,6 @@ int register_refined_jiffies(long cycles_per_second) | |||
130 | 131 | ||
131 | refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; | 132 | refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; |
132 | 133 | ||
133 | clocksource_register(&refined_jiffies); | 134 | __clocksource_register(&refined_jiffies); |
134 | return 0; | 135 | return 0; |
135 | } | 136 | } |
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 0f60b08a4f07..7a681003001c 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -17,7 +17,6 @@ | |||
17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
18 | #include <linux/rtc.h> | 18 | #include <linux/rtc.h> |
19 | 19 | ||
20 | #include "tick-internal.h" | ||
21 | #include "ntp_internal.h" | 20 | #include "ntp_internal.h" |
22 | 21 | ||
23 | /* | 22 | /* |
@@ -459,6 +458,16 @@ out: | |||
459 | return leap; | 458 | return leap; |
460 | } | 459 | } |
461 | 460 | ||
461 | #ifdef CONFIG_GENERIC_CMOS_UPDATE | ||
462 | int __weak update_persistent_clock64(struct timespec64 now64) | ||
463 | { | ||
464 | struct timespec now; | ||
465 | |||
466 | now = timespec64_to_timespec(now64); | ||
467 | return update_persistent_clock(now); | ||
468 | } | ||
469 | #endif | ||
470 | |||
462 | #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) | 471 | #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) |
463 | static void sync_cmos_clock(struct work_struct *work); | 472 | static void sync_cmos_clock(struct work_struct *work); |
464 | 473 | ||
@@ -494,8 +503,9 @@ static void sync_cmos_clock(struct work_struct *work) | |||
494 | if (persistent_clock_is_local) | 503 | if (persistent_clock_is_local) |
495 | adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); | 504 | adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); |
496 | #ifdef CONFIG_GENERIC_CMOS_UPDATE | 505 | #ifdef CONFIG_GENERIC_CMOS_UPDATE |
497 | fail = update_persistent_clock(timespec64_to_timespec(adjust)); | 506 | fail = update_persistent_clock64(adjust); |
498 | #endif | 507 | #endif |
508 | |||
499 | #ifdef CONFIG_RTC_SYSTOHC | 509 | #ifdef CONFIG_RTC_SYSTOHC |
500 | if (fail == -ENODEV) | 510 | if (fail == -ENODEV) |
501 | fail = rtc_set_ntp_time(adjust); | 511 | fail = rtc_set_ntp_time(adjust); |
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 01d2d15aa662..a26036d37a38 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c | |||
@@ -1,5 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | * sched_clock.c: support for extending counters to full 64-bit ns counter | 2 | * sched_clock.c: Generic sched_clock() support, to extend low level |
3 | * hardware time counters to full 64-bit ns values. | ||
3 | * | 4 | * |
4 | * This program is free software; you can redistribute it and/or modify | 5 | * This program is free software; you can redistribute it and/or modify |
5 | * it under the terms of the GNU General Public License version 2 as | 6 | * it under the terms of the GNU General Public License version 2 as |
@@ -18,15 +19,53 @@ | |||
18 | #include <linux/seqlock.h> | 19 | #include <linux/seqlock.h> |
19 | #include <linux/bitops.h> | 20 | #include <linux/bitops.h> |
20 | 21 | ||
21 | struct clock_data { | 22 | /** |
22 | ktime_t wrap_kt; | 23 | * struct clock_read_data - data required to read from sched_clock() |
24 | * | ||
25 | * @epoch_ns: sched_clock() value at last update | ||
26 | * @epoch_cyc: Clock cycle value at last update. | ||
27 | * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit | ||
28 | * clocks. | ||
29 | * @read_sched_clock: Current clock source (or dummy source when suspended). | ||
30 | * @mult: Multiplier for scaled math conversion. | ||
31 | * @shift: Shift value for scaled math conversion. | ||
32 | * | ||
33 | * Care must be taken when updating this structure; it is read by | ||
34 | * some very hot code paths. It occupies <=40 bytes and, when combined | ||
35 | * with the seqcount used to synchronize access, comfortably fits into | ||
36 | * a 64 byte cache line. | ||
37 | */ | ||
38 | struct clock_read_data { | ||
23 | u64 epoch_ns; | 39 | u64 epoch_ns; |
24 | u64 epoch_cyc; | 40 | u64 epoch_cyc; |
25 | seqcount_t seq; | 41 | u64 sched_clock_mask; |
26 | unsigned long rate; | 42 | u64 (*read_sched_clock)(void); |
27 | u32 mult; | 43 | u32 mult; |
28 | u32 shift; | 44 | u32 shift; |
29 | bool suspended; | 45 | }; |
46 | |||
47 | /** | ||
48 | * struct clock_data - all data needed for sched_clock() (including | ||
49 | * registration of a new clock source) | ||
50 | * | ||
51 | * @seq: Sequence counter for protecting updates. The lowest | ||
52 | * bit is the index for @read_data. | ||
53 | * @read_data: Data required to read from sched_clock. | ||
54 | * @wrap_kt: Duration for which clock can run before wrapping. | ||
55 | * @rate: Tick rate of the registered clock. | ||
56 | * @actual_read_sched_clock: Registered hardware level clock read function. | ||
57 | * | ||
58 | * The ordering of this structure has been chosen to optimize cache | ||
59 | * performance. In particular 'seq' and 'read_data[0]' (combined) should fit | ||
60 | * into a single 64-byte cache line. | ||
61 | */ | ||
62 | struct clock_data { | ||
63 | seqcount_t seq; | ||
64 | struct clock_read_data read_data[2]; | ||
65 | ktime_t wrap_kt; | ||
66 | unsigned long rate; | ||
67 | |||
68 | u64 (*actual_read_sched_clock)(void); | ||
30 | }; | 69 | }; |
31 | 70 | ||
32 | static struct hrtimer sched_clock_timer; | 71 | static struct hrtimer sched_clock_timer; |
@@ -34,12 +73,6 @@ static int irqtime = -1; | |||
34 | 73 | ||
35 | core_param(irqtime, irqtime, int, 0400); | 74 | core_param(irqtime, irqtime, int, 0400); |
36 | 75 | ||
37 | static struct clock_data cd = { | ||
38 | .mult = NSEC_PER_SEC / HZ, | ||
39 | }; | ||
40 | |||
41 | static u64 __read_mostly sched_clock_mask; | ||
42 | |||
43 | static u64 notrace jiffy_sched_clock_read(void) | 76 | static u64 notrace jiffy_sched_clock_read(void) |
44 | { | 77 | { |
45 | /* | 78 | /* |
@@ -49,7 +82,11 @@ static u64 notrace jiffy_sched_clock_read(void) | |||
49 | return (u64)(jiffies - INITIAL_JIFFIES); | 82 | return (u64)(jiffies - INITIAL_JIFFIES); |
50 | } | 83 | } |
51 | 84 | ||
52 | static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; | 85 | static struct clock_data cd ____cacheline_aligned = { |
86 | .read_data[0] = { .mult = NSEC_PER_SEC / HZ, | ||
87 | .read_sched_clock = jiffy_sched_clock_read, }, | ||
88 | .actual_read_sched_clock = jiffy_sched_clock_read, | ||
89 | }; | ||
53 | 90 | ||
54 | static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) | 91 | static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) |
55 | { | 92 | { |
@@ -58,111 +95,136 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) | |||
58 | 95 | ||
59 | unsigned long long notrace sched_clock(void) | 96 | unsigned long long notrace sched_clock(void) |
60 | { | 97 | { |
61 | u64 epoch_ns; | 98 | u64 cyc, res; |
62 | u64 epoch_cyc; | ||
63 | u64 cyc; | ||
64 | unsigned long seq; | 99 | unsigned long seq; |
65 | 100 | struct clock_read_data *rd; | |
66 | if (cd.suspended) | ||
67 | return cd.epoch_ns; | ||
68 | 101 | ||
69 | do { | 102 | do { |
70 | seq = raw_read_seqcount_begin(&cd.seq); | 103 | seq = raw_read_seqcount(&cd.seq); |
71 | epoch_cyc = cd.epoch_cyc; | 104 | rd = cd.read_data + (seq & 1); |
72 | epoch_ns = cd.epoch_ns; | 105 | |
106 | cyc = (rd->read_sched_clock() - rd->epoch_cyc) & | ||
107 | rd->sched_clock_mask; | ||
108 | res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift); | ||
73 | } while (read_seqcount_retry(&cd.seq, seq)); | 109 | } while (read_seqcount_retry(&cd.seq, seq)); |
74 | 110 | ||
75 | cyc = read_sched_clock(); | 111 | return res; |
76 | cyc = (cyc - epoch_cyc) & sched_clock_mask; | 112 | } |
77 | return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift); | 113 | |
114 | /* | ||
115 | * Updating the data required to read the clock. | ||
116 | * | ||
117 | * sched_clock() will never observe mis-matched data even if called from | ||
118 | * an NMI. We do this by maintaining an odd/even copy of the data and | ||
119 | * steering sched_clock() to one or the other using a sequence counter. | ||
120 | * In order to preserve the data cache profile of sched_clock() as much | ||
121 | * as possible the system reverts back to the even copy when the update | ||
122 | * completes; the odd copy is used *only* during an update. | ||
123 | */ | ||
124 | static void update_clock_read_data(struct clock_read_data *rd) | ||
125 | { | ||
126 | /* update the backup (odd) copy with the new data */ | ||
127 | cd.read_data[1] = *rd; | ||
128 | |||
129 | /* steer readers towards the odd copy */ | ||
130 | raw_write_seqcount_latch(&cd.seq); | ||
131 | |||
132 | /* now it's safe for us to update the normal (even) copy */ | ||
133 | cd.read_data[0] = *rd; | ||
134 | |||
135 | /* switch readers back to the even copy */ | ||
136 | raw_write_seqcount_latch(&cd.seq); | ||
78 | } | 137 | } |
79 | 138 | ||
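The comment and helper above implement the classic seqcount-latch scheme: readers index read_data[seq & 1], and the writer bumps the sequence once to steer readers onto the odd copy, updates the even copy, then bumps it again, so an interrupted update never exposes a half-written snapshot, even to NMI context. A user-space model of the pattern is sketched below; it only shows the shape of the algorithm and glosses over the memory-ordering details that raw_write_seqcount_latch()/raw_read_seqcount() handle in the kernel:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct snapshot {
        uint64_t epoch_ns;
        uint64_t epoch_cyc;
};

static _Atomic unsigned int seq;        /* lowest bit selects the copy readers use  */
static struct snapshot data[2];         /* [0] = even/normal copy, [1] = odd/backup */

/* Single writer: publish a new snapshot without ever leaving readers torn. */
static void publish(const struct snapshot *val)
{
        data[1] = *val;                 /* 1) update the backup (odd) copy           */
        atomic_fetch_add(&seq, 1);      /* 2) steer readers towards the odd copy     */
        data[0] = *val;                 /* 3) now the even copy can be updated       */
        atomic_fetch_add(&seq, 1);      /* 4) switch readers back to the even copy   */
}

/* Reader: safe even if it interrupts publish(), e.g. from signal/NMI-like context. */
static struct snapshot read_snapshot(void)
{
        struct snapshot snap;
        unsigned int s;

        do {
                s = atomic_load(&seq);
                snap = data[s & 1];
        } while (atomic_load(&seq) != s);

        return snap;
}

int main(void)
{
        struct snapshot s = { .epoch_ns = 1000, .epoch_cyc = 42 };

        publish(&s);
        s = read_snapshot();
        printf("%llu %llu\n", (unsigned long long)s.epoch_ns,
               (unsigned long long)s.epoch_cyc);
        return 0;
}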
80 | /* | 139 | /* |
81 | * Atomically update the sched_clock epoch. | 140 | * Atomically update the sched_clock() epoch. |
82 | */ | 141 | */ |
83 | static void notrace update_sched_clock(void) | 142 | static void update_sched_clock(void) |
84 | { | 143 | { |
85 | unsigned long flags; | ||
86 | u64 cyc; | 144 | u64 cyc; |
87 | u64 ns; | 145 | u64 ns; |
146 | struct clock_read_data rd; | ||
147 | |||
148 | rd = cd.read_data[0]; | ||
149 | |||
150 | cyc = cd.actual_read_sched_clock(); | ||
151 | ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); | ||
152 | |||
153 | rd.epoch_ns = ns; | ||
154 | rd.epoch_cyc = cyc; | ||
88 | 155 | ||
89 | cyc = read_sched_clock(); | 156 | update_clock_read_data(&rd); |
90 | ns = cd.epoch_ns + | ||
91 | cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, | ||
92 | cd.mult, cd.shift); | ||
93 | |||
94 | raw_local_irq_save(flags); | ||
95 | raw_write_seqcount_begin(&cd.seq); | ||
96 | cd.epoch_ns = ns; | ||
97 | cd.epoch_cyc = cyc; | ||
98 | raw_write_seqcount_end(&cd.seq); | ||
99 | raw_local_irq_restore(flags); | ||
100 | } | 157 | } |
101 | 158 | ||
102 | static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) | 159 | static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) |
103 | { | 160 | { |
104 | update_sched_clock(); | 161 | update_sched_clock(); |
105 | hrtimer_forward_now(hrt, cd.wrap_kt); | 162 | hrtimer_forward_now(hrt, cd.wrap_kt); |
163 | |||
106 | return HRTIMER_RESTART; | 164 | return HRTIMER_RESTART; |
107 | } | 165 | } |
108 | 166 | ||
109 | void __init sched_clock_register(u64 (*read)(void), int bits, | 167 | void __init |
110 | unsigned long rate) | 168 | sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) |
111 | { | 169 | { |
112 | u64 res, wrap, new_mask, new_epoch, cyc, ns; | 170 | u64 res, wrap, new_mask, new_epoch, cyc, ns; |
113 | u32 new_mult, new_shift; | 171 | u32 new_mult, new_shift; |
114 | ktime_t new_wrap_kt; | ||
115 | unsigned long r; | 172 | unsigned long r; |
116 | char r_unit; | 173 | char r_unit; |
174 | struct clock_read_data rd; | ||
117 | 175 | ||
118 | if (cd.rate > rate) | 176 | if (cd.rate > rate) |
119 | return; | 177 | return; |
120 | 178 | ||
121 | WARN_ON(!irqs_disabled()); | 179 | WARN_ON(!irqs_disabled()); |
122 | 180 | ||
123 | /* calculate the mult/shift to convert counter ticks to ns. */ | 181 | /* Calculate the mult/shift to convert counter ticks to ns. */ |
124 | clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); | 182 | clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); |
125 | 183 | ||
126 | new_mask = CLOCKSOURCE_MASK(bits); | 184 | new_mask = CLOCKSOURCE_MASK(bits); |
185 | cd.rate = rate; | ||
186 | |||
187 | /* Calculate how many nanosecs until we risk wrapping */ | ||
188 | wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL); | ||
189 | cd.wrap_kt = ns_to_ktime(wrap); | ||
127 | 190 | ||
128 | /* calculate how many ns until we wrap */ | 191 | rd = cd.read_data[0]; |
129 | wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); | ||
130 | new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); | ||
131 | 192 | ||
132 | /* update epoch for new counter and update epoch_ns from old counter*/ | 193 | /* Update epoch for new counter and update 'epoch_ns' from old counter*/ |
133 | new_epoch = read(); | 194 | new_epoch = read(); |
134 | cyc = read_sched_clock(); | 195 | cyc = cd.actual_read_sched_clock(); |
135 | ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, | 196 | ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); |
136 | cd.mult, cd.shift); | 197 | cd.actual_read_sched_clock = read; |
137 | 198 | ||
138 | raw_write_seqcount_begin(&cd.seq); | 199 | rd.read_sched_clock = read; |
139 | read_sched_clock = read; | 200 | rd.sched_clock_mask = new_mask; |
140 | sched_clock_mask = new_mask; | 201 | rd.mult = new_mult; |
141 | cd.rate = rate; | 202 | rd.shift = new_shift; |
142 | cd.wrap_kt = new_wrap_kt; | 203 | rd.epoch_cyc = new_epoch; |
143 | cd.mult = new_mult; | 204 | rd.epoch_ns = ns; |
144 | cd.shift = new_shift; | 205 | |
145 | cd.epoch_cyc = new_epoch; | 206 | update_clock_read_data(&rd); |
146 | cd.epoch_ns = ns; | ||
147 | raw_write_seqcount_end(&cd.seq); | ||
148 | 207 | ||
149 | r = rate; | 208 | r = rate; |
150 | if (r >= 4000000) { | 209 | if (r >= 4000000) { |
151 | r /= 1000000; | 210 | r /= 1000000; |
152 | r_unit = 'M'; | 211 | r_unit = 'M'; |
153 | } else if (r >= 1000) { | 212 | } else { |
154 | r /= 1000; | 213 | if (r >= 1000) { |
155 | r_unit = 'k'; | 214 | r /= 1000; |
156 | } else | 215 | r_unit = 'k'; |
157 | r_unit = ' '; | 216 | } else { |
158 | 217 | r_unit = ' '; | |
159 | /* calculate the ns resolution of this counter */ | 218 | } |
219 | } | ||
220 | |||
221 | /* Calculate the ns resolution of this counter */ | ||
160 | res = cyc_to_ns(1ULL, new_mult, new_shift); | 222 | res = cyc_to_ns(1ULL, new_mult, new_shift); |
161 | 223 | ||
162 | pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", | 224 | pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", |
163 | bits, r, r_unit, res, wrap); | 225 | bits, r, r_unit, res, wrap); |
164 | 226 | ||
165 | /* Enable IRQ time accounting if we have a fast enough sched_clock */ | 227 | /* Enable IRQ time accounting if we have a fast enough sched_clock() */ |
166 | if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) | 228 | if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) |
167 | enable_sched_clock_irqtime(); | 229 | enable_sched_clock_irqtime(); |
168 | 230 | ||
@@ -172,10 +234,10 @@ void __init sched_clock_register(u64 (*read)(void), int bits, | |||
172 | void __init sched_clock_postinit(void) | 234 | void __init sched_clock_postinit(void) |
173 | { | 235 | { |
174 | /* | 236 | /* |
175 | * If no sched_clock function has been provided at that point, | 237 | * If no sched_clock() function has been provided at that point, |
176 | * make it the final one. | 238 | * make it the final one. |
177 | */ | 239 | */ |
178 | if (read_sched_clock == jiffy_sched_clock_read) | 240 | if (cd.actual_read_sched_clock == jiffy_sched_clock_read) |
179 | sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); | 241 | sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); |
180 | 242 | ||
181 | update_sched_clock(); | 243 | update_sched_clock(); |
@@ -189,29 +251,53 @@ void __init sched_clock_postinit(void) | |||
189 | hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); | 251 | hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); |
190 | } | 252 | } |
191 | 253 | ||
254 | /* | ||
255 | * Clock read function for use when the clock is suspended. | ||
256 | * | ||
257 | * This function makes it appear to sched_clock() as if the clock | ||
258 | * stopped counting at its last update. | ||
259 | * | ||
260 | * This function must only be called from the critical | ||
261 | * section in sched_clock(). It relies on the read_seqcount_retry() | ||
262 | * at the end of the critical section to be sure we observe the | ||
263 | * correct copy of 'epoch_cyc'. | ||
264 | */ | ||
265 | static u64 notrace suspended_sched_clock_read(void) | ||
266 | { | ||
267 | unsigned long seq = raw_read_seqcount(&cd.seq); | ||
268 | |||
269 | return cd.read_data[seq & 1].epoch_cyc; | ||
270 | } | ||
271 | |||
192 | static int sched_clock_suspend(void) | 272 | static int sched_clock_suspend(void) |
193 | { | 273 | { |
274 | struct clock_read_data *rd = &cd.read_data[0]; | ||
275 | |||
194 | update_sched_clock(); | 276 | update_sched_clock(); |
195 | hrtimer_cancel(&sched_clock_timer); | 277 | hrtimer_cancel(&sched_clock_timer); |
196 | cd.suspended = true; | 278 | rd->read_sched_clock = suspended_sched_clock_read; |
279 | |||
197 | return 0; | 280 | return 0; |
198 | } | 281 | } |
199 | 282 | ||
200 | static void sched_clock_resume(void) | 283 | static void sched_clock_resume(void) |
201 | { | 284 | { |
202 | cd.epoch_cyc = read_sched_clock(); | 285 | struct clock_read_data *rd = &cd.read_data[0]; |
286 | |||
287 | rd->epoch_cyc = cd.actual_read_sched_clock(); | ||
203 | hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); | 288 | hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); |
204 | cd.suspended = false; | 289 | rd->read_sched_clock = cd.actual_read_sched_clock; |
205 | } | 290 | } |
206 | 291 | ||
207 | static struct syscore_ops sched_clock_ops = { | 292 | static struct syscore_ops sched_clock_ops = { |
208 | .suspend = sched_clock_suspend, | 293 | .suspend = sched_clock_suspend, |
209 | .resume = sched_clock_resume, | 294 | .resume = sched_clock_resume, |
210 | }; | 295 | }; |
211 | 296 | ||
212 | static int __init sched_clock_syscore_init(void) | 297 | static int __init sched_clock_syscore_init(void) |
213 | { | 298 | { |
214 | register_syscore_ops(&sched_clock_ops); | 299 | register_syscore_ops(&sched_clock_ops); |
300 | |||
215 | return 0; | 301 | return 0; |
216 | } | 302 | } |
217 | device_initcall(sched_clock_syscore_init); | 303 | device_initcall(sched_clock_syscore_init); |
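Suspend handling above no longer uses a 'suspended' flag: sched_clock_suspend() publishes one last epoch and then points read_sched_clock at suspended_sched_clock_read(), which returns that epoch's cycle value, so the computed delta is zero and sched_clock() appears frozen; sched_clock_resume() re-reads the hardware counter into epoch_cyc before restoring the real reader, so the time spent suspended is not counted. A toy model of that behaviour (all names here are illustrative, and one counter tick stands in for one nanosecond):

#include <stdint.h>
#include <stdio.h>

static uint64_t hw_cycles;                       /* pretend hardware counter        */
static uint64_t hw_read(void) { return hw_cycles; }

static uint64_t epoch_cyc, epoch_ns;             /* snapshot from the last update   */
static uint64_t (*read_cyc)(void) = hw_read;     /* active cycle reader             */

/* While "suspended", reads return the frozen epoch, so the delta is always zero. */
static uint64_t frozen_read(void) { return epoch_cyc; }

static uint64_t clock_ns(void)
{
        return epoch_ns + (read_cyc() - epoch_cyc);  /* 1 cycle == 1 ns in this model */
}

static void model_suspend(void)
{
        uint64_t cyc = hw_read();        /* one final update before freezing         */

        epoch_ns += cyc - epoch_cyc;
        epoch_cyc = cyc;
        read_cyc = frozen_read;
}

static void model_resume(void)
{
        epoch_cyc = hw_read();           /* discard the cycles spent suspended       */
        read_cyc = hw_read;
}

int main(void)
{
        hw_cycles = 100; printf("%llu\n", (unsigned long long)clock_ns()); /* 100       */
        model_suspend();
        hw_cycles = 500; printf("%llu\n", (unsigned long long)clock_ns()); /* still 100 */
        model_resume();
        hw_cycles = 530; printf("%llu\n", (unsigned long long)clock_ns()); /* 130       */
        return 0;
}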
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index eb682d5c697c..6aac4beedbbe 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c | |||
@@ -49,6 +49,7 @@ static void bc_set_mode(enum clock_event_mode mode, | |||
49 | */ | 49 | */ |
50 | static int bc_set_next(ktime_t expires, struct clock_event_device *bc) | 50 | static int bc_set_next(ktime_t expires, struct clock_event_device *bc) |
51 | { | 51 | { |
52 | int bc_moved; | ||
52 | /* | 53 | /* |
53 | * We try to cancel the timer first. If the callback is on | 54 | * We try to cancel the timer first. If the callback is on |
54 | * flight on some other cpu then we let it handle it. If we | 55 | * flight on some other cpu then we let it handle it. If we |
@@ -60,9 +61,15 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) | |||
60 | * restart the timer because we are in the callback, but we | 61 | * restart the timer because we are in the callback, but we |
61 | * can set the expiry time and let the callback return | 62 | * can set the expiry time and let the callback return |
62 | * HRTIMER_RESTART. | 63 | * HRTIMER_RESTART. |
64 | * | ||
65 | * Since we are in the idle loop at this point and because | ||
66 | * hrtimer_{start/cancel} functions call into tracing, | ||
67 | * calls to these functions must be bound within RCU_NONIDLE. | ||
63 | */ | 68 | */ |
64 | if (hrtimer_try_to_cancel(&bctimer) >= 0) { | 69 | RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ? |
65 | hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED); | 70 | !hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) : |
71 | 0); | ||
72 | if (bc_moved) { | ||
66 | /* Bind the "device" to the cpu */ | 73 | /* Bind the "device" to the cpu */ |
67 | bc->bound_on = smp_processor_id(); | 74 | bc->bound_on = smp_processor_id(); |
68 | } else if (bc->bound_on == smp_processor_id()) { | 75 | } else if (bc->bound_on == smp_processor_id()) { |
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 066f0ec05e48..7e8ca4f448a8 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -33,12 +33,14 @@ static cpumask_var_t tick_broadcast_mask; | |||
33 | static cpumask_var_t tick_broadcast_on; | 33 | static cpumask_var_t tick_broadcast_on; |
34 | static cpumask_var_t tmpmask; | 34 | static cpumask_var_t tmpmask; |
35 | static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); | 35 | static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); |
36 | static int tick_broadcast_force; | 36 | static int tick_broadcast_forced; |
37 | 37 | ||
38 | #ifdef CONFIG_TICK_ONESHOT | 38 | #ifdef CONFIG_TICK_ONESHOT |
39 | static void tick_broadcast_clear_oneshot(int cpu); | 39 | static void tick_broadcast_clear_oneshot(int cpu); |
40 | static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); | ||
40 | #else | 41 | #else |
41 | static inline void tick_broadcast_clear_oneshot(int cpu) { } | 42 | static inline void tick_broadcast_clear_oneshot(int cpu) { } |
43 | static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } | ||
42 | #endif | 44 | #endif |
43 | 45 | ||
44 | /* | 46 | /* |
@@ -303,7 +305,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) | |||
303 | /* | 305 | /* |
304 | * The device is in periodic mode. No reprogramming necessary: | 306 | * The device is in periodic mode. No reprogramming necessary: |
305 | */ | 307 | */ |
306 | if (dev->mode == CLOCK_EVT_MODE_PERIODIC) | 308 | if (dev->state == CLOCK_EVT_STATE_PERIODIC) |
307 | goto unlock; | 309 | goto unlock; |
308 | 310 | ||
309 | /* | 311 | /* |
@@ -324,49 +326,54 @@ unlock: | |||
324 | raw_spin_unlock(&tick_broadcast_lock); | 326 | raw_spin_unlock(&tick_broadcast_lock); |
325 | } | 327 | } |
326 | 328 | ||
327 | /* | 329 | /** |
328 | * Powerstate information: The system enters/leaves a state, where | 330 | * tick_broadcast_control - Enable/disable or force broadcast mode |
329 | * affected devices might stop | 331 | * @mode: The selected broadcast mode |
332 | * | ||
333 | * Called when the system enters a state where affected tick devices | ||
334 | * might stop. Note: TICK_BROADCAST_FORCE cannot be undone. | ||
335 | * | ||
336 | * Called with interrupts disabled, so clockevents_lock is not | ||
337 | * required here because the local clock event device cannot go away | ||
338 | * under us. | ||
330 | */ | 339 | */ |
331 | static void tick_do_broadcast_on_off(unsigned long *reason) | 340 | void tick_broadcast_control(enum tick_broadcast_mode mode) |
332 | { | 341 | { |
333 | struct clock_event_device *bc, *dev; | 342 | struct clock_event_device *bc, *dev; |
334 | struct tick_device *td; | 343 | struct tick_device *td; |
335 | unsigned long flags; | ||
336 | int cpu, bc_stopped; | 344 | int cpu, bc_stopped; |
337 | 345 | ||
338 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 346 | td = this_cpu_ptr(&tick_cpu_device); |
339 | |||
340 | cpu = smp_processor_id(); | ||
341 | td = &per_cpu(tick_cpu_device, cpu); | ||
342 | dev = td->evtdev; | 347 | dev = td->evtdev; |
343 | bc = tick_broadcast_device.evtdev; | ||
344 | 348 | ||
345 | /* | 349 | /* |
346 | * Is the device not affected by the powerstate ? | 350 | * Is the device not affected by the powerstate ? |
347 | */ | 351 | */ |
348 | if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP)) | 352 | if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP)) |
349 | goto out; | 353 | return; |
350 | 354 | ||
351 | if (!tick_device_is_functional(dev)) | 355 | if (!tick_device_is_functional(dev)) |
352 | goto out; | 356 | return; |
353 | 357 | ||
358 | raw_spin_lock(&tick_broadcast_lock); | ||
359 | cpu = smp_processor_id(); | ||
360 | bc = tick_broadcast_device.evtdev; | ||
354 | bc_stopped = cpumask_empty(tick_broadcast_mask); | 361 | bc_stopped = cpumask_empty(tick_broadcast_mask); |
355 | 362 | ||
356 | switch (*reason) { | 363 | switch (mode) { |
357 | case CLOCK_EVT_NOTIFY_BROADCAST_ON: | 364 | case TICK_BROADCAST_FORCE: |
358 | case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: | 365 | tick_broadcast_forced = 1; |
366 | case TICK_BROADCAST_ON: | ||
359 | cpumask_set_cpu(cpu, tick_broadcast_on); | 367 | cpumask_set_cpu(cpu, tick_broadcast_on); |
360 | if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { | 368 | if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { |
361 | if (tick_broadcast_device.mode == | 369 | if (tick_broadcast_device.mode == |
362 | TICKDEV_MODE_PERIODIC) | 370 | TICKDEV_MODE_PERIODIC) |
363 | clockevents_shutdown(dev); | 371 | clockevents_shutdown(dev); |
364 | } | 372 | } |
365 | if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) | ||
366 | tick_broadcast_force = 1; | ||
367 | break; | 373 | break; |
368 | case CLOCK_EVT_NOTIFY_BROADCAST_OFF: | 374 | |
369 | if (tick_broadcast_force) | 375 | case TICK_BROADCAST_OFF: |
376 | if (tick_broadcast_forced) | ||
370 | break; | 377 | break; |
371 | cpumask_clear_cpu(cpu, tick_broadcast_on); | 378 | cpumask_clear_cpu(cpu, tick_broadcast_on); |
372 | if (!tick_device_is_functional(dev)) | 379 | if (!tick_device_is_functional(dev)) |
@@ -388,22 +395,9 @@ static void tick_do_broadcast_on_off(unsigned long *reason) | |||
388 | else | 395 | else |
389 | tick_broadcast_setup_oneshot(bc); | 396 | tick_broadcast_setup_oneshot(bc); |
390 | } | 397 | } |
391 | out: | 398 | raw_spin_unlock(&tick_broadcast_lock); |
392 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | ||
393 | } | ||
394 | |||
395 | /* | ||
396 | * Powerstate information: The system enters/leaves a state, where | ||
397 | * affected devices might stop. | ||
398 | */ | ||
399 | void tick_broadcast_on_off(unsigned long reason, int *oncpu) | ||
400 | { | ||
401 | if (!cpumask_test_cpu(*oncpu, cpu_online_mask)) | ||
402 | printk(KERN_ERR "tick-broadcast: ignoring broadcast for " | ||
403 | "offline CPU #%d\n", *oncpu); | ||
404 | else | ||
405 | tick_do_broadcast_on_off(&reason); | ||
406 | } | 399 | } |
400 | EXPORT_SYMBOL_GPL(tick_broadcast_control); | ||
407 | 401 | ||
408 | /* | 402 | /* |
409 | * Set the periodic handler depending on broadcast on/off | 403 | * Set the periodic handler depending on broadcast on/off |
@@ -416,14 +410,14 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) | |||
416 | dev->event_handler = tick_handle_periodic_broadcast; | 410 | dev->event_handler = tick_handle_periodic_broadcast; |
417 | } | 411 | } |
418 | 412 | ||
413 | #ifdef CONFIG_HOTPLUG_CPU | ||
419 | /* | 414 | /* |
420 | * Remove a CPU from broadcasting | 415 | * Remove a CPU from broadcasting |
421 | */ | 416 | */ |
422 | void tick_shutdown_broadcast(unsigned int *cpup) | 417 | void tick_shutdown_broadcast(unsigned int cpu) |
423 | { | 418 | { |
424 | struct clock_event_device *bc; | 419 | struct clock_event_device *bc; |
425 | unsigned long flags; | 420 | unsigned long flags; |
426 | unsigned int cpu = *cpup; | ||
427 | 421 | ||
428 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 422 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); |
429 | 423 | ||
@@ -438,6 +432,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) | |||
438 | 432 | ||
439 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 433 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
440 | } | 434 | } |
435 | #endif | ||
441 | 436 | ||
442 | void tick_suspend_broadcast(void) | 437 | void tick_suspend_broadcast(void) |
443 | { | 438 | { |
@@ -453,38 +448,48 @@ void tick_suspend_broadcast(void) | |||
453 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 448 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
454 | } | 449 | } |
455 | 450 | ||
456 | int tick_resume_broadcast(void) | 451 | /* |
452 | * This is called from tick_resume_local() on a resuming CPU. That's | ||
453 | * called from the core resume function, tick_unfreeze() and the magic XEN | ||
454 | * resume hackery. | ||
455 | * | ||
456 | * In none of these cases the broadcast device mode can change and the | ||
457 | * bit of the resuming CPU in the broadcast mask is safe as well. | ||
458 | */ | ||
459 | bool tick_resume_check_broadcast(void) | ||
460 | { | ||
461 | if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) | ||
462 | return false; | ||
463 | else | ||
464 | return cpumask_test_cpu(smp_processor_id(), tick_broadcast_mask); | ||
465 | } | ||
466 | |||
467 | void tick_resume_broadcast(void) | ||
457 | { | 468 | { |
458 | struct clock_event_device *bc; | 469 | struct clock_event_device *bc; |
459 | unsigned long flags; | 470 | unsigned long flags; |
460 | int broadcast = 0; | ||
461 | 471 | ||
462 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 472 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); |
463 | 473 | ||
464 | bc = tick_broadcast_device.evtdev; | 474 | bc = tick_broadcast_device.evtdev; |
465 | 475 | ||
466 | if (bc) { | 476 | if (bc) { |
467 | clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME); | 477 | clockevents_tick_resume(bc); |
468 | 478 | ||
469 | switch (tick_broadcast_device.mode) { | 479 | switch (tick_broadcast_device.mode) { |
470 | case TICKDEV_MODE_PERIODIC: | 480 | case TICKDEV_MODE_PERIODIC: |
471 | if (!cpumask_empty(tick_broadcast_mask)) | 481 | if (!cpumask_empty(tick_broadcast_mask)) |
472 | tick_broadcast_start_periodic(bc); | 482 | tick_broadcast_start_periodic(bc); |
473 | broadcast = cpumask_test_cpu(smp_processor_id(), | ||
474 | tick_broadcast_mask); | ||
475 | break; | 483 | break; |
476 | case TICKDEV_MODE_ONESHOT: | 484 | case TICKDEV_MODE_ONESHOT: |
477 | if (!cpumask_empty(tick_broadcast_mask)) | 485 | if (!cpumask_empty(tick_broadcast_mask)) |
478 | broadcast = tick_resume_broadcast_oneshot(bc); | 486 | tick_resume_broadcast_oneshot(bc); |
479 | break; | 487 | break; |
480 | } | 488 | } |
481 | } | 489 | } |
482 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 490 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
483 | |||
484 | return broadcast; | ||
485 | } | 491 | } |
486 | 492 | ||
487 | |||
488 | #ifdef CONFIG_TICK_ONESHOT | 493 | #ifdef CONFIG_TICK_ONESHOT |
489 | 494 | ||
490 | static cpumask_var_t tick_broadcast_oneshot_mask; | 495 | static cpumask_var_t tick_broadcast_oneshot_mask; |
@@ -532,8 +537,8 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, | |||
532 | { | 537 | { |
533 | int ret; | 538 | int ret; |
534 | 539 | ||
535 | if (bc->mode != CLOCK_EVT_MODE_ONESHOT) | 540 | if (bc->state != CLOCK_EVT_STATE_ONESHOT) |
536 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | 541 | clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); |
537 | 542 | ||
538 | ret = clockevents_program_event(bc, expires, force); | 543 | ret = clockevents_program_event(bc, expires, force); |
539 | if (!ret) | 544 | if (!ret) |
@@ -541,10 +546,9 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, | |||
541 | return ret; | 546 | return ret; |
542 | } | 547 | } |
543 | 548 | ||
544 | int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | 549 | static void tick_resume_broadcast_oneshot(struct clock_event_device *bc) |
545 | { | 550 | { |
546 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | 551 | clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); |
547 | return 0; | ||
548 | } | 552 | } |
549 | 553 | ||
550 | /* | 554 | /* |
@@ -562,8 +566,8 @@ void tick_check_oneshot_broadcast_this_cpu(void) | |||
562 | * switched over, leave the device alone. | 566 | * switched over, leave the device alone. |
563 | */ | 567 | */ |
564 | if (td->mode == TICKDEV_MODE_ONESHOT) { | 568 | if (td->mode == TICKDEV_MODE_ONESHOT) { |
565 | clockevents_set_mode(td->evtdev, | 569 | clockevents_set_state(td->evtdev, |
566 | CLOCK_EVT_MODE_ONESHOT); | 570 | CLOCK_EVT_STATE_ONESHOT); |
567 | } | 571 | } |
568 | } | 572 | } |
569 | } | 573 | } |
@@ -666,31 +670,26 @@ static void broadcast_shutdown_local(struct clock_event_device *bc, | |||
666 | if (dev->next_event.tv64 < bc->next_event.tv64) | 670 | if (dev->next_event.tv64 < bc->next_event.tv64) |
667 | return; | 671 | return; |
668 | } | 672 | } |
669 | clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); | 673 | clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); |
670 | } | 674 | } |
671 | 675 | ||
672 | static void broadcast_move_bc(int deadcpu) | 676 | /** |
673 | { | 677 | * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode |
674 | struct clock_event_device *bc = tick_broadcast_device.evtdev; | 678 | * @state: The target state (enter/exit) |
675 | 679 | * | |
676 | if (!bc || !broadcast_needs_cpu(bc, deadcpu)) | 680 | * The system enters/leaves a state, where affected devices might stop |
677 | return; | ||
678 | /* This moves the broadcast assignment to this cpu */ | ||
679 | clockevents_program_event(bc, bc->next_event, 1); | ||
680 | } | ||
681 | |||
682 | /* | ||
683 | * Powerstate information: The system enters/leaves a state, where | ||
684 | * affected devices might stop | ||
685 | * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. | 681 | * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. |
682 | * | ||
683 | * Called with interrupts disabled, so clockevents_lock is not | ||
684 | * required here because the local clock event device cannot go away | ||
685 | * under us. | ||
686 | */ | 686 | */ |
687 | int tick_broadcast_oneshot_control(unsigned long reason) | 687 | int tick_broadcast_oneshot_control(enum tick_broadcast_state state) |
688 | { | 688 | { |
689 | struct clock_event_device *bc, *dev; | 689 | struct clock_event_device *bc, *dev; |
690 | struct tick_device *td; | 690 | struct tick_device *td; |
691 | unsigned long flags; | ||
692 | ktime_t now; | ||
693 | int cpu, ret = 0; | 691 | int cpu, ret = 0; |
692 | ktime_t now; | ||
694 | 693 | ||
695 | /* | 694 | /* |
696 | * Periodic mode does not care about the enter/exit of power | 695 | * Periodic mode does not care about the enter/exit of power |
@@ -703,17 +702,17 @@ int tick_broadcast_oneshot_control(unsigned long reason) | |||
703 | * We are called with preemption disabled from the depth of the | 702 | * We are called with preemption disabled from the depth of the |
704 | * idle code, so we can't be moved away. | 703 | * idle code, so we can't be moved away. |
705 | */ | 704 | */ |
706 | cpu = smp_processor_id(); | 705 | td = this_cpu_ptr(&tick_cpu_device); |
707 | td = &per_cpu(tick_cpu_device, cpu); | ||
708 | dev = td->evtdev; | 706 | dev = td->evtdev; |
709 | 707 | ||
710 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) | 708 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) |
711 | return 0; | 709 | return 0; |
712 | 710 | ||
711 | raw_spin_lock(&tick_broadcast_lock); | ||
713 | bc = tick_broadcast_device.evtdev; | 712 | bc = tick_broadcast_device.evtdev; |
713 | cpu = smp_processor_id(); | ||
714 | 714 | ||
715 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 715 | if (state == TICK_BROADCAST_ENTER) { |
716 | if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { | ||
717 | if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { | 716 | if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { |
718 | WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); | 717 | WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); |
719 | broadcast_shutdown_local(bc, dev); | 718 | broadcast_shutdown_local(bc, dev); |
@@ -741,7 +740,7 @@ int tick_broadcast_oneshot_control(unsigned long reason) | |||
741 | cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); | 740 | cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); |
742 | } else { | 741 | } else { |
743 | if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { | 742 | if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { |
744 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 743 | clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); |
745 | /* | 744 | /* |
746 | * The cpu which was handling the broadcast | 745 | * The cpu which was handling the broadcast |
747 | * timer marked this cpu in the broadcast | 746 | * timer marked this cpu in the broadcast |
@@ -805,9 +804,10 @@ int tick_broadcast_oneshot_control(unsigned long reason) | |||
805 | } | 804 | } |
806 | } | 805 | } |
807 | out: | 806 | out: |
808 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 807 | raw_spin_unlock(&tick_broadcast_lock); |
809 | return ret; | 808 | return ret; |
810 | } | 809 | } |
810 | EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); | ||
811 | 811 | ||
812 | /* | 812 | /* |
813 | * Reset the one shot broadcast for a cpu | 813 | * Reset the one shot broadcast for a cpu |
@@ -842,7 +842,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | |||
842 | 842 | ||
843 | /* Set it up only once ! */ | 843 | /* Set it up only once ! */ |
844 | if (bc->event_handler != tick_handle_oneshot_broadcast) { | 844 | if (bc->event_handler != tick_handle_oneshot_broadcast) { |
845 | int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; | 845 | int was_periodic = bc->state == CLOCK_EVT_STATE_PERIODIC; |
846 | 846 | ||
847 | bc->event_handler = tick_handle_oneshot_broadcast; | 847 | bc->event_handler = tick_handle_oneshot_broadcast; |
848 | 848 | ||
@@ -858,7 +858,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | |||
858 | tick_broadcast_oneshot_mask, tmpmask); | 858 | tick_broadcast_oneshot_mask, tmpmask); |
859 | 859 | ||
860 | if (was_periodic && !cpumask_empty(tmpmask)) { | 860 | if (was_periodic && !cpumask_empty(tmpmask)) { |
861 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | 861 | clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); |
862 | tick_broadcast_init_next_event(tmpmask, | 862 | tick_broadcast_init_next_event(tmpmask, |
863 | tick_next_period); | 863 | tick_next_period); |
864 | tick_broadcast_set_event(bc, cpu, tick_next_period, 1); | 864 | tick_broadcast_set_event(bc, cpu, tick_next_period, 1); |
@@ -894,14 +894,28 @@ void tick_broadcast_switch_to_oneshot(void) | |||
894 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 894 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
895 | } | 895 | } |
896 | 896 | ||
897 | #ifdef CONFIG_HOTPLUG_CPU | ||
898 | void hotplug_cpu__broadcast_tick_pull(int deadcpu) | ||
899 | { | ||
900 | struct clock_event_device *bc; | ||
901 | unsigned long flags; | ||
902 | |||
903 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | ||
904 | bc = tick_broadcast_device.evtdev; | ||
905 | |||
906 | if (bc && broadcast_needs_cpu(bc, deadcpu)) { | ||
907 | /* This moves the broadcast assignment to this CPU: */ | ||
908 | clockevents_program_event(bc, bc->next_event, 1); | ||
909 | } | ||
910 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | ||
911 | } | ||
897 | 912 | ||
898 | /* | 913 | /* |
899 | * Remove a dead CPU from broadcasting | 914 | * Remove a dead CPU from broadcasting |
900 | */ | 915 | */ |
901 | void tick_shutdown_broadcast_oneshot(unsigned int *cpup) | 916 | void tick_shutdown_broadcast_oneshot(unsigned int cpu) |
902 | { | 917 | { |
903 | unsigned long flags; | 918 | unsigned long flags; |
904 | unsigned int cpu = *cpup; | ||
905 | 919 | ||
906 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 920 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); |
907 | 921 | ||
@@ -913,10 +927,9 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) | |||
913 | cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); | 927 | cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); |
914 | cpumask_clear_cpu(cpu, tick_broadcast_force_mask); | 928 | cpumask_clear_cpu(cpu, tick_broadcast_force_mask); |
915 | 929 | ||
916 | broadcast_move_bc(cpu); | ||
917 | |||
918 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 930 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
919 | } | 931 | } |
932 | #endif | ||
920 | 933 | ||
921 | /* | 934 | /* |
922 | * Check, whether the broadcast device is in one shot mode | 935 | * Check, whether the broadcast device is in one shot mode |
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index f7c515595b42..3ae6afa1eb98 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
@@ -102,7 +102,7 @@ void tick_handle_periodic(struct clock_event_device *dev) | |||
102 | 102 | ||
103 | tick_periodic(cpu); | 103 | tick_periodic(cpu); |
104 | 104 | ||
105 | if (dev->mode != CLOCK_EVT_MODE_ONESHOT) | 105 | if (dev->state != CLOCK_EVT_STATE_ONESHOT) |
106 | return; | 106 | return; |
107 | for (;;) { | 107 | for (;;) { |
108 | /* | 108 | /* |
@@ -140,7 +140,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) | |||
140 | 140 | ||
141 | if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && | 141 | if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && |
142 | !tick_broadcast_oneshot_active()) { | 142 | !tick_broadcast_oneshot_active()) { |
143 | clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); | 143 | clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); |
144 | } else { | 144 | } else { |
145 | unsigned long seq; | 145 | unsigned long seq; |
146 | ktime_t next; | 146 | ktime_t next; |
@@ -150,7 +150,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) | |||
150 | next = tick_next_period; | 150 | next = tick_next_period; |
151 | } while (read_seqretry(&jiffies_lock, seq)); | 151 | } while (read_seqretry(&jiffies_lock, seq)); |
152 | 152 | ||
153 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 153 | clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); |
154 | 154 | ||
155 | for (;;) { | 155 | for (;;) { |
156 | if (!clockevents_program_event(dev, next, false)) | 156 | if (!clockevents_program_event(dev, next, false)) |
@@ -332,14 +332,16 @@ out_bc: | |||
332 | tick_install_broadcast_device(newdev); | 332 | tick_install_broadcast_device(newdev); |
333 | } | 333 | } |
334 | 334 | ||
335 | #ifdef CONFIG_HOTPLUG_CPU | ||
335 | /* | 336 | /* |
336 | * Transfer the do_timer job away from a dying cpu. | 337 | * Transfer the do_timer job away from a dying cpu. |
337 | * | 338 | * |
338 | * Called with interrupts disabled. | 339 | * Called with interrupts disabled. Not locking required. If |
340 | * tick_do_timer_cpu is owned by this cpu, nothing can change it. | ||
339 | */ | 341 | */ |
340 | void tick_handover_do_timer(int *cpup) | 342 | void tick_handover_do_timer(void) |
341 | { | 343 | { |
342 | if (*cpup == tick_do_timer_cpu) { | 344 | if (tick_do_timer_cpu == smp_processor_id()) { |
343 | int cpu = cpumask_first(cpu_online_mask); | 345 | int cpu = cpumask_first(cpu_online_mask); |
344 | 346 | ||
345 | tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : | 347 | tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : |
@@ -354,9 +356,9 @@ void tick_handover_do_timer(int *cpup) | |||
354 | * access the hardware device itself. | 356 | * access the hardware device itself. |
355 | * We just set the mode and remove it from the lists. | 357 | * We just set the mode and remove it from the lists. |
356 | */ | 358 | */ |
357 | void tick_shutdown(unsigned int *cpup) | 359 | void tick_shutdown(unsigned int cpu) |
358 | { | 360 | { |
359 | struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); | 361 | struct tick_device *td = &per_cpu(tick_cpu_device, cpu); |
360 | struct clock_event_device *dev = td->evtdev; | 362 | struct clock_event_device *dev = td->evtdev; |
361 | 363 | ||
362 | td->mode = TICKDEV_MODE_PERIODIC; | 364 | td->mode = TICKDEV_MODE_PERIODIC; |
@@ -365,27 +367,42 @@ void tick_shutdown(unsigned int *cpup) | |||
365 | * Prevent that the clock events layer tries to call | 367 | * Prevent that the clock events layer tries to call |
366 | * the set mode function! | 368 | * the set mode function! |
367 | */ | 369 | */ |
370 | dev->state = CLOCK_EVT_STATE_DETACHED; | ||
368 | dev->mode = CLOCK_EVT_MODE_UNUSED; | 371 | dev->mode = CLOCK_EVT_MODE_UNUSED; |
369 | clockevents_exchange_device(dev, NULL); | 372 | clockevents_exchange_device(dev, NULL); |
370 | dev->event_handler = clockevents_handle_noop; | 373 | dev->event_handler = clockevents_handle_noop; |
371 | td->evtdev = NULL; | 374 | td->evtdev = NULL; |
372 | } | 375 | } |
373 | } | 376 | } |
377 | #endif | ||
374 | 378 | ||
375 | void tick_suspend(void) | 379 | /** |
380 | * tick_suspend_local - Suspend the local tick device | ||
381 | * | ||
382 | * Called from the local cpu for freeze with interrupts disabled. | ||
383 | * | ||
384 | * No locks required. Nothing can change the per cpu device. | ||
385 | */ | ||
386 | void tick_suspend_local(void) | ||
376 | { | 387 | { |
377 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); | 388 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); |
378 | 389 | ||
379 | clockevents_shutdown(td->evtdev); | 390 | clockevents_shutdown(td->evtdev); |
380 | } | 391 | } |
381 | 392 | ||
382 | void tick_resume(void) | 393 | /** |
394 | * tick_resume_local - Resume the local tick device | ||
395 | * | ||
396 | * Called from the local CPU for unfreeze or XEN resume magic. | ||
397 | * | ||
398 | * No locks required. Nothing can change the per cpu device. | ||
399 | */ | ||
400 | void tick_resume_local(void) | ||
383 | { | 401 | { |
384 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); | 402 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); |
385 | int broadcast = tick_resume_broadcast(); | 403 | bool broadcast = tick_resume_check_broadcast(); |
386 | |||
387 | clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); | ||
388 | 404 | ||
405 | clockevents_tick_resume(td->evtdev); | ||
389 | if (!broadcast) { | 406 | if (!broadcast) { |
390 | if (td->mode == TICKDEV_MODE_PERIODIC) | 407 | if (td->mode == TICKDEV_MODE_PERIODIC) |
391 | tick_setup_periodic(td->evtdev, 0); | 408 | tick_setup_periodic(td->evtdev, 0); |
@@ -394,6 +411,35 @@ void tick_resume(void) | |||
394 | } | 411 | } |
395 | } | 412 | } |
396 | 413 | ||
414 | /** | ||
415 | * tick_suspend - Suspend the tick and the broadcast device | ||
416 | * | ||
417 | * Called from syscore_suspend() via timekeeping_suspend with only one | ||
418 | * CPU online and interrupts disabled or from tick_freeze() under | ||
419 | * tick_freeze_lock. | ||
420 | * | ||
421 | * No locks required. Nothing can change the per cpu device. | ||
422 | */ | ||
423 | void tick_suspend(void) | ||
424 | { | ||
425 | tick_suspend_local(); | ||
426 | tick_suspend_broadcast(); | ||
427 | } | ||
428 | |||
429 | /** | ||
430 | * tick_resume - Resume the tick and the broadcast device | ||
431 | * | ||
432 | * Called from syscore_resume() via timekeeping_resume with only one | ||
433 | * CPU online and interrupts disabled. | ||
434 | * | ||
435 | * No locks required. Nothing can change the per cpu device. | ||
436 | */ | ||
437 | void tick_resume(void) | ||
438 | { | ||
439 | tick_resume_broadcast(); | ||
440 | tick_resume_local(); | ||
441 | } | ||
442 | |||
397 | static DEFINE_RAW_SPINLOCK(tick_freeze_lock); | 443 | static DEFINE_RAW_SPINLOCK(tick_freeze_lock); |
398 | static unsigned int tick_freeze_depth; | 444 | static unsigned int tick_freeze_depth; |
399 | 445 | ||
@@ -411,12 +457,10 @@ void tick_freeze(void) | |||
411 | raw_spin_lock(&tick_freeze_lock); | 457 | raw_spin_lock(&tick_freeze_lock); |
412 | 458 | ||
413 | tick_freeze_depth++; | 459 | tick_freeze_depth++; |
414 | if (tick_freeze_depth == num_online_cpus()) { | 460 | if (tick_freeze_depth == num_online_cpus()) |
415 | timekeeping_suspend(); | 461 | timekeeping_suspend(); |
416 | } else { | 462 | else |
417 | tick_suspend(); | 463 | tick_suspend_local(); |
418 | tick_suspend_broadcast(); | ||
419 | } | ||
420 | 464 | ||
421 | raw_spin_unlock(&tick_freeze_lock); | 465 | raw_spin_unlock(&tick_freeze_lock); |
422 | } | 466 | } |
@@ -437,7 +481,7 @@ void tick_unfreeze(void) | |||
437 | if (tick_freeze_depth == num_online_cpus()) | 481 | if (tick_freeze_depth == num_online_cpus()) |
438 | timekeeping_resume(); | 482 | timekeeping_resume(); |
439 | else | 483 | else |
440 | tick_resume(); | 484 | tick_resume_local(); |
441 | 485 | ||
442 | tick_freeze_depth--; | 486 | tick_freeze_depth--; |
443 | 487 | ||
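The tick_freeze()/tick_unfreeze() hunks above boil down to a depth counter held under tick_freeze_lock: every CPU entering freeze suspends only its local tick device, the last one in additionally suspends timekeeping, and unfreeze mirrors this with the first CPU out resuming timekeeping. A minimal user-space sketch of that counting pattern; the fake_* helpers and NUM_ONLINE_CPUS are illustrative stand-ins, not kernel APIs, and the real code serializes with the raw spinlock instead of running single-threaded.

#include <stdio.h>

#define NUM_ONLINE_CPUS 4

static unsigned int freeze_depth;

/* Illustrative stand-ins for the kernel calls; not real APIs. */
static void fake_tick_suspend_local(int cpu)  { printf("cpu%d: suspend local tick\n", cpu); }
static void fake_tick_resume_local(int cpu)   { printf("cpu%d: resume local tick\n", cpu); }
static void fake_timekeeping_suspend(void)    { puts("last cpu: suspend timekeeping (tick + broadcast)"); }
static void fake_timekeeping_resume(void)     { puts("first cpu: resume timekeeping (tick + broadcast)"); }

/* Mirrors tick_freeze(): the last CPU to freeze also suspends timekeeping. */
static void freeze(int cpu)
{
	freeze_depth++;
	if (freeze_depth == NUM_ONLINE_CPUS)
		fake_timekeeping_suspend();
	else
		fake_tick_suspend_local(cpu);
}

/* Mirrors tick_unfreeze(): the first CPU to unfreeze resumes timekeeping. */
static void unfreeze(int cpu)
{
	if (freeze_depth == NUM_ONLINE_CPUS)
		fake_timekeeping_resume();
	else
		fake_tick_resume_local(cpu);
	freeze_depth--;
}

int main(void)
{
	for (int cpu = 0; cpu < NUM_ONLINE_CPUS; cpu++)
		freeze(cpu);
	for (int cpu = NUM_ONLINE_CPUS - 1; cpu >= 0; cpu--)
		unfreeze(cpu);
	return 0;
}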
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 366aeb4f2c66..b64fdd8054c5 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
@@ -5,15 +5,12 @@ | |||
5 | #include <linux/tick.h> | 5 | #include <linux/tick.h> |
6 | 6 | ||
7 | #include "timekeeping.h" | 7 | #include "timekeeping.h" |
8 | #include "tick-sched.h" | ||
8 | 9 | ||
9 | extern seqlock_t jiffies_lock; | 10 | #ifdef CONFIG_GENERIC_CLOCKEVENTS |
10 | 11 | ||
11 | #define CS_NAME_LEN 32 | 12 | # define TICK_DO_TIMER_NONE -1 |
12 | 13 | # define TICK_DO_TIMER_BOOT -2 | |
13 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD | ||
14 | |||
15 | #define TICK_DO_TIMER_NONE -1 | ||
16 | #define TICK_DO_TIMER_BOOT -2 | ||
17 | 14 | ||
18 | DECLARE_PER_CPU(struct tick_device, tick_cpu_device); | 15 | DECLARE_PER_CPU(struct tick_device, tick_cpu_device); |
19 | extern ktime_t tick_next_period; | 16 | extern ktime_t tick_next_period; |
@@ -23,21 +20,72 @@ extern int tick_do_timer_cpu __read_mostly; | |||
23 | extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); | 20 | extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); |
24 | extern void tick_handle_periodic(struct clock_event_device *dev); | 21 | extern void tick_handle_periodic(struct clock_event_device *dev); |
25 | extern void tick_check_new_device(struct clock_event_device *dev); | 22 | extern void tick_check_new_device(struct clock_event_device *dev); |
26 | extern void tick_handover_do_timer(int *cpup); | 23 | extern void tick_shutdown(unsigned int cpu); |
27 | extern void tick_shutdown(unsigned int *cpup); | ||
28 | extern void tick_suspend(void); | 24 | extern void tick_suspend(void); |
29 | extern void tick_resume(void); | 25 | extern void tick_resume(void); |
30 | extern bool tick_check_replacement(struct clock_event_device *curdev, | 26 | extern bool tick_check_replacement(struct clock_event_device *curdev, |
31 | struct clock_event_device *newdev); | 27 | struct clock_event_device *newdev); |
32 | extern void tick_install_replacement(struct clock_event_device *dev); | 28 | extern void tick_install_replacement(struct clock_event_device *dev); |
29 | extern int tick_is_oneshot_available(void); | ||
30 | extern struct tick_device *tick_get_device(int cpu); | ||
33 | 31 | ||
34 | extern void clockevents_shutdown(struct clock_event_device *dev); | 32 | extern int clockevents_tick_resume(struct clock_event_device *dev); |
33 | /* Check, if the device is functional or a dummy for broadcast */ | ||
34 | static inline int tick_device_is_functional(struct clock_event_device *dev) | ||
35 | { | ||
36 | return !(dev->features & CLOCK_EVT_FEAT_DUMMY); | ||
37 | } | ||
35 | 38 | ||
39 | extern void clockevents_shutdown(struct clock_event_device *dev); | ||
40 | extern void clockevents_exchange_device(struct clock_event_device *old, | ||
41 | struct clock_event_device *new); | ||
42 | extern void clockevents_set_state(struct clock_event_device *dev, | ||
43 | enum clock_event_state state); | ||
44 | extern int clockevents_program_event(struct clock_event_device *dev, | ||
45 | ktime_t expires, bool force); | ||
46 | extern void clockevents_handle_noop(struct clock_event_device *dev); | ||
47 | extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); | ||
36 | extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); | 48 | extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); |
37 | 49 | ||
38 | /* | 50 | /* Broadcasting support */ |
39 | * NO_HZ / high resolution timer shared code | 51 | # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST |
40 | */ | 52 | extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); |
53 | extern void tick_install_broadcast_device(struct clock_event_device *dev); | ||
54 | extern int tick_is_broadcast_device(struct clock_event_device *dev); | ||
55 | extern void tick_shutdown_broadcast(unsigned int cpu); | ||
56 | extern void tick_suspend_broadcast(void); | ||
57 | extern void tick_resume_broadcast(void); | ||
58 | extern bool tick_resume_check_broadcast(void); | ||
59 | extern void tick_broadcast_init(void); | ||
60 | extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); | ||
61 | extern int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); | ||
62 | extern struct tick_device *tick_get_broadcast_device(void); | ||
63 | extern struct cpumask *tick_get_broadcast_mask(void); | ||
64 | # else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST: */ | ||
65 | static inline void tick_install_broadcast_device(struct clock_event_device *dev) { } | ||
66 | static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; } | ||
67 | static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; } | ||
68 | static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } | ||
69 | static inline void tick_shutdown_broadcast(unsigned int cpu) { } | ||
70 | static inline void tick_suspend_broadcast(void) { } | ||
71 | static inline void tick_resume_broadcast(void) { } | ||
72 | static inline bool tick_resume_check_broadcast(void) { return false; } | ||
73 | static inline void tick_broadcast_init(void) { } | ||
74 | static inline int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) { return -ENODEV; } | ||
75 | |||
76 | /* Set the periodic handler in non broadcast mode */ | ||
77 | static inline void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) | ||
78 | { | ||
79 | dev->event_handler = tick_handle_periodic; | ||
80 | } | ||
81 | # endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */ | ||
82 | |||
83 | #else /* !GENERIC_CLOCKEVENTS: */ | ||
84 | static inline void tick_suspend(void) { } | ||
85 | static inline void tick_resume(void) { } | ||
86 | #endif /* !GENERIC_CLOCKEVENTS */ | ||
87 | |||
88 | /* Oneshot related functions */ | ||
41 | #ifdef CONFIG_TICK_ONESHOT | 89 | #ifdef CONFIG_TICK_ONESHOT |
42 | extern void tick_setup_oneshot(struct clock_event_device *newdev, | 90 | extern void tick_setup_oneshot(struct clock_event_device *newdev, |
43 | void (*handler)(struct clock_event_device *), | 91 | void (*handler)(struct clock_event_device *), |
@@ -46,58 +94,42 @@ extern int tick_program_event(ktime_t expires, int force); | |||
46 | extern void tick_oneshot_notify(void); | 94 | extern void tick_oneshot_notify(void); |
47 | extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); | 95 | extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); |
48 | extern void tick_resume_oneshot(void); | 96 | extern void tick_resume_oneshot(void); |
49 | # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST | 97 | static inline bool tick_oneshot_possible(void) { return true; } |
98 | extern int tick_oneshot_mode_active(void); | ||
99 | extern void tick_clock_notify(void); | ||
100 | extern int tick_check_oneshot_change(int allow_nohz); | ||
101 | extern int tick_init_highres(void); | ||
102 | #else /* !CONFIG_TICK_ONESHOT: */ | ||
103 | static inline | ||
104 | void tick_setup_oneshot(struct clock_event_device *newdev, | ||
105 | void (*handler)(struct clock_event_device *), | ||
106 | ktime_t nextevt) { BUG(); } | ||
107 | static inline void tick_resume_oneshot(void) { BUG(); } | ||
108 | static inline int tick_program_event(ktime_t expires, int force) { return 0; } | ||
109 | static inline void tick_oneshot_notify(void) { } | ||
110 | static inline bool tick_oneshot_possible(void) { return false; } | ||
111 | static inline int tick_oneshot_mode_active(void) { return 0; } | ||
112 | static inline void tick_clock_notify(void) { } | ||
113 | static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } | ||
114 | #endif /* !CONFIG_TICK_ONESHOT */ | ||
115 | |||
116 | /* Functions related to oneshot broadcasting */ | ||
117 | #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) | ||
50 | extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); | 118 | extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); |
51 | extern int tick_broadcast_oneshot_control(unsigned long reason); | ||
52 | extern void tick_broadcast_switch_to_oneshot(void); | 119 | extern void tick_broadcast_switch_to_oneshot(void); |
53 | extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); | 120 | extern void tick_shutdown_broadcast_oneshot(unsigned int cpu); |
54 | extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); | ||
55 | extern int tick_broadcast_oneshot_active(void); | 121 | extern int tick_broadcast_oneshot_active(void); |
56 | extern void tick_check_oneshot_broadcast_this_cpu(void); | 122 | extern void tick_check_oneshot_broadcast_this_cpu(void); |
57 | bool tick_broadcast_oneshot_available(void); | 123 | bool tick_broadcast_oneshot_available(void); |
58 | # else /* BROADCAST */ | 124 | extern struct cpumask *tick_get_broadcast_oneshot_mask(void); |
59 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | 125 | #else /* !(BROADCAST && ONESHOT): */ |
60 | { | 126 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } |
61 | BUG(); | ||
62 | } | ||
63 | static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } | ||
64 | static inline void tick_broadcast_switch_to_oneshot(void) { } | 127 | static inline void tick_broadcast_switch_to_oneshot(void) { } |
65 | static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } | 128 | static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { } |
66 | static inline int tick_broadcast_oneshot_active(void) { return 0; } | 129 | static inline int tick_broadcast_oneshot_active(void) { return 0; } |
67 | static inline void tick_check_oneshot_broadcast_this_cpu(void) { } | 130 | static inline void tick_check_oneshot_broadcast_this_cpu(void) { } |
68 | static inline bool tick_broadcast_oneshot_available(void) { return true; } | 131 | static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } |
69 | # endif /* !BROADCAST */ | 132 | #endif /* !(BROADCAST && ONESHOT) */ |
70 | |||
71 | #else /* !ONESHOT */ | ||
72 | static inline | ||
73 | void tick_setup_oneshot(struct clock_event_device *newdev, | ||
74 | void (*handler)(struct clock_event_device *), | ||
75 | ktime_t nextevt) | ||
76 | { | ||
77 | BUG(); | ||
78 | } | ||
79 | static inline void tick_resume_oneshot(void) | ||
80 | { | ||
81 | BUG(); | ||
82 | } | ||
83 | static inline int tick_program_event(ktime_t expires, int force) | ||
84 | { | ||
85 | return 0; | ||
86 | } | ||
87 | static inline void tick_oneshot_notify(void) { } | ||
88 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | ||
89 | { | ||
90 | BUG(); | ||
91 | } | ||
92 | static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } | ||
93 | static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } | ||
94 | static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | ||
95 | { | ||
96 | return 0; | ||
97 | } | ||
98 | static inline int tick_broadcast_oneshot_active(void) { return 0; } | ||
99 | static inline bool tick_broadcast_oneshot_available(void) { return false; } | ||
100 | #endif /* !TICK_ONESHOT */ | ||
101 | 133 | ||
102 | /* NO_HZ_FULL internal */ | 134 | /* NO_HZ_FULL internal */ |
103 | #ifdef CONFIG_NO_HZ_FULL | 135 | #ifdef CONFIG_NO_HZ_FULL |
@@ -105,68 +137,3 @@ extern void tick_nohz_init(void); | |||
105 | # else | 137 | # else |
106 | static inline void tick_nohz_init(void) { } | 138 | static inline void tick_nohz_init(void) { } |
107 | #endif | 139 | #endif |
108 | |||
109 | /* | ||
110 | * Broadcasting support | ||
111 | */ | ||
112 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST | ||
113 | extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); | ||
114 | extern void tick_install_broadcast_device(struct clock_event_device *dev); | ||
115 | extern int tick_is_broadcast_device(struct clock_event_device *dev); | ||
116 | extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); | ||
117 | extern void tick_shutdown_broadcast(unsigned int *cpup); | ||
118 | extern void tick_suspend_broadcast(void); | ||
119 | extern int tick_resume_broadcast(void); | ||
120 | extern void tick_broadcast_init(void); | ||
121 | extern void | ||
122 | tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); | ||
123 | int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); | ||
124 | |||
125 | #else /* !BROADCAST */ | ||
126 | |||
127 | static inline void tick_install_broadcast_device(struct clock_event_device *dev) | ||
128 | { | ||
129 | } | ||
130 | |||
131 | static inline int tick_is_broadcast_device(struct clock_event_device *dev) | ||
132 | { | ||
133 | return 0; | ||
134 | } | ||
135 | static inline int tick_device_uses_broadcast(struct clock_event_device *dev, | ||
136 | int cpu) | ||
137 | { | ||
138 | return 0; | ||
139 | } | ||
140 | static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } | ||
141 | static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { } | ||
142 | static inline void tick_shutdown_broadcast(unsigned int *cpup) { } | ||
143 | static inline void tick_suspend_broadcast(void) { } | ||
144 | static inline int tick_resume_broadcast(void) { return 0; } | ||
145 | static inline void tick_broadcast_init(void) { } | ||
146 | static inline int tick_broadcast_update_freq(struct clock_event_device *dev, | ||
147 | u32 freq) { return -ENODEV; } | ||
148 | |||
149 | /* | ||
150 | * Set the periodic handler in non broadcast mode | ||
151 | */ | ||
152 | static inline void tick_set_periodic_handler(struct clock_event_device *dev, | ||
153 | int broadcast) | ||
154 | { | ||
155 | dev->event_handler = tick_handle_periodic; | ||
156 | } | ||
157 | #endif /* !BROADCAST */ | ||
158 | |||
159 | /* | ||
160 | * Check, if the device is functional or a dummy for broadcast | ||
161 | */ | ||
162 | static inline int tick_device_is_functional(struct clock_event_device *dev) | ||
163 | { | ||
164 | return !(dev->features & CLOCK_EVT_FEAT_DUMMY); | ||
165 | } | ||
166 | |||
167 | int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); | ||
168 | |||
169 | #endif | ||
170 | |||
171 | extern void do_timer(unsigned long ticks); | ||
172 | extern void update_wall_time(void); | ||
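The reshuffled tick-internal.h follows the usual kernel header idiom: real declarations under the Kconfig symbol, empty static inline stubs otherwise, so call sites never need their own #ifdefs. A compilable user-space sketch of the same pattern, with an invented CONFIG_MY_FEATURE toggle standing in for options like CONFIG_GENERIC_CLOCKEVENTS_BROADCAST.

#include <stdio.h>

/* Toggle to mimic a Kconfig option; purely illustrative. */
#define CONFIG_MY_FEATURE 1

#ifdef CONFIG_MY_FEATURE
static void my_feature_setup(int cpu) { printf("setting up feature on cpu%d\n", cpu); }
static int  my_feature_query(void)    { return 1; }
#else  /* !CONFIG_MY_FEATURE: stubs keep callers free of #ifdefs */
static inline void my_feature_setup(int cpu) { (void)cpu; }
static inline int  my_feature_query(void)    { return 0; }
#endif

int main(void)
{
	my_feature_setup(0);			/* unconditional call site */
	printf("feature active: %d\n", my_feature_query());
	return 0;
}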
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 7ce740e78e1b..67a64b1670bf 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c | |||
@@ -38,7 +38,7 @@ void tick_resume_oneshot(void) | |||
38 | { | 38 | { |
39 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); | 39 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); |
40 | 40 | ||
41 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 41 | clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); |
42 | clockevents_program_event(dev, ktime_get(), true); | 42 | clockevents_program_event(dev, ktime_get(), true); |
43 | } | 43 | } |
44 | 44 | ||
@@ -50,7 +50,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, | |||
50 | ktime_t next_event) | 50 | ktime_t next_event) |
51 | { | 51 | { |
52 | newdev->event_handler = handler; | 52 | newdev->event_handler = handler; |
53 | clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); | 53 | clockevents_set_state(newdev, CLOCK_EVT_STATE_ONESHOT); |
54 | clockevents_program_event(newdev, next_event, true); | 54 | clockevents_program_event(newdev, next_event, true); |
55 | } | 55 | } |
56 | 56 | ||
@@ -81,7 +81,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) | |||
81 | 81 | ||
82 | td->mode = TICKDEV_MODE_ONESHOT; | 82 | td->mode = TICKDEV_MODE_ONESHOT; |
83 | dev->event_handler = handler; | 83 | dev->event_handler = handler; |
84 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 84 | clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); |
85 | tick_broadcast_switch_to_oneshot(); | 85 | tick_broadcast_switch_to_oneshot(); |
86 | return 0; | 86 | return 0; |
87 | } | 87 | } |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a4c4edac4528..914259128145 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -34,7 +34,7 @@ | |||
34 | /* | 34 | /* |
35 | * Per cpu nohz control structure | 35 | * Per cpu nohz control structure |
36 | */ | 36 | */ |
37 | DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); | 37 | static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); |
38 | 38 | ||
39 | /* | 39 | /* |
40 | * The time, when the last jiffy update happened. Protected by jiffies_lock. | 40 | * The time, when the last jiffy update happened. Protected by jiffies_lock. |
@@ -416,6 +416,11 @@ static int __init setup_tick_nohz(char *str) | |||
416 | 416 | ||
417 | __setup("nohz=", setup_tick_nohz); | 417 | __setup("nohz=", setup_tick_nohz); |
418 | 418 | ||
419 | int tick_nohz_tick_stopped(void) | ||
420 | { | ||
421 | return __this_cpu_read(tick_cpu_sched.tick_stopped); | ||
422 | } | ||
423 | |||
419 | /** | 424 | /** |
420 | * tick_nohz_update_jiffies - update jiffies when idle was interrupted | 425 | * tick_nohz_update_jiffies - update jiffies when idle was interrupted |
421 | * | 426 | * |
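Making tick_cpu_sched static and adding the tick_nohz_tick_stopped() accessor hides the per-CPU nohz state behind a single query. A small user-space analogue of that encapsulation, with an explicit cpu index replacing the kernel's __this_cpu_read(); the struct and helper names are invented for the sketch.

#include <stdio.h>

#define NR_CPUS 4

/* Kept private to this file, mirroring the now-static per-CPU data above;
 * the accessor below is the only way in. */
struct tick_sched_sketch {
	int tick_stopped;
};

static struct tick_sched_sketch tick_cpu_sched[NR_CPUS];

/* Analogue of tick_nohz_tick_stopped() with an explicit cpu argument. */
static int tick_stopped_on(int cpu)
{
	return tick_cpu_sched[cpu].tick_stopped;
}

int main(void)
{
	tick_cpu_sched[2].tick_stopped = 1;	/* pretend cpu2 went nohz-idle */
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%d tick stopped: %d\n", cpu, tick_stopped_on(cpu));
	return 0;
}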
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h new file mode 100644 index 000000000000..28b5da3e1a17 --- /dev/null +++ b/kernel/time/tick-sched.h | |||
@@ -0,0 +1,74 @@ | |||
1 | #ifndef _TICK_SCHED_H | ||
2 | #define _TICK_SCHED_H | ||
3 | |||
4 | #include <linux/hrtimer.h> | ||
5 | |||
6 | enum tick_device_mode { | ||
7 | TICKDEV_MODE_PERIODIC, | ||
8 | TICKDEV_MODE_ONESHOT, | ||
9 | }; | ||
10 | |||
11 | struct tick_device { | ||
12 | struct clock_event_device *evtdev; | ||
13 | enum tick_device_mode mode; | ||
14 | }; | ||
15 | |||
16 | enum tick_nohz_mode { | ||
17 | NOHZ_MODE_INACTIVE, | ||
18 | NOHZ_MODE_LOWRES, | ||
19 | NOHZ_MODE_HIGHRES, | ||
20 | }; | ||
21 | |||
22 | /** | ||
23 | * struct tick_sched - sched tick emulation and no idle tick control/stats | ||
24 | * @sched_timer: hrtimer to schedule the periodic tick in high | ||
25 | * resolution mode | ||
26 | * @last_tick: Store the last tick expiry time when the tick | ||
27 | * timer is modified for nohz sleeps. This is necessary | ||
28 | * to resume the tick timer operation in the timeline | ||
29 | * when the CPU returns from nohz sleep. | ||
30 | * @tick_stopped: Indicator that the idle tick has been stopped | ||
31 | * @idle_jiffies: jiffies at the entry to idle for idle time accounting | ||
32 | * @idle_calls: Total number of idle calls | ||
33 | * @idle_sleeps: Number of idle calls, where the sched tick was stopped | ||
34 | * @idle_entrytime: Time when the idle call was entered | ||
35 | * @idle_waketime: Time when the idle was interrupted | ||
36 | * @idle_exittime: Time when the idle state was left | ||
37 | * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped | ||
38 | * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding | ||
39 | * @sleep_length: Duration of the current idle sleep | ||
40 | * @do_timer_last: CPU was the last one doing do_timer before going idle | ||
41 | */ | ||
42 | struct tick_sched { | ||
43 | struct hrtimer sched_timer; | ||
44 | unsigned long check_clocks; | ||
45 | enum tick_nohz_mode nohz_mode; | ||
46 | ktime_t last_tick; | ||
47 | int inidle; | ||
48 | int tick_stopped; | ||
49 | unsigned long idle_jiffies; | ||
50 | unsigned long idle_calls; | ||
51 | unsigned long idle_sleeps; | ||
52 | int idle_active; | ||
53 | ktime_t idle_entrytime; | ||
54 | ktime_t idle_waketime; | ||
55 | ktime_t idle_exittime; | ||
56 | ktime_t idle_sleeptime; | ||
57 | ktime_t iowait_sleeptime; | ||
58 | ktime_t sleep_length; | ||
59 | unsigned long last_jiffies; | ||
60 | unsigned long next_jiffies; | ||
61 | ktime_t idle_expires; | ||
62 | int do_timer_last; | ||
63 | }; | ||
64 | |||
65 | extern struct tick_sched *tick_get_tick_sched(int cpu); | ||
66 | |||
67 | extern void tick_setup_sched_timer(void); | ||
68 | #if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS | ||
69 | extern void tick_cancel_sched_timer(int cpu); | ||
70 | #else | ||
71 | static inline void tick_cancel_sched_timer(int cpu) { } | ||
72 | #endif | ||
73 | |||
74 | #endif | ||
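The accounting fields documented in struct tick_sched relate in a simple way: idle_sleeps counts the subset of idle_calls where the tick was actually stopped, and idle_sleeptime accumulates the nanoseconds slept in that state. A small sketch with invented numbers, showing the kind of summary a consumer of these counters (for instance the timer_list debug output) can derive.

#include <stdio.h>
#include <stdint.h>

typedef int64_t ktime_sketch_t;		/* stand-in for ktime_t, in nanoseconds */

/* Cut-down copy of the accounting fields from struct tick_sched above. */
struct tick_sched_stats {
	unsigned long idle_calls;	/* total idle entries */
	unsigned long idle_sleeps;	/* idle entries where the tick was stopped */
	ktime_sketch_t idle_sleeptime;	/* ns slept with the tick stopped */
};

int main(void)
{
	/* Invented numbers, just to show how the counters relate. */
	struct tick_sched_stats ts = {
		.idle_calls = 1000,
		.idle_sleeps = 800,
		.idle_sleeptime = 4800000000LL,	/* 4.8 s */
	};

	printf("tick stopped on %.1f%% of idle entries\n",
	       100.0 * ts.idle_sleeps / ts.idle_calls);
	printf("average nohz sleep: %.2f ms\n",
	       ts.idle_sleeptime / 1e6 / ts.idle_sleeps);
	return 0;
}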
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 91db94136c10..946acb72179f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -59,17 +59,15 @@ struct tk_fast { | |||
59 | }; | 59 | }; |
60 | 60 | ||
61 | static struct tk_fast tk_fast_mono ____cacheline_aligned; | 61 | static struct tk_fast tk_fast_mono ____cacheline_aligned; |
62 | static struct tk_fast tk_fast_raw ____cacheline_aligned; | ||
62 | 63 | ||
63 | /* flag for if timekeeping is suspended */ | 64 | /* flag for if timekeeping is suspended */ |
64 | int __read_mostly timekeeping_suspended; | 65 | int __read_mostly timekeeping_suspended; |
65 | 66 | ||
66 | /* Flag for if there is a persistent clock on this platform */ | ||
67 | bool __read_mostly persistent_clock_exist = false; | ||
68 | |||
69 | static inline void tk_normalize_xtime(struct timekeeper *tk) | 67 | static inline void tk_normalize_xtime(struct timekeeper *tk) |
70 | { | 68 | { |
71 | while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) { | 69 | while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { |
72 | tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift; | 70 | tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift; |
73 | tk->xtime_sec++; | 71 | tk->xtime_sec++; |
74 | } | 72 | } |
75 | } | 73 | } |
@@ -79,20 +77,20 @@ static inline struct timespec64 tk_xtime(struct timekeeper *tk) | |||
79 | struct timespec64 ts; | 77 | struct timespec64 ts; |
80 | 78 | ||
81 | ts.tv_sec = tk->xtime_sec; | 79 | ts.tv_sec = tk->xtime_sec; |
82 | ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift); | 80 | ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); |
83 | return ts; | 81 | return ts; |
84 | } | 82 | } |
85 | 83 | ||
86 | static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) | 84 | static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) |
87 | { | 85 | { |
88 | tk->xtime_sec = ts->tv_sec; | 86 | tk->xtime_sec = ts->tv_sec; |
89 | tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift; | 87 | tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; |
90 | } | 88 | } |
91 | 89 | ||
92 | static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) | 90 | static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) |
93 | { | 91 | { |
94 | tk->xtime_sec += ts->tv_sec; | 92 | tk->xtime_sec += ts->tv_sec; |
95 | tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift; | 93 | tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; |
96 | tk_normalize_xtime(tk); | 94 | tk_normalize_xtime(tk); |
97 | } | 95 | } |
98 | 96 | ||
@@ -118,6 +116,117 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) | |||
118 | tk->offs_boot = ktime_add(tk->offs_boot, delta); | 116 | tk->offs_boot = ktime_add(tk->offs_boot, delta); |
119 | } | 117 | } |
120 | 118 | ||
119 | #ifdef CONFIG_DEBUG_TIMEKEEPING | ||
120 | #define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ | ||
121 | /* | ||
122 | * These simple flag variables are managed | ||
123 | * without locks, which is racy, but ok since | ||
124 | * we don't really care about being super | ||
125 | * precise about how many events were seen, | ||
126 | * just that a problem was observed. | ||
127 | */ | ||
128 | static int timekeeping_underflow_seen; | ||
129 | static int timekeeping_overflow_seen; | ||
130 | |||
131 | /* last_warning is only modified under the timekeeping lock */ | ||
132 | static long timekeeping_last_warning; | ||
133 | |||
134 | static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) | ||
135 | { | ||
136 | |||
137 | cycle_t max_cycles = tk->tkr_mono.clock->max_cycles; | ||
138 | const char *name = tk->tkr_mono.clock->name; | ||
139 | |||
140 | if (offset > max_cycles) { | ||
141 | printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n", | ||
142 | offset, name, max_cycles); | ||
143 | printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); | ||
144 | } else { | ||
145 | if (offset > (max_cycles >> 1)) { | ||
146 | printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n", | ||
147 | offset, name, max_cycles >> 1); | ||
148 | printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); | ||
149 | } | ||
150 | } | ||
151 | |||
152 | if (timekeeping_underflow_seen) { | ||
153 | if (jiffies - timekeeping_last_warning > WARNING_FREQ) { | ||
154 | printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name); | ||
155 | printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); | ||
156 | printk_deferred(" Your kernel is probably still fine.\n"); | ||
157 | timekeeping_last_warning = jiffies; | ||
158 | } | ||
159 | timekeeping_underflow_seen = 0; | ||
160 | } | ||
161 | |||
162 | if (timekeeping_overflow_seen) { | ||
163 | if (jiffies - timekeeping_last_warning > WARNING_FREQ) { | ||
164 | printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name); | ||
165 | printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); | ||
166 | printk_deferred(" Your kernel is probably still fine.\n"); | ||
167 | timekeeping_last_warning = jiffies; | ||
168 | } | ||
169 | timekeeping_overflow_seen = 0; | ||
170 | } | ||
171 | } | ||
172 | |||
173 | static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) | ||
174 | { | ||
175 | cycle_t now, last, mask, max, delta; | ||
176 | unsigned int seq; | ||
177 | |||
178 | /* | ||
179 | * Since we're called holding a seqlock, the data may shift | ||
180 | * under us while we're doing the calculation. This can cause | ||
181 | * false positives, since we'd note a problem but throw the | ||
182 | * results away. So nest another seqlock here to atomically | ||
183 | * grab the points we are checking with. | ||
184 | */ | ||
185 | do { | ||
186 | seq = read_seqcount_begin(&tk_core.seq); | ||
187 | now = tkr->read(tkr->clock); | ||
188 | last = tkr->cycle_last; | ||
189 | mask = tkr->mask; | ||
190 | max = tkr->clock->max_cycles; | ||
191 | } while (read_seqcount_retry(&tk_core.seq, seq)); | ||
192 | |||
193 | delta = clocksource_delta(now, last, mask); | ||
194 | |||
195 | /* | ||
196 | * Try to catch underflows by checking if we are seeing small | ||
197 | * mask-relative negative values. | ||
198 | */ | ||
199 | if (unlikely((~delta & mask) < (mask >> 3))) { | ||
200 | timekeeping_underflow_seen = 1; | ||
201 | delta = 0; | ||
202 | } | ||
203 | |||
204 | /* Cap delta value to the max_cycles values to avoid mult overflows */ | ||
205 | if (unlikely(delta > max)) { | ||
206 | timekeeping_overflow_seen = 1; | ||
207 | delta = tkr->clock->max_cycles; | ||
208 | } | ||
209 | |||
210 | return delta; | ||
211 | } | ||
212 | #else | ||
213 | static inline void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) | ||
214 | { | ||
215 | } | ||
216 | static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) | ||
217 | { | ||
218 | cycle_t cycle_now, delta; | ||
219 | |||
220 | /* read clocksource */ | ||
221 | cycle_now = tkr->read(tkr->clock); | ||
222 | |||
223 | /* calculate the delta since the last update_wall_time */ | ||
224 | delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); | ||
225 | |||
226 | return delta; | ||
227 | } | ||
228 | #endif | ||
229 | |||
121 | /** | 230 | /** |
122 | * tk_setup_internals - Set up internals to use clocksource clock. | 231 | * tk_setup_internals - Set up internals to use clocksource clock. |
123 | * | 232 | * |
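The CONFIG_DEBUG_TIMEKEEPING hunk above flags underflows by looking for "small mask-relative negative values": when the counter read lands just behind cycle_last, the masked delta wraps to a value close to the mask, so ~delta & mask becomes tiny. A standalone arithmetic sketch with a 16-bit counter and an invented max_cycles limit.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t cycle_t;

/* Conceptually the same helper the kernel uses: masked counter difference. */
static cycle_t clocksource_delta_sketch(cycle_t now, cycle_t last, cycle_t mask)
{
	return (now - last) & mask;
}

int main(void)
{
	const cycle_t mask = 0xFFFF;		/* 16-bit free-running counter */
	const cycle_t max_cycles = 0x4000;	/* illustrative overflow limit */

	/* Underflow: 'now' is read slightly behind 'last'. */
	cycle_t delta = clocksource_delta_sketch(995, 1000, mask);
	printf("underflow candidate: delta=0x%llx, ~delta&mask=0x%llx, mask>>3=0x%llx\n",
	       (unsigned long long)delta,
	       (unsigned long long)(~delta & mask),
	       (unsigned long long)(mask >> 3));
	if ((~delta & mask) < (mask >> 3))
		puts(" -> small mask-relative negative value: treat as underflow, use delta = 0");

	/* Overflow: the counter ran longer than the clocksource can multiply safely. */
	delta = clocksource_delta_sketch(0x9100, 0x0100, mask);
	if (delta > max_cycles) {
		printf("overflow candidate: delta=0x%llx > max_cycles=0x%llx -> cap it\n",
		       (unsigned long long)delta, (unsigned long long)max_cycles);
		delta = max_cycles;
	}
	return 0;
}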
@@ -135,11 +244,16 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) | |||
135 | u64 tmp, ntpinterval; | 244 | u64 tmp, ntpinterval; |
136 | struct clocksource *old_clock; | 245 | struct clocksource *old_clock; |
137 | 246 | ||
138 | old_clock = tk->tkr.clock; | 247 | old_clock = tk->tkr_mono.clock; |
139 | tk->tkr.clock = clock; | 248 | tk->tkr_mono.clock = clock; |
140 | tk->tkr.read = clock->read; | 249 | tk->tkr_mono.read = clock->read; |
141 | tk->tkr.mask = clock->mask; | 250 | tk->tkr_mono.mask = clock->mask; |
142 | tk->tkr.cycle_last = tk->tkr.read(clock); | 251 | tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock); |
252 | |||
253 | tk->tkr_raw.clock = clock; | ||
254 | tk->tkr_raw.read = clock->read; | ||
255 | tk->tkr_raw.mask = clock->mask; | ||
256 | tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; | ||
143 | 257 | ||
144 | /* Do the ns -> cycle conversion first, using original mult */ | 258 | /* Do the ns -> cycle conversion first, using original mult */ |
145 | tmp = NTP_INTERVAL_LENGTH; | 259 | tmp = NTP_INTERVAL_LENGTH; |
@@ -163,11 +277,14 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) | |||
163 | if (old_clock) { | 277 | if (old_clock) { |
164 | int shift_change = clock->shift - old_clock->shift; | 278 | int shift_change = clock->shift - old_clock->shift; |
165 | if (shift_change < 0) | 279 | if (shift_change < 0) |
166 | tk->tkr.xtime_nsec >>= -shift_change; | 280 | tk->tkr_mono.xtime_nsec >>= -shift_change; |
167 | else | 281 | else |
168 | tk->tkr.xtime_nsec <<= shift_change; | 282 | tk->tkr_mono.xtime_nsec <<= shift_change; |
169 | } | 283 | } |
170 | tk->tkr.shift = clock->shift; | 284 | tk->tkr_raw.xtime_nsec = 0; |
285 | |||
286 | tk->tkr_mono.shift = clock->shift; | ||
287 | tk->tkr_raw.shift = clock->shift; | ||
171 | 288 | ||
172 | tk->ntp_error = 0; | 289 | tk->ntp_error = 0; |
173 | tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; | 290 | tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; |
@@ -178,7 +295,8 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) | |||
178 | * active clocksource. These value will be adjusted via NTP | 295 | * active clocksource. These value will be adjusted via NTP |
179 | * to counteract clock drifting. | 296 | * to counteract clock drifting. |
180 | */ | 297 | */ |
181 | tk->tkr.mult = clock->mult; | 298 | tk->tkr_mono.mult = clock->mult; |
299 | tk->tkr_raw.mult = clock->mult; | ||
182 | tk->ntp_err_mult = 0; | 300 | tk->ntp_err_mult = 0; |
183 | } | 301 | } |
184 | 302 | ||
@@ -193,14 +311,10 @@ static inline u32 arch_gettimeoffset(void) { return 0; } | |||
193 | 311 | ||
194 | static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) | 312 | static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) |
195 | { | 313 | { |
196 | cycle_t cycle_now, delta; | 314 | cycle_t delta; |
197 | s64 nsec; | 315 | s64 nsec; |
198 | 316 | ||
199 | /* read clocksource: */ | 317 | delta = timekeeping_get_delta(tkr); |
200 | cycle_now = tkr->read(tkr->clock); | ||
201 | |||
202 | /* calculate the delta since the last update_wall_time: */ | ||
203 | delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); | ||
204 | 318 | ||
205 | nsec = delta * tkr->mult + tkr->xtime_nsec; | 319 | nsec = delta * tkr->mult + tkr->xtime_nsec; |
206 | nsec >>= tkr->shift; | 320 | nsec >>= tkr->shift; |
@@ -209,25 +323,6 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) | |||
209 | return nsec + arch_gettimeoffset(); | 323 | return nsec + arch_gettimeoffset(); |
210 | } | 324 | } |
211 | 325 | ||
212 | static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) | ||
213 | { | ||
214 | struct clocksource *clock = tk->tkr.clock; | ||
215 | cycle_t cycle_now, delta; | ||
216 | s64 nsec; | ||
217 | |||
218 | /* read clocksource: */ | ||
219 | cycle_now = tk->tkr.read(clock); | ||
220 | |||
221 | /* calculate the delta since the last update_wall_time: */ | ||
222 | delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); | ||
223 | |||
224 | /* convert delta to nanoseconds. */ | ||
225 | nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); | ||
226 | |||
227 | /* If arch requires, add in get_arch_timeoffset() */ | ||
228 | return nsec + arch_gettimeoffset(); | ||
229 | } | ||
230 | |||
231 | /** | 326 | /** |
232 | * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. | 327 | * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. |
233 | * @tkr: Timekeeping readout base from which we take the update | 328 | * @tkr: Timekeeping readout base from which we take the update |
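timekeeping_get_ns() above converts a cycle delta with the clocksource's fixed-point mult/shift pair and folds in the shifted nanosecond remainder kept in xtime_nsec. A worked example; the 10 MHz clocksource and its mult/shift values are invented for illustration (real pairs come from clocks_calc_mult_shift()).

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint32_t shift = 24;
	const uint32_t mult = 100u << shift;	/* 100 ns per cycle at 10 MHz */
	uint64_t xtime_nsec = 5ULL << shift;	/* 5 ns carried as shifted remainder */
	uint64_t delta = 12345;			/* cycles since cycle_last */

	/* nsec = (delta * mult + xtime_nsec) >> shift, as in timekeeping_get_ns() */
	uint64_t nsec = (delta * mult + xtime_nsec) >> shift;
	printf("%llu cycles -> %llu ns (1234500 ns plus the 5 ns remainder)\n",
	       (unsigned long long)delta, (unsigned long long)nsec);
	return 0;
}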
@@ -267,18 +362,18 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) | |||
267 | * slightly wrong timestamp (a few nanoseconds). See | 362 | * slightly wrong timestamp (a few nanoseconds). See |
268 | * @ktime_get_mono_fast_ns. | 363 | * @ktime_get_mono_fast_ns. |
269 | */ | 364 | */ |
270 | static void update_fast_timekeeper(struct tk_read_base *tkr) | 365 | static void update_fast_timekeeper(struct tk_read_base *tkr, struct tk_fast *tkf) |
271 | { | 366 | { |
272 | struct tk_read_base *base = tk_fast_mono.base; | 367 | struct tk_read_base *base = tkf->base; |
273 | 368 | ||
274 | /* Force readers off to base[1] */ | 369 | /* Force readers off to base[1] */ |
275 | raw_write_seqcount_latch(&tk_fast_mono.seq); | 370 | raw_write_seqcount_latch(&tkf->seq); |
276 | 371 | ||
277 | /* Update base[0] */ | 372 | /* Update base[0] */ |
278 | memcpy(base, tkr, sizeof(*base)); | 373 | memcpy(base, tkr, sizeof(*base)); |
279 | 374 | ||
280 | /* Force readers back to base[0] */ | 375 | /* Force readers back to base[0] */ |
281 | raw_write_seqcount_latch(&tk_fast_mono.seq); | 376 | raw_write_seqcount_latch(&tkf->seq); |
282 | 377 | ||
283 | /* Update base[1] */ | 378 | /* Update base[1] */ |
284 | memcpy(base + 1, base, sizeof(*base)); | 379 | memcpy(base + 1, base, sizeof(*base)); |
@@ -316,22 +411,33 @@ static void update_fast_timekeeper(struct tk_read_base *tkr) | |||
316 | * of the following timestamps. Callers need to be aware of that and | 411 | * of the following timestamps. Callers need to be aware of that and |
317 | * deal with it. | 412 | * deal with it. |
318 | */ | 413 | */ |
319 | u64 notrace ktime_get_mono_fast_ns(void) | 414 | static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) |
320 | { | 415 | { |
321 | struct tk_read_base *tkr; | 416 | struct tk_read_base *tkr; |
322 | unsigned int seq; | 417 | unsigned int seq; |
323 | u64 now; | 418 | u64 now; |
324 | 419 | ||
325 | do { | 420 | do { |
326 | seq = raw_read_seqcount(&tk_fast_mono.seq); | 421 | seq = raw_read_seqcount(&tkf->seq); |
327 | tkr = tk_fast_mono.base + (seq & 0x01); | 422 | tkr = tkf->base + (seq & 0x01); |
328 | now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr); | 423 | now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr); |
424 | } while (read_seqcount_retry(&tkf->seq, seq)); | ||
329 | 425 | ||
330 | } while (read_seqcount_retry(&tk_fast_mono.seq, seq)); | ||
331 | return now; | 426 | return now; |
332 | } | 427 | } |
428 | |||
429 | u64 ktime_get_mono_fast_ns(void) | ||
430 | { | ||
431 | return __ktime_get_fast_ns(&tk_fast_mono); | ||
432 | } | ||
333 | EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); | 433 | EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); |
334 | 434 | ||
435 | u64 ktime_get_raw_fast_ns(void) | ||
436 | { | ||
437 | return __ktime_get_fast_ns(&tk_fast_raw); | ||
438 | } | ||
439 | EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); | ||
440 | |||
335 | /* Suspend-time cycles value for halted fast timekeeper. */ | 441 | /* Suspend-time cycles value for halted fast timekeeper. */ |
336 | static cycle_t cycles_at_suspend; | 442 | static cycle_t cycles_at_suspend; |
337 | 443 | ||
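update_fast_timekeeper() and __ktime_get_fast_ns() above implement a two-copy seqcount latch: the writer bumps the sequence to an odd value, updates base[0], bumps it back to even, then copies into base[1]; an NMI-safe reader picks base[seq & 1] and retries if the sequence moved. A single-threaded user-space sketch of just the index steering; the real code additionally needs the memory barriers provided by raw_write_seqcount_latch().

#include <stdio.h>
#include <stdint.h>

struct snapshot { uint64_t base_ns; };

static unsigned int seq;		/* even: base[0] is stable, odd: base[1] */
static struct snapshot base[2];

static void writer_update(uint64_t new_base_ns)
{
	seq++;				/* force readers off to base[1] */
	base[0].base_ns = new_base_ns;	/* update base[0] */
	seq++;				/* force readers back to base[0] */
	base[1] = base[0];		/* update base[1] */
}

static uint64_t reader_get(void)
{
	unsigned int s;
	uint64_t v;

	do {
		s = seq;
		v = base[s & 0x01].base_ns;	/* pick the copy not being written */
	} while (s != seq);			/* retry if the writer was active */
	return v;
}

int main(void)
{
	writer_update(1000);
	printf("read: %llu\n", (unsigned long long)reader_get());
	writer_update(2000);
	printf("read: %llu\n", (unsigned long long)reader_get());
	return 0;
}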
@@ -353,12 +459,17 @@ static cycle_t dummy_clock_read(struct clocksource *cs) | |||
353 | static void halt_fast_timekeeper(struct timekeeper *tk) | 459 | static void halt_fast_timekeeper(struct timekeeper *tk) |
354 | { | 460 | { |
355 | static struct tk_read_base tkr_dummy; | 461 | static struct tk_read_base tkr_dummy; |
356 | struct tk_read_base *tkr = &tk->tkr; | 462 | struct tk_read_base *tkr = &tk->tkr_mono; |
357 | 463 | ||
358 | memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); | 464 | memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); |
359 | cycles_at_suspend = tkr->read(tkr->clock); | 465 | cycles_at_suspend = tkr->read(tkr->clock); |
360 | tkr_dummy.read = dummy_clock_read; | 466 | tkr_dummy.read = dummy_clock_read; |
361 | update_fast_timekeeper(&tkr_dummy); | 467 | update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); |
468 | |||
469 | tkr = &tk->tkr_raw; | ||
470 | memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); | ||
471 | tkr_dummy.read = dummy_clock_read; | ||
472 | update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); | ||
362 | } | 473 | } |
363 | 474 | ||
364 | #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD | 475 | #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD |
@@ -369,8 +480,8 @@ static inline void update_vsyscall(struct timekeeper *tk) | |||
369 | 480 | ||
370 | xt = timespec64_to_timespec(tk_xtime(tk)); | 481 | xt = timespec64_to_timespec(tk_xtime(tk)); |
371 | wm = timespec64_to_timespec(tk->wall_to_monotonic); | 482 | wm = timespec64_to_timespec(tk->wall_to_monotonic); |
372 | update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult, | 483 | update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult, |
373 | tk->tkr.cycle_last); | 484 | tk->tkr_mono.cycle_last); |
374 | } | 485 | } |
375 | 486 | ||
376 | static inline void old_vsyscall_fixup(struct timekeeper *tk) | 487 | static inline void old_vsyscall_fixup(struct timekeeper *tk) |
@@ -387,11 +498,11 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) | |||
387 | * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD | 498 | * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD |
388 | * users are removed, this can be killed. | 499 | * users are removed, this can be killed. |
389 | */ | 500 | */ |
390 | remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1); | 501 | remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1); |
391 | tk->tkr.xtime_nsec -= remainder; | 502 | tk->tkr_mono.xtime_nsec -= remainder; |
392 | tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift; | 503 | tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; |
393 | tk->ntp_error += remainder << tk->ntp_error_shift; | 504 | tk->ntp_error += remainder << tk->ntp_error_shift; |
394 | tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift; | 505 | tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; |
395 | } | 506 | } |
396 | #else | 507 | #else |
397 | #define old_vsyscall_fixup(tk) | 508 | #define old_vsyscall_fixup(tk) |
@@ -456,17 +567,17 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) | |||
456 | */ | 567 | */ |
457 | seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); | 568 | seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); |
458 | nsec = (u32) tk->wall_to_monotonic.tv_nsec; | 569 | nsec = (u32) tk->wall_to_monotonic.tv_nsec; |
459 | tk->tkr.base_mono = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); | 570 | tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); |
460 | 571 | ||
461 | /* Update the monotonic raw base */ | 572 | /* Update the monotonic raw base */ |
462 | tk->base_raw = timespec64_to_ktime(tk->raw_time); | 573 | tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time); |
463 | 574 | ||
464 | /* | 575 | /* |
465 | * The sum of the nanoseconds portions of xtime and | 576 | * The sum of the nanoseconds portions of xtime and |
466 | * wall_to_monotonic can be greater/equal one second. Take | 577 | * wall_to_monotonic can be greater/equal one second. Take |
467 | * this into account before updating tk->ktime_sec. | 578 | * this into account before updating tk->ktime_sec. |
468 | */ | 579 | */ |
469 | nsec += (u32)(tk->tkr.xtime_nsec >> tk->tkr.shift); | 580 | nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); |
470 | if (nsec >= NSEC_PER_SEC) | 581 | if (nsec >= NSEC_PER_SEC) |
471 | seconds++; | 582 | seconds++; |
472 | tk->ktime_sec = seconds; | 583 | tk->ktime_sec = seconds; |
@@ -489,7 +600,8 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) | |||
489 | memcpy(&shadow_timekeeper, &tk_core.timekeeper, | 600 | memcpy(&shadow_timekeeper, &tk_core.timekeeper, |
490 | sizeof(tk_core.timekeeper)); | 601 | sizeof(tk_core.timekeeper)); |
491 | 602 | ||
492 | update_fast_timekeeper(&tk->tkr); | 603 | update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); |
604 | update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); | ||
493 | } | 605 | } |
494 | 606 | ||
495 | /** | 607 | /** |
@@ -501,22 +613,23 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) | |||
501 | */ | 613 | */ |
502 | static void timekeeping_forward_now(struct timekeeper *tk) | 614 | static void timekeeping_forward_now(struct timekeeper *tk) |
503 | { | 615 | { |
504 | struct clocksource *clock = tk->tkr.clock; | 616 | struct clocksource *clock = tk->tkr_mono.clock; |
505 | cycle_t cycle_now, delta; | 617 | cycle_t cycle_now, delta; |
506 | s64 nsec; | 618 | s64 nsec; |
507 | 619 | ||
508 | cycle_now = tk->tkr.read(clock); | 620 | cycle_now = tk->tkr_mono.read(clock); |
509 | delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); | 621 | delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); |
510 | tk->tkr.cycle_last = cycle_now; | 622 | tk->tkr_mono.cycle_last = cycle_now; |
623 | tk->tkr_raw.cycle_last = cycle_now; | ||
511 | 624 | ||
512 | tk->tkr.xtime_nsec += delta * tk->tkr.mult; | 625 | tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult; |
513 | 626 | ||
514 | /* If arch requires, add in get_arch_timeoffset() */ | 627 | /* If arch requires, add in get_arch_timeoffset() */ |
515 | tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift; | 628 | tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift; |
516 | 629 | ||
517 | tk_normalize_xtime(tk); | 630 | tk_normalize_xtime(tk); |
518 | 631 | ||
519 | nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); | 632 | nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift); |
520 | timespec64_add_ns(&tk->raw_time, nsec); | 633 | timespec64_add_ns(&tk->raw_time, nsec); |
521 | } | 634 | } |
522 | 635 | ||
@@ -537,7 +650,7 @@ int __getnstimeofday64(struct timespec64 *ts) | |||
537 | seq = read_seqcount_begin(&tk_core.seq); | 650 | seq = read_seqcount_begin(&tk_core.seq); |
538 | 651 | ||
539 | ts->tv_sec = tk->xtime_sec; | 652 | ts->tv_sec = tk->xtime_sec; |
540 | nsecs = timekeeping_get_ns(&tk->tkr); | 653 | nsecs = timekeeping_get_ns(&tk->tkr_mono); |
541 | 654 | ||
542 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 655 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
543 | 656 | ||
@@ -577,8 +690,8 @@ ktime_t ktime_get(void) | |||
577 | 690 | ||
578 | do { | 691 | do { |
579 | seq = read_seqcount_begin(&tk_core.seq); | 692 | seq = read_seqcount_begin(&tk_core.seq); |
580 | base = tk->tkr.base_mono; | 693 | base = tk->tkr_mono.base; |
581 | nsecs = timekeeping_get_ns(&tk->tkr); | 694 | nsecs = timekeeping_get_ns(&tk->tkr_mono); |
582 | 695 | ||
583 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 696 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
584 | 697 | ||
@@ -603,8 +716,8 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs) | |||
603 | 716 | ||
604 | do { | 717 | do { |
605 | seq = read_seqcount_begin(&tk_core.seq); | 718 | seq = read_seqcount_begin(&tk_core.seq); |
606 | base = ktime_add(tk->tkr.base_mono, *offset); | 719 | base = ktime_add(tk->tkr_mono.base, *offset); |
607 | nsecs = timekeeping_get_ns(&tk->tkr); | 720 | nsecs = timekeeping_get_ns(&tk->tkr_mono); |
608 | 721 | ||
609 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 722 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
610 | 723 | ||
@@ -645,8 +758,8 @@ ktime_t ktime_get_raw(void) | |||
645 | 758 | ||
646 | do { | 759 | do { |
647 | seq = read_seqcount_begin(&tk_core.seq); | 760 | seq = read_seqcount_begin(&tk_core.seq); |
648 | base = tk->base_raw; | 761 | base = tk->tkr_raw.base; |
649 | nsecs = timekeeping_get_ns_raw(tk); | 762 | nsecs = timekeeping_get_ns(&tk->tkr_raw); |
650 | 763 | ||
651 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 764 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
652 | 765 | ||
@@ -674,7 +787,7 @@ void ktime_get_ts64(struct timespec64 *ts) | |||
674 | do { | 787 | do { |
675 | seq = read_seqcount_begin(&tk_core.seq); | 788 | seq = read_seqcount_begin(&tk_core.seq); |
676 | ts->tv_sec = tk->xtime_sec; | 789 | ts->tv_sec = tk->xtime_sec; |
677 | nsec = timekeeping_get_ns(&tk->tkr); | 790 | nsec = timekeeping_get_ns(&tk->tkr_mono); |
678 | tomono = tk->wall_to_monotonic; | 791 | tomono = tk->wall_to_monotonic; |
679 | 792 | ||
680 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 793 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
@@ -759,8 +872,8 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) | |||
759 | ts_real->tv_sec = tk->xtime_sec; | 872 | ts_real->tv_sec = tk->xtime_sec; |
760 | ts_real->tv_nsec = 0; | 873 | ts_real->tv_nsec = 0; |
761 | 874 | ||
762 | nsecs_raw = timekeeping_get_ns_raw(tk); | 875 | nsecs_raw = timekeeping_get_ns(&tk->tkr_raw); |
763 | nsecs_real = timekeeping_get_ns(&tk->tkr); | 876 | nsecs_real = timekeeping_get_ns(&tk->tkr_mono); |
764 | 877 | ||
765 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 878 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
766 | 879 | ||
@@ -943,7 +1056,7 @@ static int change_clocksource(void *data) | |||
943 | */ | 1056 | */ |
944 | if (try_module_get(new->owner)) { | 1057 | if (try_module_get(new->owner)) { |
945 | if (!new->enable || new->enable(new) == 0) { | 1058 | if (!new->enable || new->enable(new) == 0) { |
946 | old = tk->tkr.clock; | 1059 | old = tk->tkr_mono.clock; |
947 | tk_setup_internals(tk, new); | 1060 | tk_setup_internals(tk, new); |
948 | if (old->disable) | 1061 | if (old->disable) |
949 | old->disable(old); | 1062 | old->disable(old); |
@@ -971,11 +1084,11 @@ int timekeeping_notify(struct clocksource *clock) | |||
971 | { | 1084 | { |
972 | struct timekeeper *tk = &tk_core.timekeeper; | 1085 | struct timekeeper *tk = &tk_core.timekeeper; |
973 | 1086 | ||
974 | if (tk->tkr.clock == clock) | 1087 | if (tk->tkr_mono.clock == clock) |
975 | return 0; | 1088 | return 0; |
976 | stop_machine(change_clocksource, clock, NULL); | 1089 | stop_machine(change_clocksource, clock, NULL); |
977 | tick_clock_notify(); | 1090 | tick_clock_notify(); |
978 | return tk->tkr.clock == clock ? 0 : -1; | 1091 | return tk->tkr_mono.clock == clock ? 0 : -1; |
979 | } | 1092 | } |
980 | 1093 | ||
981 | /** | 1094 | /** |
@@ -993,7 +1106,7 @@ void getrawmonotonic64(struct timespec64 *ts) | |||
993 | 1106 | ||
994 | do { | 1107 | do { |
995 | seq = read_seqcount_begin(&tk_core.seq); | 1108 | seq = read_seqcount_begin(&tk_core.seq); |
996 | nsecs = timekeeping_get_ns_raw(tk); | 1109 | nsecs = timekeeping_get_ns(&tk->tkr_raw); |
997 | ts64 = tk->raw_time; | 1110 | ts64 = tk->raw_time; |
998 | 1111 | ||
999 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 1112 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
@@ -1016,7 +1129,7 @@ int timekeeping_valid_for_hres(void) | |||
1016 | do { | 1129 | do { |
1017 | seq = read_seqcount_begin(&tk_core.seq); | 1130 | seq = read_seqcount_begin(&tk_core.seq); |
1018 | 1131 | ||
1019 | ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; | 1132 | ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; |
1020 | 1133 | ||
1021 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 1134 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
1022 | 1135 | ||
@@ -1035,7 +1148,7 @@ u64 timekeeping_max_deferment(void) | |||
1035 | do { | 1148 | do { |
1036 | seq = read_seqcount_begin(&tk_core.seq); | 1149 | seq = read_seqcount_begin(&tk_core.seq); |
1037 | 1150 | ||
1038 | ret = tk->tkr.clock->max_idle_ns; | 1151 | ret = tk->tkr_mono.clock->max_idle_ns; |
1039 | 1152 | ||
1040 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 1153 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
1041 | 1154 | ||
@@ -1057,6 +1170,14 @@ void __weak read_persistent_clock(struct timespec *ts) | |||
1057 | ts->tv_nsec = 0; | 1170 | ts->tv_nsec = 0; |
1058 | } | 1171 | } |
1059 | 1172 | ||
1173 | void __weak read_persistent_clock64(struct timespec64 *ts64) | ||
1174 | { | ||
1175 | struct timespec ts; | ||
1176 | |||
1177 | read_persistent_clock(&ts); | ||
1178 | *ts64 = timespec_to_timespec64(ts); | ||
1179 | } | ||
1180 | |||
1060 | /** | 1181 | /** |
1061 | * read_boot_clock - Return time of the system start. | 1182 | * read_boot_clock - Return time of the system start. |
1062 | * | 1183 | * |
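The __weak read_persistent_clock64() added above is a conversion shim: by default it forwards to the legacy 32-bit read_persistent_clock() and widens the result, while an architecture that already has a 64-bit reader can override it with a strong symbol. A user-space sketch of the same weak-default pattern using GCC/Clang's __attribute__((weak)); the *_sketch names and the fixed RTC value are made up.

#include <stdio.h>

struct ts32 { long sec; long nsec; };
struct ts64 { long long sec; long nsec; };

/* Legacy narrow interface, kept for "architectures" not yet converted. */
static void read_persistent_clock_sketch(struct ts32 *ts)
{
	ts->sec = 1700000000L;	/* made-up RTC reading */
	ts->nsec = 0;
}

/*
 * Weak default: forward to the 32-bit reader and widen the result.
 * A strong read_persistent_clock64_sketch() defined elsewhere would
 * replace this fallback at link time.
 */
__attribute__((weak)) void read_persistent_clock64_sketch(struct ts64 *ts64)
{
	struct ts32 ts;

	read_persistent_clock_sketch(&ts);
	ts64->sec = ts.sec;
	ts64->nsec = ts.nsec;
}

int main(void)
{
	struct ts64 now;

	read_persistent_clock64_sketch(&now);
	printf("persistent clock: %lld.%09ld\n", now.sec, now.nsec);
	return 0;
}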
@@ -1072,6 +1193,20 @@ void __weak read_boot_clock(struct timespec *ts) | |||
1072 | ts->tv_nsec = 0; | 1193 | ts->tv_nsec = 0; |
1073 | } | 1194 | } |
1074 | 1195 | ||
1196 | void __weak read_boot_clock64(struct timespec64 *ts64) | ||
1197 | { | ||
1198 | struct timespec ts; | ||
1199 | |||
1200 | read_boot_clock(&ts); | ||
1201 | *ts64 = timespec_to_timespec64(ts); | ||
1202 | } | ||
1203 | |||
1204 | /* Flag for if timekeeping_resume() has injected sleeptime */ | ||
1205 | static bool sleeptime_injected; | ||
1206 | |||
1207 | /* Flag for if there is a persistent clock on this platform */ | ||
1208 | static bool persistent_clock_exists; | ||
1209 | |||
1075 | /* | 1210 | /* |
1076 | * timekeeping_init - Initializes the clocksource and common timekeeping values | 1211 | * timekeeping_init - Initializes the clocksource and common timekeeping values |
1077 | */ | 1212 | */ |
@@ -1081,20 +1216,17 @@ void __init timekeeping_init(void) | |||
1081 | struct clocksource *clock; | 1216 | struct clocksource *clock; |
1082 | unsigned long flags; | 1217 | unsigned long flags; |
1083 | struct timespec64 now, boot, tmp; | 1218 | struct timespec64 now, boot, tmp; |
1084 | struct timespec ts; | ||
1085 | 1219 | ||
1086 | read_persistent_clock(&ts); | 1220 | read_persistent_clock64(&now); |
1087 | now = timespec_to_timespec64(ts); | ||
1088 | if (!timespec64_valid_strict(&now)) { | 1221 | if (!timespec64_valid_strict(&now)) { |
1089 | pr_warn("WARNING: Persistent clock returned invalid value!\n" | 1222 | pr_warn("WARNING: Persistent clock returned invalid value!\n" |
1090 | " Check your CMOS/BIOS settings.\n"); | 1223 | " Check your CMOS/BIOS settings.\n"); |
1091 | now.tv_sec = 0; | 1224 | now.tv_sec = 0; |
1092 | now.tv_nsec = 0; | 1225 | now.tv_nsec = 0; |
1093 | } else if (now.tv_sec || now.tv_nsec) | 1226 | } else if (now.tv_sec || now.tv_nsec) |
1094 | persistent_clock_exist = true; | 1227 | persistent_clock_exists = true; |
1095 | 1228 | ||
1096 | read_boot_clock(&ts); | 1229 | read_boot_clock64(&boot); |
1097 | boot = timespec_to_timespec64(ts); | ||
1098 | if (!timespec64_valid_strict(&boot)) { | 1230 | if (!timespec64_valid_strict(&boot)) { |
1099 | pr_warn("WARNING: Boot clock returned invalid value!\n" | 1231 | pr_warn("WARNING: Boot clock returned invalid value!\n" |
1100 | " Check your CMOS/BIOS settings.\n"); | 1232 | " Check your CMOS/BIOS settings.\n"); |
@@ -1114,7 +1246,6 @@ void __init timekeeping_init(void) | |||
1114 | tk_set_xtime(tk, &now); | 1246 | tk_set_xtime(tk, &now); |
1115 | tk->raw_time.tv_sec = 0; | 1247 | tk->raw_time.tv_sec = 0; |
1116 | tk->raw_time.tv_nsec = 0; | 1248 | tk->raw_time.tv_nsec = 0; |
1117 | tk->base_raw.tv64 = 0; | ||
1118 | if (boot.tv_sec == 0 && boot.tv_nsec == 0) | 1249 | if (boot.tv_sec == 0 && boot.tv_nsec == 0) |
1119 | boot = tk_xtime(tk); | 1250 | boot = tk_xtime(tk); |
1120 | 1251 | ||
@@ -1127,7 +1258,7 @@ void __init timekeeping_init(void) | |||
1127 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 1258 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
1128 | } | 1259 | } |
1129 | 1260 | ||
1130 | /* time in seconds when suspend began */ | 1261 | /* time in seconds when suspend began for persistent clock */ |
1131 | static struct timespec64 timekeeping_suspend_time; | 1262 | static struct timespec64 timekeeping_suspend_time; |
1132 | 1263 | ||
1133 | /** | 1264 | /** |
@@ -1152,12 +1283,49 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, | |||
1152 | tk_debug_account_sleep_time(delta); | 1283 | tk_debug_account_sleep_time(delta); |
1153 | } | 1284 | } |
1154 | 1285 | ||
1286 | #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE) | ||
1287 | /** | ||
1288 | * We have three kinds of time sources to use for sleep time | ||
1289 | * injection, the preference order is: | ||
1290 | * 1) non-stop clocksource | ||
1291 | * 2) persistent clock (ie: RTC accessible when irqs are off) | ||
1292 | * 3) RTC | ||
1293 | * | ||
1294 | * 1) and 2) are used by timekeeping, 3) by RTC subsystem. | ||
1295 | * If system has neither 1) nor 2), 3) will be used finally. | ||
1296 | * | ||
1297 | * | ||
1298 | * If timekeeping has injected sleeptime via either 1) or 2), | ||
1299 | * 3) becomes needless, so in this case we don't need to call | ||
1300 | * rtc_resume(), and this is what timekeeping_rtc_skipresume() | ||
1301 | * means. | ||
1302 | */ | ||
1303 | bool timekeeping_rtc_skipresume(void) | ||
1304 | { | ||
1305 | return sleeptime_injected; | ||
1306 | } | ||
1307 | |||
1308 | /** | ||
1309 | * Whether 1) can be used is only known in timekeeping_resume(), | ||
1310 | * which is invoked after rtc_suspend(), so rtc_suspend() cannot | ||
1311 | * be skipped safely just because the system has 1). | ||
1312 | * | ||
1313 | * But if system has 2), 2) will definitely be used, so in this | ||
1314 | * case we don't need to call rtc_suspend(), and this is what | ||
1315 | * timekeeping_rtc_skipsuspend() means. | ||
1316 | */ | ||
1317 | bool timekeeping_rtc_skipsuspend(void) | ||
1318 | { | ||
1319 | return persistent_clock_exists; | ||
1320 | } | ||
1321 | |||
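The preference order spelled out above reduces to a small decision at resume time. A minimal standalone sketch of that decision (the flag and helper names below are illustrative stand-ins, not the kernel's internal API):

    #include <stdbool.h>

    /* Illustrative state flags (assumptions, not the real kernel variables). */
    static bool nonstop_clocksource_usable;   /* 1) kept counting across suspend  */
    static bool persistent_clock_present;     /* 2) RTC readable with irqs off    */

    enum sleeptime_source { SRC_NONSTOP, SRC_PERSISTENT, SRC_RTC };

    /* Which source ends up accounting the time slept? */
    static enum sleeptime_source pick_sleeptime_source(void)
    {
            if (nonstop_clocksource_usable)
                    return SRC_NONSTOP;      /* handled in timekeeping_resume()        */
            if (persistent_clock_present)
                    return SRC_PERSISTENT;   /* also handled in timekeeping_resume()   */
            return SRC_RTC;                  /* rtc_resume() injects it as a fallback  */
    }

When either of the first two branches applies, sleeptime_injected ends up true and timekeeping_rtc_skipresume() tells rtc_resume() to stay out of the way.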
1155 | /** | 1322 | /** |
1156 | * timekeeping_inject_sleeptime64 - Adds suspend interval to timekeeping values | 1323 | * timekeeping_inject_sleeptime64 - Adds suspend interval to timekeeping values |
1157 | * @delta: pointer to a timespec64 delta value | 1324 | * @delta: pointer to a timespec64 delta value |
1158 | * | 1325 | * |
1159 | * This hook is for architectures that cannot support read_persistent_clock | 1326 | * This hook is for architectures that cannot support read_persistent_clock64 |
1160 | * because their RTC/persistent clock is only accessible when irqs are enabled. | 1327 | * because their RTC/persistent clock is only accessible when irqs are enabled. |
1328 | * and also don't have an effective nonstop clocksource. | ||
1161 | * | 1329 | * |
1162 | * This function should only be called by rtc_resume(), and allows | 1330 | * This function should only be called by rtc_resume(), and allows |
1163 | * a suspend offset to be injected into the timekeeping values. | 1331 | * a suspend offset to be injected into the timekeeping values. |
@@ -1167,13 +1335,6 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta) | |||
1167 | struct timekeeper *tk = &tk_core.timekeeper; | 1335 | struct timekeeper *tk = &tk_core.timekeeper; |
1168 | unsigned long flags; | 1336 | unsigned long flags; |
1169 | 1337 | ||
1170 | /* | ||
1171 | * Make sure we don't set the clock twice, as timekeeping_resume() | ||
1172 | * already did it | ||
1173 | */ | ||
1174 | if (has_persistent_clock()) | ||
1175 | return; | ||
1176 | |||
1177 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 1338 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
1178 | write_seqcount_begin(&tk_core.seq); | 1339 | write_seqcount_begin(&tk_core.seq); |
1179 | 1340 | ||
@@ -1189,26 +1350,21 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta) | |||
1189 | /* signal hrtimers about time change */ | 1350 | /* signal hrtimers about time change */ |
1190 | clock_was_set(); | 1351 | clock_was_set(); |
1191 | } | 1352 | } |
1353 | #endif | ||
1192 | 1354 | ||
1193 | /** | 1355 | /** |
1194 | * timekeeping_resume - Resumes the generic timekeeping subsystem. | 1356 | * timekeeping_resume - Resumes the generic timekeeping subsystem. |
1195 | * | ||
1196 | * This is for the generic clocksource timekeeping. | ||
1197 | * xtime/wall_to_monotonic/jiffies/etc are | ||
1198 | * still managed by arch specific suspend/resume code. | ||
1199 | */ | 1357 | */ |
1200 | void timekeeping_resume(void) | 1358 | void timekeeping_resume(void) |
1201 | { | 1359 | { |
1202 | struct timekeeper *tk = &tk_core.timekeeper; | 1360 | struct timekeeper *tk = &tk_core.timekeeper; |
1203 | struct clocksource *clock = tk->tkr.clock; | 1361 | struct clocksource *clock = tk->tkr_mono.clock; |
1204 | unsigned long flags; | 1362 | unsigned long flags; |
1205 | struct timespec64 ts_new, ts_delta; | 1363 | struct timespec64 ts_new, ts_delta; |
1206 | struct timespec tmp; | ||
1207 | cycle_t cycle_now, cycle_delta; | 1364 | cycle_t cycle_now, cycle_delta; |
1208 | bool suspendtime_found = false; | ||
1209 | 1365 | ||
1210 | read_persistent_clock(&tmp); | 1366 | sleeptime_injected = false; |
1211 | ts_new = timespec_to_timespec64(tmp); | 1367 | read_persistent_clock64(&ts_new); |
1212 | 1368 | ||
1213 | clockevents_resume(); | 1369 | clockevents_resume(); |
1214 | clocksource_resume(); | 1370 | clocksource_resume(); |
@@ -1228,16 +1384,16 @@ void timekeeping_resume(void) | |||
1228 | * The less preferred source will only be tried if there is no better | 1384 | * The less preferred source will only be tried if there is no better |
1229 | * usable source. The rtc part is handled separately in rtc core code. | 1385 | * usable source. The rtc part is handled separately in rtc core code. |
1230 | */ | 1386 | */ |
1231 | cycle_now = tk->tkr.read(clock); | 1387 | cycle_now = tk->tkr_mono.read(clock); |
1232 | if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && | 1388 | if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && |
1233 | cycle_now > tk->tkr.cycle_last) { | 1389 | cycle_now > tk->tkr_mono.cycle_last) { |
1234 | u64 num, max = ULLONG_MAX; | 1390 | u64 num, max = ULLONG_MAX; |
1235 | u32 mult = clock->mult; | 1391 | u32 mult = clock->mult; |
1236 | u32 shift = clock->shift; | 1392 | u32 shift = clock->shift; |
1237 | s64 nsec = 0; | 1393 | s64 nsec = 0; |
1238 | 1394 | ||
1239 | cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, | 1395 | cycle_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, |
1240 | tk->tkr.mask); | 1396 | tk->tkr_mono.mask); |
1241 | 1397 | ||
1242 | /* | 1398 | /* |
1243 | * "cycle_delta * mutl" may cause 64 bits overflow, if the | 1399 | * "cycle_delta * mutl" may cause 64 bits overflow, if the |
@@ -1253,17 +1409,19 @@ void timekeeping_resume(void) | |||
1253 | nsec += ((u64) cycle_delta * mult) >> shift; | 1409 | nsec += ((u64) cycle_delta * mult) >> shift; |
1254 | 1410 | ||
1255 | ts_delta = ns_to_timespec64(nsec); | 1411 | ts_delta = ns_to_timespec64(nsec); |
1256 | suspendtime_found = true; | 1412 | sleeptime_injected = true; |
1257 | } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { | 1413 | } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { |
1258 | ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); | 1414 | ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); |
1259 | suspendtime_found = true; | 1415 | sleeptime_injected = true; |
1260 | } | 1416 | } |
1261 | 1417 | ||
1262 | if (suspendtime_found) | 1418 | if (sleeptime_injected) |
1263 | __timekeeping_inject_sleeptime(tk, &ts_delta); | 1419 | __timekeeping_inject_sleeptime(tk, &ts_delta); |
1264 | 1420 | ||
1265 | /* Re-base the last cycle value */ | 1421 | /* Re-base the last cycle value */ |
1266 | tk->tkr.cycle_last = cycle_now; | 1422 | tk->tkr_mono.cycle_last = cycle_now; |
1423 | tk->tkr_raw.cycle_last = cycle_now; | ||
1424 | |||
1267 | tk->ntp_error = 0; | 1425 | tk->ntp_error = 0; |
1268 | timekeeping_suspended = 0; | 1426 | timekeeping_suspended = 0; |
1269 | timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); | 1427 | timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); |
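The hunk above keeps the nonstop-clocksource math split up because, as the comment notes, "cycle_delta * mult" can overflow 64 bits when the suspend is long. A hedged back-of-the-envelope check of how quickly that happens (the 1 GHz counter and shift value of 24 are illustrative assumptions, not taken from any particular clocksource):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* ns = (cycles * mult) >> shift; for a 1 GHz counter with shift = 24,
             * mult is roughly 2^24 since one cycle equals one nanosecond. */
            uint64_t mult = (uint64_t)1 << 24;
            uint64_t max_cycles = UINT64_MAX / mult;       /* ~2^40 cycles      */
            double secs = (double)max_cycles / 1e9;        /* counter at 1 GHz  */

            printf("naive product overflows after ~%.0f s (~%.0f min) of suspend\n",
                   secs, secs / 60.0);
            return 0;
    }

That is roughly 18 minutes of suspend for these assumed values, which is the scenario the careful 64-bit math in timekeeping_resume() guards against.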
@@ -1272,9 +1430,7 @@ void timekeeping_resume(void) | |||
1272 | 1430 | ||
1273 | touch_softlockup_watchdog(); | 1431 | touch_softlockup_watchdog(); |
1274 | 1432 | ||
1275 | clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); | 1433 | tick_resume(); |
1276 | |||
1277 | /* Resume hrtimers */ | ||
1278 | hrtimers_resume(); | 1434 | hrtimers_resume(); |
1279 | } | 1435 | } |
1280 | 1436 | ||
@@ -1284,10 +1440,8 @@ int timekeeping_suspend(void) | |||
1284 | unsigned long flags; | 1440 | unsigned long flags; |
1285 | struct timespec64 delta, delta_delta; | 1441 | struct timespec64 delta, delta_delta; |
1286 | static struct timespec64 old_delta; | 1442 | static struct timespec64 old_delta; |
1287 | struct timespec tmp; | ||
1288 | 1443 | ||
1289 | read_persistent_clock(&tmp); | 1444 | read_persistent_clock64(&timekeeping_suspend_time); |
1290 | timekeeping_suspend_time = timespec_to_timespec64(tmp); | ||
1291 | 1445 | ||
1292 | /* | 1446 | /* |
1293 | * On some systems the persistent_clock can not be detected at | 1447 | * On some systems the persistent_clock can not be detected at |
@@ -1295,31 +1449,33 @@ int timekeeping_suspend(void) | |||
1295 | * value returned, update the persistent_clock_exists flag. | 1449 | * value returned, update the persistent_clock_exists flag. |
1296 | */ | 1450 | */ |
1297 | if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) | 1451 | if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) |
1298 | persistent_clock_exist = true; | 1452 | persistent_clock_exists = true; |
1299 | 1453 | ||
1300 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 1454 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
1301 | write_seqcount_begin(&tk_core.seq); | 1455 | write_seqcount_begin(&tk_core.seq); |
1302 | timekeeping_forward_now(tk); | 1456 | timekeeping_forward_now(tk); |
1303 | timekeeping_suspended = 1; | 1457 | timekeeping_suspended = 1; |
1304 | 1458 | ||
1305 | /* | 1459 | if (persistent_clock_exists) { |
1306 | * To avoid drift caused by repeated suspend/resumes, | ||
1307 | * which each can add ~1 second drift error, | ||
1308 | * try to compensate so the difference in system time | ||
1309 | * and persistent_clock time stays close to constant. | ||
1310 | */ | ||
1311 | delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); | ||
1312 | delta_delta = timespec64_sub(delta, old_delta); | ||
1313 | if (abs(delta_delta.tv_sec) >= 2) { | ||
1314 | /* | 1460 | /* |
1315 | * if delta_delta is too large, assume time correction | 1461 | * To avoid drift caused by repeated suspend/resumes, |
1316 | * has occured and set old_delta to the current delta. | 1462 | * which each can add ~1 second drift error, |
1463 | * try to compensate so the difference in system time | ||
1464 | * and persistent_clock time stays close to constant. | ||
1317 | */ | 1465 | */ |
1318 | old_delta = delta; | 1466 | delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); |
1319 | } else { | 1467 | delta_delta = timespec64_sub(delta, old_delta); |
1320 | /* Otherwise try to adjust old_system to compensate */ | 1468 | if (abs(delta_delta.tv_sec) >= 2) { |
1321 | timekeeping_suspend_time = | 1469 | /* |
1322 | timespec64_add(timekeeping_suspend_time, delta_delta); | 1470 | * if delta_delta is too large, assume time correction |
1471 | * has occurred and set old_delta to the current delta. | ||
1472 | */ | ||
1473 | old_delta = delta; | ||
1474 | } else { | ||
1475 | /* Otherwise try to adjust old_system to compensate */ | ||
1476 | timekeeping_suspend_time = | ||
1477 | timespec64_add(timekeeping_suspend_time, delta_delta); | ||
1478 | } | ||
1323 | } | 1479 | } |
1324 | 1480 | ||
1325 | timekeeping_update(tk, TK_MIRROR); | 1481 | timekeeping_update(tk, TK_MIRROR); |
@@ -1327,7 +1483,7 @@ int timekeeping_suspend(void) | |||
1327 | write_seqcount_end(&tk_core.seq); | 1483 | write_seqcount_end(&tk_core.seq); |
1328 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 1484 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
1329 | 1485 | ||
1330 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); | 1486 | tick_suspend(); |
1331 | clocksource_suspend(); | 1487 | clocksource_suspend(); |
1332 | clockevents_suspend(); | 1488 | clockevents_suspend(); |
1333 | 1489 | ||
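The suspend-side drift compensation above is easier to see with the locking and struct plumbing stripped away. A whole-second sketch of the same idea (simplified to long long seconds; a sketch, not the kernel code):

    #include <stdlib.h>

    static long long old_delta_sec;   /* remembered across suspend cycles */

    static long long compensate_suspend_time(long long xtime_sec,
                                             long long suspend_time_sec)
    {
            long long delta = xtime_sec - suspend_time_sec;   /* system - persistent clock   */
            long long delta_delta = delta - old_delta_sec;    /* change since the last suspend */

            if (llabs(delta_delta) >= 2) {
                    /* A jump this big means the system clock was stepped (NTP, date -s);
                     * rebase rather than "compensating" a deliberate correction away. */
                    old_delta_sec = delta;
            } else {
                    /* Small jitter: fold it into the recorded suspend timestamp so the
                     * resume path does not keep re-adding it as sleep time. */
                    suspend_time_sec += delta_delta;
            }
            return suspend_time_sec;
    }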
@@ -1416,15 +1572,15 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, | |||
1416 | * | 1572 | * |
1417 | * XXX - TODO: Doc ntp_error calculation. | 1573 | * XXX - TODO: Doc ntp_error calculation. |
1418 | */ | 1574 | */ |
1419 | if ((mult_adj > 0) && (tk->tkr.mult + mult_adj < mult_adj)) { | 1575 | if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) { |
1420 | /* NTP adjustment caused clocksource mult overflow */ | 1576 | /* NTP adjustment caused clocksource mult overflow */ |
1421 | WARN_ON_ONCE(1); | 1577 | WARN_ON_ONCE(1); |
1422 | return; | 1578 | return; |
1423 | } | 1579 | } |
1424 | 1580 | ||
1425 | tk->tkr.mult += mult_adj; | 1581 | tk->tkr_mono.mult += mult_adj; |
1426 | tk->xtime_interval += interval; | 1582 | tk->xtime_interval += interval; |
1427 | tk->tkr.xtime_nsec -= offset; | 1583 | tk->tkr_mono.xtime_nsec -= offset; |
1428 | tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; | 1584 | tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; |
1429 | } | 1585 | } |
1430 | 1586 | ||
@@ -1486,13 +1642,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | |||
1486 | tk->ntp_err_mult = 0; | 1642 | tk->ntp_err_mult = 0; |
1487 | } | 1643 | } |
1488 | 1644 | ||
1489 | if (unlikely(tk->tkr.clock->maxadj && | 1645 | if (unlikely(tk->tkr_mono.clock->maxadj && |
1490 | (abs(tk->tkr.mult - tk->tkr.clock->mult) | 1646 | (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult) |
1491 | > tk->tkr.clock->maxadj))) { | 1647 | > tk->tkr_mono.clock->maxadj))) { |
1492 | printk_once(KERN_WARNING | 1648 | printk_once(KERN_WARNING |
1493 | "Adjusting %s more than 11%% (%ld vs %ld)\n", | 1649 | "Adjusting %s more than 11%% (%ld vs %ld)\n", |
1494 | tk->tkr.clock->name, (long)tk->tkr.mult, | 1650 | tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult, |
1495 | (long)tk->tkr.clock->mult + tk->tkr.clock->maxadj); | 1651 | (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj); |
1496 | } | 1652 | } |
1497 | 1653 | ||
1498 | /* | 1654 | /* |
@@ -1509,9 +1665,9 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | |||
1509 | * We'll correct this error next time through this function, when | 1665 | * We'll correct this error next time through this function, when |
1510 | * xtime_nsec is not as small. | 1666 | * xtime_nsec is not as small. |
1511 | */ | 1667 | */ |
1512 | if (unlikely((s64)tk->tkr.xtime_nsec < 0)) { | 1668 | if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) { |
1513 | s64 neg = -(s64)tk->tkr.xtime_nsec; | 1669 | s64 neg = -(s64)tk->tkr_mono.xtime_nsec; |
1514 | tk->tkr.xtime_nsec = 0; | 1670 | tk->tkr_mono.xtime_nsec = 0; |
1515 | tk->ntp_error += neg << tk->ntp_error_shift; | 1671 | tk->ntp_error += neg << tk->ntp_error_shift; |
1516 | } | 1672 | } |
1517 | } | 1673 | } |
@@ -1526,13 +1682,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | |||
1526 | */ | 1682 | */ |
1527 | static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) | 1683 | static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) |
1528 | { | 1684 | { |
1529 | u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift; | 1685 | u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift; |
1530 | unsigned int clock_set = 0; | 1686 | unsigned int clock_set = 0; |
1531 | 1687 | ||
1532 | while (tk->tkr.xtime_nsec >= nsecps) { | 1688 | while (tk->tkr_mono.xtime_nsec >= nsecps) { |
1533 | int leap; | 1689 | int leap; |
1534 | 1690 | ||
1535 | tk->tkr.xtime_nsec -= nsecps; | 1691 | tk->tkr_mono.xtime_nsec -= nsecps; |
1536 | tk->xtime_sec++; | 1692 | tk->xtime_sec++; |
1537 | 1693 | ||
1538 | /* Figure out if its a leap sec and apply if needed */ | 1694 | /* Figure out if its a leap sec and apply if needed */ |
@@ -1577,9 +1733,10 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, | |||
1577 | 1733 | ||
1578 | /* Accumulate one shifted interval */ | 1734 | /* Accumulate one shifted interval */ |
1579 | offset -= interval; | 1735 | offset -= interval; |
1580 | tk->tkr.cycle_last += interval; | 1736 | tk->tkr_mono.cycle_last += interval; |
1737 | tk->tkr_raw.cycle_last += interval; | ||
1581 | 1738 | ||
1582 | tk->tkr.xtime_nsec += tk->xtime_interval << shift; | 1739 | tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift; |
1583 | *clock_set |= accumulate_nsecs_to_secs(tk); | 1740 | *clock_set |= accumulate_nsecs_to_secs(tk); |
1584 | 1741 | ||
1585 | /* Accumulate raw time */ | 1742 | /* Accumulate raw time */ |
@@ -1622,14 +1779,17 @@ void update_wall_time(void) | |||
1622 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET | 1779 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET |
1623 | offset = real_tk->cycle_interval; | 1780 | offset = real_tk->cycle_interval; |
1624 | #else | 1781 | #else |
1625 | offset = clocksource_delta(tk->tkr.read(tk->tkr.clock), | 1782 | offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock), |
1626 | tk->tkr.cycle_last, tk->tkr.mask); | 1783 | tk->tkr_mono.cycle_last, tk->tkr_mono.mask); |
1627 | #endif | 1784 | #endif |
1628 | 1785 | ||
1629 | /* Check if there's really nothing to do */ | 1786 | /* Check if there's really nothing to do */ |
1630 | if (offset < real_tk->cycle_interval) | 1787 | if (offset < real_tk->cycle_interval) |
1631 | goto out; | 1788 | goto out; |
1632 | 1789 | ||
1790 | /* Do some additional sanity checking */ | ||
1791 | timekeeping_check_update(real_tk, offset); | ||
1792 | |||
1633 | /* | 1793 | /* |
1634 | * With NO_HZ we may have to accumulate many cycle_intervals | 1794 | * With NO_HZ we may have to accumulate many cycle_intervals |
1635 | * (think "ticks") worth of time at once. To do this efficiently, | 1795 | * (think "ticks") worth of time at once. To do this efficiently, |
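With NO_HZ the backlog can be many tick intervals long, and update_wall_time() works it off in power-of-two multiples of the base interval rather than one tick at a time. A rough standalone sketch of that loop shape (an assumption-level simplification: no NTP tick-length clamping and no shift overflow guard):

    /* Consume "offset" cycles in chunks of interval << shift, biggest chunks first. */
    static unsigned long long accumulate(unsigned long long offset,
                                         unsigned long long interval)
    {
            int shift = 0;

            /* Start with the largest shift whose chunk still fits the backlog. */
            while ((interval << (shift + 1)) <= offset)
                    shift++;

            while (offset >= interval) {
                    while (offset < (interval << shift))
                            shift--;                     /* drop to a smaller chunk   */
                    offset -= interval << shift;         /* one accumulation step     */
            }
            return offset;                               /* remainder < one interval  */
    }

Each step corresponds to one logarithmic_accumulation() call above, which advances cycle_last by the same amount it removes from offset and bumps xtime_nsec by the shifted xtime_interval.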
@@ -1784,8 +1944,8 @@ ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot, | |||
1784 | do { | 1944 | do { |
1785 | seq = read_seqcount_begin(&tk_core.seq); | 1945 | seq = read_seqcount_begin(&tk_core.seq); |
1786 | 1946 | ||
1787 | base = tk->tkr.base_mono; | 1947 | base = tk->tkr_mono.base; |
1788 | nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift; | 1948 | nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; |
1789 | 1949 | ||
1790 | *offs_real = tk->offs_real; | 1950 | *offs_real = tk->offs_real; |
1791 | *offs_boot = tk->offs_boot; | 1951 | *offs_boot = tk->offs_boot; |
@@ -1816,8 +1976,8 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, | |||
1816 | do { | 1976 | do { |
1817 | seq = read_seqcount_begin(&tk_core.seq); | 1977 | seq = read_seqcount_begin(&tk_core.seq); |
1818 | 1978 | ||
1819 | base = tk->tkr.base_mono; | 1979 | base = tk->tkr_mono.base; |
1820 | nsecs = timekeeping_get_ns(&tk->tkr); | 1980 | nsecs = timekeeping_get_ns(&tk->tkr_mono); |
1821 | 1981 | ||
1822 | *offs_real = tk->offs_real; | 1982 | *offs_real = tk->offs_real; |
1823 | *offs_boot = tk->offs_boot; | 1983 | *offs_boot = tk->offs_boot; |
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 1d91416055d5..ead8794b9a4e 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h | |||
@@ -19,4 +19,11 @@ extern void timekeeping_clocktai(struct timespec *ts); | |||
19 | extern int timekeeping_suspend(void); | 19 | extern int timekeeping_suspend(void); |
20 | extern void timekeeping_resume(void); | 20 | extern void timekeeping_resume(void); |
21 | 21 | ||
22 | extern void do_timer(unsigned long ticks); | ||
23 | extern void update_wall_time(void); | ||
24 | |||
25 | extern seqlock_t jiffies_lock; | ||
26 | |||
27 | #define CS_NAME_LEN 32 | ||
28 | |||
22 | #endif | 29 | #endif |
diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2d3f5c504939..2ece3aa5069c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c | |||
@@ -90,8 +90,18 @@ struct tvec_base { | |||
90 | struct tvec tv5; | 90 | struct tvec tv5; |
91 | } ____cacheline_aligned; | 91 | } ____cacheline_aligned; |
92 | 92 | ||
93 | /* | ||
94 | * __TIMER_INITIALIZER() needs to set ->base to a valid pointer (because we've | ||
95 | * made NULL special, hint: lock_timer_base()) and we cannot get a compile time | ||
96 | * pointer to per-cpu entries because we don't know where we'll map the section, | ||
97 | * even for the boot cpu. | ||
98 | * | ||
99 | * And so we use boot_tvec_bases for boot CPU and per-cpu __tvec_bases for the | ||
100 | * rest of them. | ||
101 | */ | ||
93 | struct tvec_base boot_tvec_bases; | 102 | struct tvec_base boot_tvec_bases; |
94 | EXPORT_SYMBOL(boot_tvec_bases); | 103 | EXPORT_SYMBOL(boot_tvec_bases); |
104 | |||
95 | static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; | 105 | static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; |
96 | 106 | ||
97 | /* Functions below help us manage 'deferrable' flag */ | 107 | /* Functions below help us manage 'deferrable' flag */ |
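The comment above is about statically defined timers: they exist before any allocator or per-cpu area does, so their ->base field has to point at something real at compile time, and only boot_tvec_bases has a compile-time address. A simplified sketch of what an initializer like __TIMER_INITIALIZER() therefore has to produce (field names trimmed down; not the full kernel macro):

    struct tvec_base;                          /* opaque for this sketch            */
    extern struct tvec_base boot_tvec_bases;   /* the one compile-time address      */

    struct timer_list_sketch {
            struct tvec_base *base;            /* NULL is reserved by lock_timer_base() */
            void (*function)(unsigned long);
            unsigned long data;
    };

    #define TIMER_INITIALIZER_SKETCH(fn, d) {          \
            .base     = &boot_tvec_bases,              \
            .function = (fn),                          \
            .data     = (d),                           \
    }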
@@ -1027,6 +1037,8 @@ int try_to_del_timer_sync(struct timer_list *timer) | |||
1027 | EXPORT_SYMBOL(try_to_del_timer_sync); | 1037 | EXPORT_SYMBOL(try_to_del_timer_sync); |
1028 | 1038 | ||
1029 | #ifdef CONFIG_SMP | 1039 | #ifdef CONFIG_SMP |
1040 | static DEFINE_PER_CPU(struct tvec_base, __tvec_bases); | ||
1041 | |||
1030 | /** | 1042 | /** |
1031 | * del_timer_sync - deactivate a timer and wait for the handler to finish. | 1043 | * del_timer_sync - deactivate a timer and wait for the handler to finish. |
1032 | * @timer: the timer to be deactivated | 1044 | * @timer: the timer to be deactivated |
@@ -1532,64 +1544,6 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) | |||
1532 | } | 1544 | } |
1533 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); | 1545 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); |
1534 | 1546 | ||
1535 | static int init_timers_cpu(int cpu) | ||
1536 | { | ||
1537 | int j; | ||
1538 | struct tvec_base *base; | ||
1539 | static char tvec_base_done[NR_CPUS]; | ||
1540 | |||
1541 | if (!tvec_base_done[cpu]) { | ||
1542 | static char boot_done; | ||
1543 | |||
1544 | if (boot_done) { | ||
1545 | /* | ||
1546 | * The APs use this path later in boot | ||
1547 | */ | ||
1548 | base = kzalloc_node(sizeof(*base), GFP_KERNEL, | ||
1549 | cpu_to_node(cpu)); | ||
1550 | if (!base) | ||
1551 | return -ENOMEM; | ||
1552 | |||
1553 | /* Make sure tvec_base has TIMER_FLAG_MASK bits free */ | ||
1554 | if (WARN_ON(base != tbase_get_base(base))) { | ||
1555 | kfree(base); | ||
1556 | return -ENOMEM; | ||
1557 | } | ||
1558 | per_cpu(tvec_bases, cpu) = base; | ||
1559 | } else { | ||
1560 | /* | ||
1561 | * This is for the boot CPU - we use compile-time | ||
1562 | * static initialisation because per-cpu memory isn't | ||
1563 | * ready yet and because the memory allocators are not | ||
1564 | * initialised either. | ||
1565 | */ | ||
1566 | boot_done = 1; | ||
1567 | base = &boot_tvec_bases; | ||
1568 | } | ||
1569 | spin_lock_init(&base->lock); | ||
1570 | tvec_base_done[cpu] = 1; | ||
1571 | base->cpu = cpu; | ||
1572 | } else { | ||
1573 | base = per_cpu(tvec_bases, cpu); | ||
1574 | } | ||
1575 | |||
1576 | |||
1577 | for (j = 0; j < TVN_SIZE; j++) { | ||
1578 | INIT_LIST_HEAD(base->tv5.vec + j); | ||
1579 | INIT_LIST_HEAD(base->tv4.vec + j); | ||
1580 | INIT_LIST_HEAD(base->tv3.vec + j); | ||
1581 | INIT_LIST_HEAD(base->tv2.vec + j); | ||
1582 | } | ||
1583 | for (j = 0; j < TVR_SIZE; j++) | ||
1584 | INIT_LIST_HEAD(base->tv1.vec + j); | ||
1585 | |||
1586 | base->timer_jiffies = jiffies; | ||
1587 | base->next_timer = base->timer_jiffies; | ||
1588 | base->active_timers = 0; | ||
1589 | base->all_timers = 0; | ||
1590 | return 0; | ||
1591 | } | ||
1592 | |||
1593 | #ifdef CONFIG_HOTPLUG_CPU | 1547 | #ifdef CONFIG_HOTPLUG_CPU |
1594 | static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) | 1548 | static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) |
1595 | { | 1549 | { |
@@ -1631,55 +1585,86 @@ static void migrate_timers(int cpu) | |||
1631 | migrate_timer_list(new_base, old_base->tv5.vec + i); | 1585 | migrate_timer_list(new_base, old_base->tv5.vec + i); |
1632 | } | 1586 | } |
1633 | 1587 | ||
1588 | old_base->active_timers = 0; | ||
1589 | old_base->all_timers = 0; | ||
1590 | |||
1634 | spin_unlock(&old_base->lock); | 1591 | spin_unlock(&old_base->lock); |
1635 | spin_unlock_irq(&new_base->lock); | 1592 | spin_unlock_irq(&new_base->lock); |
1636 | put_cpu_var(tvec_bases); | 1593 | put_cpu_var(tvec_bases); |
1637 | } | 1594 | } |
1638 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
1639 | 1595 | ||
1640 | static int timer_cpu_notify(struct notifier_block *self, | 1596 | static int timer_cpu_notify(struct notifier_block *self, |
1641 | unsigned long action, void *hcpu) | 1597 | unsigned long action, void *hcpu) |
1642 | { | 1598 | { |
1643 | long cpu = (long)hcpu; | 1599 | switch (action) { |
1644 | int err; | ||
1645 | |||
1646 | switch(action) { | ||
1647 | case CPU_UP_PREPARE: | ||
1648 | case CPU_UP_PREPARE_FROZEN: | ||
1649 | err = init_timers_cpu(cpu); | ||
1650 | if (err < 0) | ||
1651 | return notifier_from_errno(err); | ||
1652 | break; | ||
1653 | #ifdef CONFIG_HOTPLUG_CPU | ||
1654 | case CPU_DEAD: | 1600 | case CPU_DEAD: |
1655 | case CPU_DEAD_FROZEN: | 1601 | case CPU_DEAD_FROZEN: |
1656 | migrate_timers(cpu); | 1602 | migrate_timers((long)hcpu); |
1657 | break; | 1603 | break; |
1658 | #endif | ||
1659 | default: | 1604 | default: |
1660 | break; | 1605 | break; |
1661 | } | 1606 | } |
1607 | |||
1662 | return NOTIFY_OK; | 1608 | return NOTIFY_OK; |
1663 | } | 1609 | } |
1664 | 1610 | ||
1665 | static struct notifier_block timers_nb = { | 1611 | static inline void timer_register_cpu_notifier(void) |
1666 | .notifier_call = timer_cpu_notify, | 1612 | { |
1667 | }; | 1613 | cpu_notifier(timer_cpu_notify, 0); |
1614 | } | ||
1615 | #else | ||
1616 | static inline void timer_register_cpu_notifier(void) { } | ||
1617 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
1668 | 1618 | ||
1619 | static void __init init_timer_cpu(struct tvec_base *base, int cpu) | ||
1620 | { | ||
1621 | int j; | ||
1669 | 1622 | ||
1670 | void __init init_timers(void) | 1623 | BUG_ON(base != tbase_get_base(base)); |
1624 | |||
1625 | base->cpu = cpu; | ||
1626 | per_cpu(tvec_bases, cpu) = base; | ||
1627 | spin_lock_init(&base->lock); | ||
1628 | |||
1629 | for (j = 0; j < TVN_SIZE; j++) { | ||
1630 | INIT_LIST_HEAD(base->tv5.vec + j); | ||
1631 | INIT_LIST_HEAD(base->tv4.vec + j); | ||
1632 | INIT_LIST_HEAD(base->tv3.vec + j); | ||
1633 | INIT_LIST_HEAD(base->tv2.vec + j); | ||
1634 | } | ||
1635 | for (j = 0; j < TVR_SIZE; j++) | ||
1636 | INIT_LIST_HEAD(base->tv1.vec + j); | ||
1637 | |||
1638 | base->timer_jiffies = jiffies; | ||
1639 | base->next_timer = base->timer_jiffies; | ||
1640 | } | ||
1641 | |||
1642 | static void __init init_timer_cpus(void) | ||
1671 | { | 1643 | { |
1672 | int err; | 1644 | struct tvec_base *base; |
1645 | int local_cpu = smp_processor_id(); | ||
1646 | int cpu; | ||
1673 | 1647 | ||
1648 | for_each_possible_cpu(cpu) { | ||
1649 | if (cpu == local_cpu) | ||
1650 | base = &boot_tvec_bases; | ||
1651 | #ifdef CONFIG_SMP | ||
1652 | else | ||
1653 | base = per_cpu_ptr(&__tvec_bases, cpu); | ||
1654 | #endif | ||
1655 | |||
1656 | init_timer_cpu(base, cpu); | ||
1657 | } | ||
1658 | } | ||
1659 | |||
1660 | void __init init_timers(void) | ||
1661 | { | ||
1674 | /* ensure there are enough low bits for flags in timer->base pointer */ | 1662 | /* ensure there are enough low bits for flags in timer->base pointer */ |
1675 | BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK); | 1663 | BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK); |
1676 | 1664 | ||
1677 | err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, | 1665 | init_timer_cpus(); |
1678 | (void *)(long)smp_processor_id()); | ||
1679 | BUG_ON(err != NOTIFY_OK); | ||
1680 | |||
1681 | init_timer_stats(); | 1666 | init_timer_stats(); |
1682 | register_cpu_notifier(&timers_nb); | 1667 | timer_register_cpu_notifier(); |
1683 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq); | 1668 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq); |
1684 | } | 1669 | } |
1685 | 1670 | ||
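Both the BUG_ON(base != tbase_get_base(base)) check in init_timer_cpu() and the BUILD_BUG_ON() in init_timers() exist because timer->base carries flag bits in its low-order bits, which only works if every tvec_base is sufficiently aligned. A small standalone illustration of that pointer-tagging scheme (the mask value and helper names are illustrative, not the kernel's exact definitions):

    #include <assert.h>
    #include <stdint.h>

    #define FLAG_MASK 0x3UL   /* two low bits available for flags (illustrative) */

    struct base_sketch { int dummy; } __attribute__((aligned(4)));

    static inline struct base_sketch *get_base(uintptr_t word)
    {
            return (struct base_sketch *)(word & ~FLAG_MASK);
    }

    int main(void)
    {
            static struct base_sketch base;
            uintptr_t word = (uintptr_t)&base | 0x1UL;   /* pointer plus one flag bit */

            assert(get_base(word) == &base);             /* flags strip off cleanly   */
            assert((word & FLAG_MASK) == 0x1UL);
            return 0;
    }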
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 61ed862cdd37..e878c2e0ba45 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c | |||
@@ -16,10 +16,10 @@ | |||
16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
17 | #include <linux/seq_file.h> | 17 | #include <linux/seq_file.h> |
18 | #include <linux/kallsyms.h> | 18 | #include <linux/kallsyms.h> |
19 | #include <linux/tick.h> | ||
20 | 19 | ||
21 | #include <asm/uaccess.h> | 20 | #include <asm/uaccess.h> |
22 | 21 | ||
22 | #include "tick-internal.h" | ||
23 | 23 | ||
24 | struct timer_list_iter { | 24 | struct timer_list_iter { |
25 | int cpu; | 25 | int cpu; |
@@ -228,9 +228,35 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) | |||
228 | print_name_offset(m, dev->set_next_event); | 228 | print_name_offset(m, dev->set_next_event); |
229 | SEQ_printf(m, "\n"); | 229 | SEQ_printf(m, "\n"); |
230 | 230 | ||
231 | SEQ_printf(m, " set_mode: "); | 231 | if (dev->set_mode) { |
232 | print_name_offset(m, dev->set_mode); | 232 | SEQ_printf(m, " set_mode: "); |
233 | SEQ_printf(m, "\n"); | 233 | print_name_offset(m, dev->set_mode); |
234 | SEQ_printf(m, "\n"); | ||
235 | } else { | ||
236 | if (dev->set_state_shutdown) { | ||
237 | SEQ_printf(m, " shutdown: "); | ||
238 | print_name_offset(m, dev->set_state_shutdown); | ||
239 | SEQ_printf(m, "\n"); | ||
240 | } | ||
241 | |||
242 | if (dev->set_state_periodic) { | ||
243 | SEQ_printf(m, " periodic: "); | ||
244 | print_name_offset(m, dev->set_state_periodic); | ||
245 | SEQ_printf(m, "\n"); | ||
246 | } | ||
247 | |||
248 | if (dev->set_state_oneshot) { | ||
249 | SEQ_printf(m, " oneshot: "); | ||
250 | print_name_offset(m, dev->set_state_oneshot); | ||
251 | SEQ_printf(m, "\n"); | ||
252 | } | ||
253 | |||
254 | if (dev->tick_resume) { | ||
255 | SEQ_printf(m, " resume: "); | ||
256 | print_name_offset(m, dev->tick_resume); | ||
257 | SEQ_printf(m, "\n"); | ||
258 | } | ||
259 | } | ||
234 | 260 | ||
235 | SEQ_printf(m, " event_handler: "); | 261 | SEQ_printf(m, " event_handler: "); |
236 | print_name_offset(m, dev->event_handler); | 262 | print_name_offset(m, dev->event_handler); |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a5da09c899dd..fedbdd7d5d1e 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -599,6 +599,34 @@ config RING_BUFFER_STARTUP_TEST | |||
599 | 599 | ||
600 | If unsure, say N | 600 | If unsure, say N |
601 | 601 | ||
602 | config TRACE_ENUM_MAP_FILE | ||
603 | bool "Show enum mappings for trace events" | ||
604 | depends on TRACING | ||
605 | help | ||
606 | The "print fmt" of the trace events will show the enum names instead | ||
607 | of their values. This can cause problems for user space tools that | ||
608 | use this string to parse the raw data as user space does not know | ||
609 | how to convert the string to its value. | ||
610 | |||
611 | To fix this, there's a special macro in the kernel that can be used | ||
612 | to convert the enum into its value. If this macro is used, then the | ||
613 | print fmt strings will have the enums converted to their values. | ||
614 | |||
615 | If something does not get converted properly, this option can be | ||
616 | used to show what enums the kernel tried to convert. | ||
617 | |||
618 | This option is for debugging the enum conversions. A file is created | ||
619 | in the tracing directory called "enum_map" that will show the enum | ||
620 | names matched with their values and what trace event system they | ||
621 | belong to. | ||
622 | |||
623 | Normally, the mapping of the strings to values will be freed after | ||
624 | boot up or module load. With this option, they will not be freed, as | ||
625 | they are needed for the "enum_map" file. Enabling this option will | ||
626 | increase the memory footprint of the running kernel. | ||
627 | |||
628 | If unsure, say N | ||
629 | |||
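The "special macro" the help text refers to is TRACE_DEFINE_ENUM(), placed in a trace event header next to the TRACE_EVENT() definitions. A hedged fragment showing the typical shape (the subsystem, event and enum names are made up for illustration, and this is a header fragment rather than a standalone file):

    /* In the subsystem's trace event header, after declaring the enum: */
    enum foo_state { FOO_IDLE, FOO_RUNNING, FOO_DEAD };

    TRACE_DEFINE_ENUM(FOO_IDLE);
    TRACE_DEFINE_ENUM(FOO_RUNNING);
    TRACE_DEFINE_ENUM(FOO_DEAD);

    TRACE_EVENT(foo_state_change,
            TP_PROTO(int state),
            TP_ARGS(state),
            TP_STRUCT__entry(__field(int, state)),
            TP_fast_assign(__entry->state = state;),
            /* With TRACE_DEFINE_ENUM() the symbols below are rewritten to their
             * numeric values in the event's "print fmt", so user space parsers work. */
            TP_printk("state=%s", __print_symbolic(__entry->state,
                            { FOO_IDLE,    "idle"    },
                            { FOO_RUNNING, "running" },
                            { FOO_DEAD,    "dead"    }))
    );

With CONFIG_TRACE_ENUM_MAP_FILE=y, those mappings are also kept after boot and listed in the "enum_map" file described above, tagged with the event system they belong to.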
602 | endif # FTRACE | 630 | endif # FTRACE |
603 | 631 | ||
604 | endif # TRACING_SUPPORT | 632 | endif # TRACING_SUPPORT |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4f228024055b..02bece4a99ea 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -18,7 +18,7 @@ | |||
18 | #include <linux/kallsyms.h> | 18 | #include <linux/kallsyms.h> |
19 | #include <linux/seq_file.h> | 19 | #include <linux/seq_file.h> |
20 | #include <linux/suspend.h> | 20 | #include <linux/suspend.h> |
21 | #include <linux/debugfs.h> | 21 | #include <linux/tracefs.h> |
22 | #include <linux/hardirq.h> | 22 | #include <linux/hardirq.h> |
23 | #include <linux/kthread.h> | 23 | #include <linux/kthread.h> |
24 | #include <linux/uaccess.h> | 24 | #include <linux/uaccess.h> |
@@ -249,6 +249,19 @@ static void update_function_graph_func(void); | |||
249 | static inline void update_function_graph_func(void) { } | 249 | static inline void update_function_graph_func(void) { } |
250 | #endif | 250 | #endif |
251 | 251 | ||
252 | |||
253 | static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) | ||
254 | { | ||
255 | /* | ||
256 | * If this is a dynamic ops or we force list func, | ||
257 | * then it needs to call the list anyway. | ||
258 | */ | ||
259 | if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) | ||
260 | return ftrace_ops_list_func; | ||
261 | |||
262 | return ftrace_ops_get_func(ops); | ||
263 | } | ||
264 | |||
252 | static void update_ftrace_function(void) | 265 | static void update_ftrace_function(void) |
253 | { | 266 | { |
254 | ftrace_func_t func; | 267 | ftrace_func_t func; |
@@ -270,7 +283,7 @@ static void update_ftrace_function(void) | |||
270 | * then have the mcount trampoline call the function directly. | 283 | * then have the mcount trampoline call the function directly. |
271 | */ | 284 | */ |
272 | } else if (ftrace_ops_list->next == &ftrace_list_end) { | 285 | } else if (ftrace_ops_list->next == &ftrace_list_end) { |
273 | func = ftrace_ops_get_func(ftrace_ops_list); | 286 | func = ftrace_ops_get_list_func(ftrace_ops_list); |
274 | 287 | ||
275 | } else { | 288 | } else { |
276 | /* Just use the default ftrace_ops */ | 289 | /* Just use the default ftrace_ops */ |
@@ -1008,7 +1021,7 @@ static struct tracer_stat function_stats __initdata = { | |||
1008 | .stat_show = function_stat_show | 1021 | .stat_show = function_stat_show |
1009 | }; | 1022 | }; |
1010 | 1023 | ||
1011 | static __init void ftrace_profile_debugfs(struct dentry *d_tracer) | 1024 | static __init void ftrace_profile_tracefs(struct dentry *d_tracer) |
1012 | { | 1025 | { |
1013 | struct ftrace_profile_stat *stat; | 1026 | struct ftrace_profile_stat *stat; |
1014 | struct dentry *entry; | 1027 | struct dentry *entry; |
@@ -1044,15 +1057,15 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer) | |||
1044 | } | 1057 | } |
1045 | } | 1058 | } |
1046 | 1059 | ||
1047 | entry = debugfs_create_file("function_profile_enabled", 0644, | 1060 | entry = tracefs_create_file("function_profile_enabled", 0644, |
1048 | d_tracer, NULL, &ftrace_profile_fops); | 1061 | d_tracer, NULL, &ftrace_profile_fops); |
1049 | if (!entry) | 1062 | if (!entry) |
1050 | pr_warning("Could not create debugfs " | 1063 | pr_warning("Could not create tracefs " |
1051 | "'function_profile_enabled' entry\n"); | 1064 | "'function_profile_enabled' entry\n"); |
1052 | } | 1065 | } |
1053 | 1066 | ||
1054 | #else /* CONFIG_FUNCTION_PROFILER */ | 1067 | #else /* CONFIG_FUNCTION_PROFILER */ |
1055 | static __init void ftrace_profile_debugfs(struct dentry *d_tracer) | 1068 | static __init void ftrace_profile_tracefs(struct dentry *d_tracer) |
1056 | { | 1069 | { |
1057 | } | 1070 | } |
1058 | #endif /* CONFIG_FUNCTION_PROFILER */ | 1071 | #endif /* CONFIG_FUNCTION_PROFILER */ |
@@ -4712,7 +4725,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops) | |||
4712 | mutex_unlock(&ftrace_lock); | 4725 | mutex_unlock(&ftrace_lock); |
4713 | } | 4726 | } |
4714 | 4727 | ||
4715 | static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) | 4728 | static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer) |
4716 | { | 4729 | { |
4717 | 4730 | ||
4718 | trace_create_file("available_filter_functions", 0444, | 4731 | trace_create_file("available_filter_functions", 0444, |
@@ -5020,7 +5033,7 @@ static int __init ftrace_nodyn_init(void) | |||
5020 | } | 5033 | } |
5021 | core_initcall(ftrace_nodyn_init); | 5034 | core_initcall(ftrace_nodyn_init); |
5022 | 5035 | ||
5023 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } | 5036 | static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; } |
5024 | static inline void ftrace_startup_enable(int command) { } | 5037 | static inline void ftrace_startup_enable(int command) { } |
5025 | static inline void ftrace_startup_all(int command) { } | 5038 | static inline void ftrace_startup_all(int command) { } |
5026 | /* Keep as macros so we do not need to define the commands */ | 5039 | /* Keep as macros so we do not need to define the commands */ |
@@ -5209,13 +5222,6 @@ static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, | |||
5209 | ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) | 5222 | ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) |
5210 | { | 5223 | { |
5211 | /* | 5224 | /* |
5212 | * If this is a dynamic ops or we force list func, | ||
5213 | * then it needs to call the list anyway. | ||
5214 | */ | ||
5215 | if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) | ||
5216 | return ftrace_ops_list_func; | ||
5217 | |||
5218 | /* | ||
5219 | * If the func handles its own recursion, call it directly. | 5225 | * If the func handles its own recursion, call it directly. |
5220 | * Otherwise call the recursion protected function that | 5226 | * Otherwise call the recursion protected function that |
5221 | * will call the ftrace ops function. | 5227 | * will call the ftrace ops function. |
@@ -5473,7 +5479,7 @@ static const struct file_operations ftrace_pid_fops = { | |||
5473 | .release = ftrace_pid_release, | 5479 | .release = ftrace_pid_release, |
5474 | }; | 5480 | }; |
5475 | 5481 | ||
5476 | static __init int ftrace_init_debugfs(void) | 5482 | static __init int ftrace_init_tracefs(void) |
5477 | { | 5483 | { |
5478 | struct dentry *d_tracer; | 5484 | struct dentry *d_tracer; |
5479 | 5485 | ||
@@ -5481,16 +5487,16 @@ static __init int ftrace_init_debugfs(void) | |||
5481 | if (IS_ERR(d_tracer)) | 5487 | if (IS_ERR(d_tracer)) |
5482 | return 0; | 5488 | return 0; |
5483 | 5489 | ||
5484 | ftrace_init_dyn_debugfs(d_tracer); | 5490 | ftrace_init_dyn_tracefs(d_tracer); |
5485 | 5491 | ||
5486 | trace_create_file("set_ftrace_pid", 0644, d_tracer, | 5492 | trace_create_file("set_ftrace_pid", 0644, d_tracer, |
5487 | NULL, &ftrace_pid_fops); | 5493 | NULL, &ftrace_pid_fops); |
5488 | 5494 | ||
5489 | ftrace_profile_debugfs(d_tracer); | 5495 | ftrace_profile_tracefs(d_tracer); |
5490 | 5496 | ||
5491 | return 0; | 5497 | return 0; |
5492 | } | 5498 | } |
5493 | fs_initcall(ftrace_init_debugfs); | 5499 | fs_initcall(ftrace_init_tracefs); |
5494 | 5500 | ||
5495 | /** | 5501 | /** |
5496 | * ftrace_kill - kill ftrace | 5502 | * ftrace_kill - kill ftrace |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 5040d44fe5a3..0315d43176d8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -2679,7 +2679,7 @@ static DEFINE_PER_CPU(unsigned int, current_context); | |||
2679 | 2679 | ||
2680 | static __always_inline int trace_recursive_lock(void) | 2680 | static __always_inline int trace_recursive_lock(void) |
2681 | { | 2681 | { |
2682 | unsigned int val = this_cpu_read(current_context); | 2682 | unsigned int val = __this_cpu_read(current_context); |
2683 | int bit; | 2683 | int bit; |
2684 | 2684 | ||
2685 | if (in_interrupt()) { | 2685 | if (in_interrupt()) { |
@@ -2696,18 +2696,14 @@ static __always_inline int trace_recursive_lock(void) | |||
2696 | return 1; | 2696 | return 1; |
2697 | 2697 | ||
2698 | val |= (1 << bit); | 2698 | val |= (1 << bit); |
2699 | this_cpu_write(current_context, val); | 2699 | __this_cpu_write(current_context, val); |
2700 | 2700 | ||
2701 | return 0; | 2701 | return 0; |
2702 | } | 2702 | } |
2703 | 2703 | ||
2704 | static __always_inline void trace_recursive_unlock(void) | 2704 | static __always_inline void trace_recursive_unlock(void) |
2705 | { | 2705 | { |
2706 | unsigned int val = this_cpu_read(current_context); | 2706 | __this_cpu_and(current_context, __this_cpu_read(current_context) - 1); |
2707 | |||
2708 | val--; | ||
2709 | val &= this_cpu_read(current_context); | ||
2710 | this_cpu_write(current_context, val); | ||
2711 | } | 2707 | } |
2712 | 2708 | ||
2713 | #else | 2709 | #else |
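The rewritten trace_recursive_unlock() folds the old read/decrement/and/write sequence into a single __this_cpu_and() using the classic x & (x - 1) identity, which clears the lowest set bit, i.e. the bit set by the innermost context given how the lock path numbers them. A quick standalone check of the identity (the particular bit numbering here is illustrative):

    #include <assert.h>

    int main(void)
    {
            /* Two context bits set, e.g. an interrupt nested inside normal context. */
            unsigned int val = (1u << 3) | (1u << 1);   /* 0b1010 */

            val &= val - 1;                             /* clears the lowest set bit */
            assert(val == (1u << 3));                   /* 0b1000                    */
            return 0;
    }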
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 62c6506d663f..91eecaaa43e0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/notifier.h> | 20 | #include <linux/notifier.h> |
21 | #include <linux/irqflags.h> | 21 | #include <linux/irqflags.h> |
22 | #include <linux/debugfs.h> | 22 | #include <linux/debugfs.h> |
23 | #include <linux/tracefs.h> | ||
23 | #include <linux/pagemap.h> | 24 | #include <linux/pagemap.h> |
24 | #include <linux/hardirq.h> | 25 | #include <linux/hardirq.h> |
25 | #include <linux/linkage.h> | 26 | #include <linux/linkage.h> |
@@ -31,6 +32,7 @@ | |||
31 | #include <linux/splice.h> | 32 | #include <linux/splice.h> |
32 | #include <linux/kdebug.h> | 33 | #include <linux/kdebug.h> |
33 | #include <linux/string.h> | 34 | #include <linux/string.h> |
35 | #include <linux/mount.h> | ||
34 | #include <linux/rwsem.h> | 36 | #include <linux/rwsem.h> |
35 | #include <linux/slab.h> | 37 | #include <linux/slab.h> |
36 | #include <linux/ctype.h> | 38 | #include <linux/ctype.h> |
@@ -123,6 +125,42 @@ enum ftrace_dump_mode ftrace_dump_on_oops; | |||
123 | /* When set, tracing will stop when a WARN*() is hit */ | 125 | /* When set, tracing will stop when a WARN*() is hit */ |
124 | int __disable_trace_on_warning; | 126 | int __disable_trace_on_warning; |
125 | 127 | ||
128 | #ifdef CONFIG_TRACE_ENUM_MAP_FILE | ||
129 | /* Map of enums to their values, for "enum_map" file */ | ||
130 | struct trace_enum_map_head { | ||
131 | struct module *mod; | ||
132 | unsigned long length; | ||
133 | }; | ||
134 | |||
135 | union trace_enum_map_item; | ||
136 | |||
137 | struct trace_enum_map_tail { | ||
138 | /* | ||
139 | * "end" is first and points to NULL as it must be different | ||
140 | * than "mod" or "enum_string" | ||
141 | */ | ||
142 | union trace_enum_map_item *next; | ||
143 | const char *end; /* points to NULL */ | ||
144 | }; | ||
145 | |||
146 | static DEFINE_MUTEX(trace_enum_mutex); | ||
147 | |||
148 | /* | ||
149 | * The trace_enum_maps are saved in an array with two extra elements, | ||
150 | * one at the beginning, and one at the end. The beginning item contains | ||
151 | * the count of the saved maps (head.length), and the module they | ||
152 | * belong to if not built in (head.mod). The ending item contains a | ||
153 | * pointer to the next array of saved enum_map items. | ||
154 | */ | ||
155 | union trace_enum_map_item { | ||
156 | struct trace_enum_map map; | ||
157 | struct trace_enum_map_head head; | ||
158 | struct trace_enum_map_tail tail; | ||
159 | }; | ||
160 | |||
161 | static union trace_enum_map_item *trace_enum_maps; | ||
162 | #endif /* CONFIG_TRACE_ENUM_MAP_FILE */ | ||
163 | |||
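The head/map/tail layout described in the comment above is easiest to see for a concrete size. A sketch of the block built for a module contributing three enums (the field types are simplified guesses at the shape, not the exact kernel definitions):

    union enum_map_item_sketch {
            struct { void *mod; unsigned long length; } head;
            struct { const char *enum_string; unsigned long enum_value;
                     const char *system; } map;
            struct { union enum_map_item_sketch *next; const char *end; } tail;
    };

    /*
     * With len == 3 the allocation holds len + 2 items:
     *
     *   block[0]     head: { .mod = mod, .length = 3 }
     *   block[1..3]  map:  the three copied trace_enum_map entries
     *   block[4]     tail: { .next = next saved block or NULL, .end = NULL }
     *
     * "Jump to tail" is then just  block + block[0].head.length + 1,  and walking
     * the whole chain is: jump to tail, follow tail.next, step past that block's head.
     */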
126 | static int tracing_set_tracer(struct trace_array *tr, const char *buf); | 164 | static int tracing_set_tracer(struct trace_array *tr, const char *buf); |
127 | 165 | ||
128 | #define MAX_TRACER_SIZE 100 | 166 | #define MAX_TRACER_SIZE 100 |
@@ -3908,6 +3946,182 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = { | |||
3908 | .write = tracing_saved_cmdlines_size_write, | 3946 | .write = tracing_saved_cmdlines_size_write, |
3909 | }; | 3947 | }; |
3910 | 3948 | ||
3949 | #ifdef CONFIG_TRACE_ENUM_MAP_FILE | ||
3950 | static union trace_enum_map_item * | ||
3951 | update_enum_map(union trace_enum_map_item *ptr) | ||
3952 | { | ||
3953 | if (!ptr->map.enum_string) { | ||
3954 | if (ptr->tail.next) { | ||
3955 | ptr = ptr->tail.next; | ||
3956 | /* Set ptr to the next real item (skip head) */ | ||
3957 | ptr++; | ||
3958 | } else | ||
3959 | return NULL; | ||
3960 | } | ||
3961 | return ptr; | ||
3962 | } | ||
3963 | |||
3964 | static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) | ||
3965 | { | ||
3966 | union trace_enum_map_item *ptr = v; | ||
3967 | |||
3968 | /* | ||
3969 | * Paranoid! If ptr points to end, we don't want to increment past it. | ||
3970 | * This really should never happen. | ||
3971 | */ | ||
3972 | ptr = update_enum_map(ptr); | ||
3973 | if (WARN_ON_ONCE(!ptr)) | ||
3974 | return NULL; | ||
3975 | |||
3976 | ptr++; | ||
3977 | |||
3978 | (*pos)++; | ||
3979 | |||
3980 | ptr = update_enum_map(ptr); | ||
3981 | |||
3982 | return ptr; | ||
3983 | } | ||
3984 | |||
3985 | static void *enum_map_start(struct seq_file *m, loff_t *pos) | ||
3986 | { | ||
3987 | union trace_enum_map_item *v; | ||
3988 | loff_t l = 0; | ||
3989 | |||
3990 | mutex_lock(&trace_enum_mutex); | ||
3991 | |||
3992 | v = trace_enum_maps; | ||
3993 | if (v) | ||
3994 | v++; | ||
3995 | |||
3996 | while (v && l < *pos) { | ||
3997 | v = enum_map_next(m, v, &l); | ||
3998 | } | ||
3999 | |||
4000 | return v; | ||
4001 | } | ||
4002 | |||
4003 | static void enum_map_stop(struct seq_file *m, void *v) | ||
4004 | { | ||
4005 | mutex_unlock(&trace_enum_mutex); | ||
4006 | } | ||
4007 | |||
4008 | static int enum_map_show(struct seq_file *m, void *v) | ||
4009 | { | ||
4010 | union trace_enum_map_item *ptr = v; | ||
4011 | |||
4012 | seq_printf(m, "%s %ld (%s)\n", | ||
4013 | ptr->map.enum_string, ptr->map.enum_value, | ||
4014 | ptr->map.system); | ||
4015 | |||
4016 | return 0; | ||
4017 | } | ||
4018 | |||
4019 | static const struct seq_operations tracing_enum_map_seq_ops = { | ||
4020 | .start = enum_map_start, | ||
4021 | .next = enum_map_next, | ||
4022 | .stop = enum_map_stop, | ||
4023 | .show = enum_map_show, | ||
4024 | }; | ||
4025 | |||
4026 | static int tracing_enum_map_open(struct inode *inode, struct file *filp) | ||
4027 | { | ||
4028 | if (tracing_disabled) | ||
4029 | return -ENODEV; | ||
4030 | |||
4031 | return seq_open(filp, &tracing_enum_map_seq_ops); | ||
4032 | } | ||
4033 | |||
4034 | static const struct file_operations tracing_enum_map_fops = { | ||
4035 | .open = tracing_enum_map_open, | ||
4036 | .read = seq_read, | ||
4037 | .llseek = seq_lseek, | ||
4038 | .release = seq_release, | ||
4039 | }; | ||
4040 | |||
4041 | static inline union trace_enum_map_item * | ||
4042 | trace_enum_jmp_to_tail(union trace_enum_map_item *ptr) | ||
4043 | { | ||
4044 | /* Return tail of array given the head */ | ||
4045 | return ptr + ptr->head.length + 1; | ||
4046 | } | ||
4047 | |||
4048 | static void | ||
4049 | trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start, | ||
4050 | int len) | ||
4051 | { | ||
4052 | struct trace_enum_map **stop; | ||
4053 | struct trace_enum_map **map; | ||
4054 | union trace_enum_map_item *map_array; | ||
4055 | union trace_enum_map_item *ptr; | ||
4056 | |||
4057 | stop = start + len; | ||
4058 | |||
4059 | /* | ||
4060 | * The trace_enum_maps contains the map plus a head and tail item, | ||
4061 | * where the head holds the module and length of array, and the | ||
4062 | * tail holds a pointer to the next list. | ||
4063 | */ | ||
4064 | map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL); | ||
4065 | if (!map_array) { | ||
4066 | pr_warning("Unable to allocate trace enum mapping\n"); | ||
4067 | return; | ||
4068 | } | ||
4069 | |||
4070 | mutex_lock(&trace_enum_mutex); | ||
4071 | |||
4072 | if (!trace_enum_maps) | ||
4073 | trace_enum_maps = map_array; | ||
4074 | else { | ||
4075 | ptr = trace_enum_maps; | ||
4076 | for (;;) { | ||
4077 | ptr = trace_enum_jmp_to_tail(ptr); | ||
4078 | if (!ptr->tail.next) | ||
4079 | break; | ||
4080 | ptr = ptr->tail.next; | ||
4081 | |||
4082 | } | ||
4083 | ptr->tail.next = map_array; | ||
4084 | } | ||
4085 | map_array->head.mod = mod; | ||
4086 | map_array->head.length = len; | ||
4087 | map_array++; | ||
4088 | |||
4089 | for (map = start; (unsigned long)map < (unsigned long)stop; map++) { | ||
4090 | map_array->map = **map; | ||
4091 | map_array++; | ||
4092 | } | ||
4093 | memset(map_array, 0, sizeof(*map_array)); | ||
4094 | |||
4095 | mutex_unlock(&trace_enum_mutex); | ||
4096 | } | ||
4097 | |||
4098 | static void trace_create_enum_file(struct dentry *d_tracer) | ||
4099 | { | ||
4100 | trace_create_file("enum_map", 0444, d_tracer, | ||
4101 | NULL, &tracing_enum_map_fops); | ||
4102 | } | ||
4103 | |||
4104 | #else /* CONFIG_TRACE_ENUM_MAP_FILE */ | ||
4105 | static inline void trace_create_enum_file(struct dentry *d_tracer) { } | ||
4106 | static inline void trace_insert_enum_map_file(struct module *mod, | ||
4107 | struct trace_enum_map **start, int len) { } | ||
4108 | #endif /* !CONFIG_TRACE_ENUM_MAP_FILE */ | ||
4109 | |||
4110 | static void trace_insert_enum_map(struct module *mod, | ||
4111 | struct trace_enum_map **start, int len) | ||
4112 | { | ||
4113 | struct trace_enum_map **map; | ||
4114 | |||
4115 | if (len <= 0) | ||
4116 | return; | ||
4117 | |||
4118 | map = start; | ||
4119 | |||
4120 | trace_event_enum_update(map, len); | ||
4121 | |||
4122 | trace_insert_enum_map_file(mod, start, len); | ||
4123 | } | ||
4124 | |||
3911 | static ssize_t | 4125 | static ssize_t |
3912 | tracing_set_trace_read(struct file *filp, char __user *ubuf, | 4126 | tracing_set_trace_read(struct file *filp, char __user *ubuf, |
3913 | size_t cnt, loff_t *ppos) | 4127 | size_t cnt, loff_t *ppos) |
@@ -4105,9 +4319,24 @@ static void tracing_set_nop(struct trace_array *tr) | |||
4105 | tr->current_trace = &nop_trace; | 4319 | tr->current_trace = &nop_trace; |
4106 | } | 4320 | } |
4107 | 4321 | ||
4108 | static int tracing_set_tracer(struct trace_array *tr, const char *buf) | 4322 | static void update_tracer_options(struct trace_array *tr, struct tracer *t) |
4109 | { | 4323 | { |
4110 | static struct trace_option_dentry *topts; | 4324 | static struct trace_option_dentry *topts; |
4325 | |||
4326 | /* Only enable if the directory has been created already. */ | ||
4327 | if (!tr->dir) | ||
4328 | return; | ||
4329 | |||
4330 | /* Currently, only the top instance has options */ | ||
4331 | if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) | ||
4332 | return; | ||
4333 | |||
4334 | destroy_trace_option_files(topts); | ||
4335 | topts = create_trace_option_files(tr, t); | ||
4336 | } | ||
4337 | |||
4338 | static int tracing_set_tracer(struct trace_array *tr, const char *buf) | ||
4339 | { | ||
4111 | struct tracer *t; | 4340 | struct tracer *t; |
4112 | #ifdef CONFIG_TRACER_MAX_TRACE | 4341 | #ifdef CONFIG_TRACER_MAX_TRACE |
4113 | bool had_max_tr; | 4342 | bool had_max_tr; |
@@ -4172,11 +4401,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) | |||
4172 | free_snapshot(tr); | 4401 | free_snapshot(tr); |
4173 | } | 4402 | } |
4174 | #endif | 4403 | #endif |
4175 | /* Currently, only the top instance has options */ | 4404 | update_tracer_options(tr, t); |
4176 | if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { | ||
4177 | destroy_trace_option_files(topts); | ||
4178 | topts = create_trace_option_files(tr, t); | ||
4179 | } | ||
4180 | 4405 | ||
4181 | #ifdef CONFIG_TRACER_MAX_TRACE | 4406 | #ifdef CONFIG_TRACER_MAX_TRACE |
4182 | if (t->use_max_tr && !had_max_tr) { | 4407 | if (t->use_max_tr && !had_max_tr) { |
@@ -5817,6 +6042,14 @@ static inline __init int register_snapshot_cmd(void) { return 0; } | |||
5817 | 6042 | ||
5818 | static struct dentry *tracing_get_dentry(struct trace_array *tr) | 6043 | static struct dentry *tracing_get_dentry(struct trace_array *tr) |
5819 | { | 6044 | { |
6045 | if (WARN_ON(!tr->dir)) | ||
6046 | return ERR_PTR(-ENODEV); | ||
6047 | |||
6048 | /* Top directory uses NULL as the parent */ | ||
6049 | if (tr->flags & TRACE_ARRAY_FL_GLOBAL) | ||
6050 | return NULL; | ||
6051 | |||
6052 | /* All sub buffers have a descriptor */ | ||
5820 | return tr->dir; | 6053 | return tr->dir; |
5821 | } | 6054 | } |
5822 | 6055 | ||
@@ -5831,10 +6064,10 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) | |||
5831 | if (IS_ERR(d_tracer)) | 6064 | if (IS_ERR(d_tracer)) |
5832 | return NULL; | 6065 | return NULL; |
5833 | 6066 | ||
5834 | tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); | 6067 | tr->percpu_dir = tracefs_create_dir("per_cpu", d_tracer); |
5835 | 6068 | ||
5836 | WARN_ONCE(!tr->percpu_dir, | 6069 | WARN_ONCE(!tr->percpu_dir, |
5837 | "Could not create debugfs directory 'per_cpu/%d'\n", cpu); | 6070 | "Could not create tracefs directory 'per_cpu/%d'\n", cpu); |
5838 | 6071 | ||
5839 | return tr->percpu_dir; | 6072 | return tr->percpu_dir; |
5840 | } | 6073 | } |
@@ -5851,7 +6084,7 @@ trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, | |||
5851 | } | 6084 | } |
5852 | 6085 | ||
5853 | static void | 6086 | static void |
5854 | tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) | 6087 | tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) |
5855 | { | 6088 | { |
5856 | struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); | 6089 | struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); |
5857 | struct dentry *d_cpu; | 6090 | struct dentry *d_cpu; |
@@ -5861,9 +6094,9 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) | |||
5861 | return; | 6094 | return; |
5862 | 6095 | ||
5863 | snprintf(cpu_dir, 30, "cpu%ld", cpu); | 6096 | snprintf(cpu_dir, 30, "cpu%ld", cpu); |
5864 | d_cpu = debugfs_create_dir(cpu_dir, d_percpu); | 6097 | d_cpu = tracefs_create_dir(cpu_dir, d_percpu); |
5865 | if (!d_cpu) { | 6098 | if (!d_cpu) { |
5866 | pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); | 6099 | pr_warning("Could not create tracefs '%s' entry\n", cpu_dir); |
5867 | return; | 6100 | return; |
5868 | } | 6101 | } |
5869 | 6102 | ||
@@ -6015,9 +6248,9 @@ struct dentry *trace_create_file(const char *name, | |||
6015 | { | 6248 | { |
6016 | struct dentry *ret; | 6249 | struct dentry *ret; |
6017 | 6250 | ||
6018 | ret = debugfs_create_file(name, mode, parent, data, fops); | 6251 | ret = tracefs_create_file(name, mode, parent, data, fops); |
6019 | if (!ret) | 6252 | if (!ret) |
6020 | pr_warning("Could not create debugfs '%s' entry\n", name); | 6253 | pr_warning("Could not create tracefs '%s' entry\n", name); |
6021 | 6254 | ||
6022 | return ret; | 6255 | return ret; |
6023 | } | 6256 | } |
@@ -6034,9 +6267,9 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr) | |||
6034 | if (IS_ERR(d_tracer)) | 6267 | if (IS_ERR(d_tracer)) |
6035 | return NULL; | 6268 | return NULL; |
6036 | 6269 | ||
6037 | tr->options = debugfs_create_dir("options", d_tracer); | 6270 | tr->options = tracefs_create_dir("options", d_tracer); |
6038 | if (!tr->options) { | 6271 | if (!tr->options) { |
6039 | pr_warning("Could not create debugfs directory 'options'\n"); | 6272 | pr_warning("Could not create tracefs directory 'options'\n"); |
6040 | return NULL; | 6273 | return NULL; |
6041 | } | 6274 | } |
6042 | 6275 | ||
@@ -6105,7 +6338,7 @@ destroy_trace_option_files(struct trace_option_dentry *topts) | |||
6105 | return; | 6338 | return; |
6106 | 6339 | ||
6107 | for (cnt = 0; topts[cnt].opt; cnt++) | 6340 | for (cnt = 0; topts[cnt].opt; cnt++) |
6108 | debugfs_remove(topts[cnt].entry); | 6341 | tracefs_remove(topts[cnt].entry); |
6109 | 6342 | ||
6110 | kfree(topts); | 6343 | kfree(topts); |
6111 | } | 6344 | } |
@@ -6194,7 +6427,7 @@ static const struct file_operations rb_simple_fops = { | |||
6194 | struct dentry *trace_instance_dir; | 6427 | struct dentry *trace_instance_dir; |
6195 | 6428 | ||
6196 | static void | 6429 | static void |
6197 | init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); | 6430 | init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); |
6198 | 6431 | ||
6199 | static int | 6432 | static int |
6200 | allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) | 6433 | allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) |
@@ -6271,7 +6504,7 @@ static void free_trace_buffers(struct trace_array *tr) | |||
6271 | #endif | 6504 | #endif |
6272 | } | 6505 | } |
6273 | 6506 | ||
6274 | static int new_instance_create(const char *name) | 6507 | static int instance_mkdir(const char *name) |
6275 | { | 6508 | { |
6276 | struct trace_array *tr; | 6509 | struct trace_array *tr; |
6277 | int ret; | 6510 | int ret; |
@@ -6310,17 +6543,17 @@ static int new_instance_create(const char *name) | |||
6310 | if (allocate_trace_buffers(tr, trace_buf_size) < 0) | 6543 | if (allocate_trace_buffers(tr, trace_buf_size) < 0) |
6311 | goto out_free_tr; | 6544 | goto out_free_tr; |
6312 | 6545 | ||
6313 | tr->dir = debugfs_create_dir(name, trace_instance_dir); | 6546 | tr->dir = tracefs_create_dir(name, trace_instance_dir); |
6314 | if (!tr->dir) | 6547 | if (!tr->dir) |
6315 | goto out_free_tr; | 6548 | goto out_free_tr; |
6316 | 6549 | ||
6317 | ret = event_trace_add_tracer(tr->dir, tr); | 6550 | ret = event_trace_add_tracer(tr->dir, tr); |
6318 | if (ret) { | 6551 | if (ret) { |
6319 | debugfs_remove_recursive(tr->dir); | 6552 | tracefs_remove_recursive(tr->dir); |
6320 | goto out_free_tr; | 6553 | goto out_free_tr; |
6321 | } | 6554 | } |
6322 | 6555 | ||
6323 | init_tracer_debugfs(tr, tr->dir); | 6556 | init_tracer_tracefs(tr, tr->dir); |
6324 | 6557 | ||
6325 | list_add(&tr->list, &ftrace_trace_arrays); | 6558 | list_add(&tr->list, &ftrace_trace_arrays); |
6326 | 6559 | ||
@@ -6341,7 +6574,7 @@ static int new_instance_create(const char *name) | |||
6341 | 6574 | ||
6342 | } | 6575 | } |
6343 | 6576 | ||
6344 | static int instance_delete(const char *name) | 6577 | static int instance_rmdir(const char *name) |
6345 | { | 6578 | { |
6346 | struct trace_array *tr; | 6579 | struct trace_array *tr; |
6347 | int found = 0; | 6580 | int found = 0; |
@@ -6382,82 +6615,17 @@ static int instance_delete(const char *name) | |||
6382 | return ret; | 6615 | return ret; |
6383 | } | 6616 | } |
6384 | 6617 | ||
6385 | static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode) | ||
6386 | { | ||
6387 | struct dentry *parent; | ||
6388 | int ret; | ||
6389 | |||
6390 | /* Paranoid: Make sure the parent is the "instances" directory */ | ||
6391 | parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); | ||
6392 | if (WARN_ON_ONCE(parent != trace_instance_dir)) | ||
6393 | return -ENOENT; | ||
6394 | |||
6395 | /* | ||
6396 | * The inode mutex is locked, but debugfs_create_dir() will also | ||
6397 | * take the mutex. As the instances directory can not be destroyed | ||
6398 | * or changed in any other way, it is safe to unlock it, and | ||
6399 | * let the dentry try. If two users try to make the same dir at | ||
6400 | * the same time, then the new_instance_create() will determine the | ||
6401 | * winner. | ||
6402 | */ | ||
6403 | mutex_unlock(&inode->i_mutex); | ||
6404 | |||
6405 | ret = new_instance_create(dentry->d_iname); | ||
6406 | |||
6407 | mutex_lock(&inode->i_mutex); | ||
6408 | |||
6409 | return ret; | ||
6410 | } | ||
6411 | |||
6412 | static int instance_rmdir(struct inode *inode, struct dentry *dentry) | ||
6413 | { | ||
6414 | struct dentry *parent; | ||
6415 | int ret; | ||
6416 | |||
6417 | /* Paranoid: Make sure the parent is the "instances" directory */ | ||
6418 | parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); | ||
6419 | if (WARN_ON_ONCE(parent != trace_instance_dir)) | ||
6420 | return -ENOENT; | ||
6421 | |||
6422 | /* The caller did a dget() on dentry */ | ||
6423 | mutex_unlock(&dentry->d_inode->i_mutex); | ||
6424 | |||
6425 | /* | ||
6426 | * The inode mutex is locked, but debugfs_create_dir() will also | ||
6427 | * take the mutex. As the instances directory can not be destroyed | ||
6428 | * or changed in any other way, it is safe to unlock it, and | ||
6429 | * let the dentry try. If two users try to make the same dir at | ||
6430 | * the same time, then the instance_delete() will determine the | ||
6431 | * winner. | ||
6432 | */ | ||
6433 | mutex_unlock(&inode->i_mutex); | ||
6434 | |||
6435 | ret = instance_delete(dentry->d_iname); | ||
6436 | |||
6437 | mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); | ||
6438 | mutex_lock(&dentry->d_inode->i_mutex); | ||
6439 | |||
6440 | return ret; | ||
6441 | } | ||
6442 | |||
6443 | static const struct inode_operations instance_dir_inode_operations = { | ||
6444 | .lookup = simple_lookup, | ||
6445 | .mkdir = instance_mkdir, | ||
6446 | .rmdir = instance_rmdir, | ||
6447 | }; | ||
6448 | |||
6449 | static __init void create_trace_instances(struct dentry *d_tracer) | 6618 | static __init void create_trace_instances(struct dentry *d_tracer) |
6450 | { | 6619 | { |
6451 | trace_instance_dir = debugfs_create_dir("instances", d_tracer); | 6620 | trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer, |
6621 | instance_mkdir, | ||
6622 | instance_rmdir); | ||
6452 | if (WARN_ON(!trace_instance_dir)) | 6623 | if (WARN_ON(!trace_instance_dir)) |
6453 | return; | 6624 | return; |
6454 | |||
6455 | /* Hijack the dir inode operations, to allow mkdir */ | ||
6456 | trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations; | ||
6457 | } | 6625 | } |
6458 | 6626 | ||
6459 | static void | 6627 | static void |
6460 | init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) | 6628 | init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) |
6461 | { | 6629 | { |
6462 | int cpu; | 6630 | int cpu; |
6463 | 6631 | ||
@@ -6511,10 +6679,32 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) | |||
6511 | #endif | 6679 | #endif |
6512 | 6680 | ||
6513 | for_each_tracing_cpu(cpu) | 6681 | for_each_tracing_cpu(cpu) |
6514 | tracing_init_debugfs_percpu(tr, cpu); | 6682 | tracing_init_tracefs_percpu(tr, cpu); |
6515 | 6683 | ||
6516 | } | 6684 | } |
6517 | 6685 | ||
6686 | static struct vfsmount *trace_automount(void *ignore) | ||
6687 | { | ||
6688 | struct vfsmount *mnt; | ||
6689 | struct file_system_type *type; | ||
6690 | |||
6691 | /* | ||
6692 | * To maintain backward compatibility for tools that mount | ||
6693 | * debugfs to get to the tracing facility, tracefs is automatically | ||
6694 | * mounted to the debugfs/tracing directory. | ||
6695 | */ | ||
6696 | type = get_fs_type("tracefs"); | ||
6697 | if (!type) | ||
6698 | return NULL; | ||
6699 | mnt = vfs_kern_mount(type, 0, "tracefs", NULL); | ||
6700 | put_filesystem(type); | ||
6701 | if (IS_ERR(mnt)) | ||
6702 | return NULL; | ||
6703 | mntget(mnt); | ||
6704 | |||
6705 | return mnt; | ||
6706 | } | ||
6707 | |||
6518 | /** | 6708 | /** |
6519 | * tracing_init_dentry - initialize top level trace array | 6709 | * tracing_init_dentry - initialize top level trace array |
6520 | * | 6710 | * |
@@ -6526,23 +6716,112 @@ struct dentry *tracing_init_dentry(void) | |||
6526 | { | 6716 | { |
6527 | struct trace_array *tr = &global_trace; | 6717 | struct trace_array *tr = &global_trace; |
6528 | 6718 | ||
6719 | /* The top level trace array uses NULL as parent */ | ||
6529 | if (tr->dir) | 6720 | if (tr->dir) |
6530 | return tr->dir; | 6721 | return NULL; |
6531 | 6722 | ||
6532 | if (WARN_ON(!debugfs_initialized())) | 6723 | if (WARN_ON(!debugfs_initialized())) |
6533 | return ERR_PTR(-ENODEV); | 6724 | return ERR_PTR(-ENODEV); |
6534 | 6725 | ||
6535 | tr->dir = debugfs_create_dir("tracing", NULL); | 6726 | /* |
6536 | 6727 | * As there may still be users that expect the tracing | |
6728 | * files to exist in debugfs/tracing, we must automount | ||
6729 | * the tracefs file system there, so older tools still | ||
6730 | * work with the newer kernel. | ||
6731 | */ | ||
6732 | tr->dir = debugfs_create_automount("tracing", NULL, | ||
6733 | trace_automount, NULL); | ||
6537 | if (!tr->dir) { | 6734 | if (!tr->dir) { |
6538 | pr_warn_once("Could not create debugfs directory 'tracing'\n"); | 6735 | pr_warn_once("Could not create debugfs directory 'tracing'\n"); |
6539 | return ERR_PTR(-ENOMEM); | 6736 | return ERR_PTR(-ENOMEM); |
6540 | } | 6737 | } |
6541 | 6738 | ||
6542 | return tr->dir; | 6739 | return NULL; |
6740 | } | ||
6741 | |||
6742 | extern struct trace_enum_map *__start_ftrace_enum_maps[]; | ||
6743 | extern struct trace_enum_map *__stop_ftrace_enum_maps[]; | ||
6744 | |||
6745 | static void __init trace_enum_init(void) | ||
6746 | { | ||
6747 | int len; | ||
6748 | |||
6749 | len = __stop_ftrace_enum_maps - __start_ftrace_enum_maps; | ||
6750 | trace_insert_enum_map(NULL, __start_ftrace_enum_maps, len); | ||
6751 | } | ||
6752 | |||
6753 | #ifdef CONFIG_MODULES | ||
6754 | static void trace_module_add_enums(struct module *mod) | ||
6755 | { | ||
6756 | if (!mod->num_trace_enums) | ||
6757 | return; | ||
6758 | |||
6759 | /* | ||
6760 | * Modules with bad taint do not have events created, do | ||
6761 | * not bother with enums either. | ||
6762 | */ | ||
6763 | if (trace_module_has_bad_taint(mod)) | ||
6764 | return; | ||
6765 | |||
6766 | trace_insert_enum_map(mod, mod->trace_enums, mod->num_trace_enums); | ||
6543 | } | 6767 | } |
6544 | 6768 | ||
6545 | static __init int tracer_init_debugfs(void) | 6769 | #ifdef CONFIG_TRACE_ENUM_MAP_FILE |
6770 | static void trace_module_remove_enums(struct module *mod) | ||
6771 | { | ||
6772 | union trace_enum_map_item *map; | ||
6773 | union trace_enum_map_item **last = &trace_enum_maps; | ||
6774 | |||
6775 | if (!mod->num_trace_enums) | ||
6776 | return; | ||
6777 | |||
6778 | mutex_lock(&trace_enum_mutex); | ||
6779 | |||
6780 | map = trace_enum_maps; | ||
6781 | |||
6782 | while (map) { | ||
6783 | if (map->head.mod == mod) | ||
6784 | break; | ||
6785 | map = trace_enum_jmp_to_tail(map); | ||
6786 | last = &map->tail.next; | ||
6787 | map = map->tail.next; | ||
6788 | } | ||
6789 | if (!map) | ||
6790 | goto out; | ||
6791 | |||
6792 | *last = trace_enum_jmp_to_tail(map)->tail.next; | ||
6793 | kfree(map); | ||
6794 | out: | ||
6795 | mutex_unlock(&trace_enum_mutex); | ||
6796 | } | ||
6797 | #else | ||
6798 | static inline void trace_module_remove_enums(struct module *mod) { } | ||
6799 | #endif /* CONFIG_TRACE_ENUM_MAP_FILE */ | ||
6800 | |||
6801 | static int trace_module_notify(struct notifier_block *self, | ||
6802 | unsigned long val, void *data) | ||
6803 | { | ||
6804 | struct module *mod = data; | ||
6805 | |||
6806 | switch (val) { | ||
6807 | case MODULE_STATE_COMING: | ||
6808 | trace_module_add_enums(mod); | ||
6809 | break; | ||
6810 | case MODULE_STATE_GOING: | ||
6811 | trace_module_remove_enums(mod); | ||
6812 | break; | ||
6813 | } | ||
6814 | |||
6815 | return 0; | ||
6816 | } | ||
6817 | |||
6818 | static struct notifier_block trace_module_nb = { | ||
6819 | .notifier_call = trace_module_notify, | ||
6820 | .priority = 0, | ||
6821 | }; | ||
6822 | #endif /* CONFIG_MODULES */ | ||
6823 | |||
6824 | static __init int tracer_init_tracefs(void) | ||
6546 | { | 6825 | { |
6547 | struct dentry *d_tracer; | 6826 | struct dentry *d_tracer; |
6548 | 6827 | ||
@@ -6552,7 +6831,7 @@ static __init int tracer_init_debugfs(void) | |||
6552 | if (IS_ERR(d_tracer)) | 6831 | if (IS_ERR(d_tracer)) |
6553 | return 0; | 6832 | return 0; |
6554 | 6833 | ||
6555 | init_tracer_debugfs(&global_trace, d_tracer); | 6834 | init_tracer_tracefs(&global_trace, d_tracer); |
6556 | 6835 | ||
6557 | trace_create_file("tracing_thresh", 0644, d_tracer, | 6836 | trace_create_file("tracing_thresh", 0644, d_tracer, |
6558 | &global_trace, &tracing_thresh_fops); | 6837 | &global_trace, &tracing_thresh_fops); |
@@ -6566,6 +6845,14 @@ static __init int tracer_init_debugfs(void) | |||
6566 | trace_create_file("saved_cmdlines_size", 0644, d_tracer, | 6845 | trace_create_file("saved_cmdlines_size", 0644, d_tracer, |
6567 | NULL, &tracing_saved_cmdlines_size_fops); | 6846 | NULL, &tracing_saved_cmdlines_size_fops); |
6568 | 6847 | ||
6848 | trace_enum_init(); | ||
6849 | |||
6850 | trace_create_enum_file(d_tracer); | ||
6851 | |||
6852 | #ifdef CONFIG_MODULES | ||
6853 | register_module_notifier(&trace_module_nb); | ||
6854 | #endif | ||
6855 | |||
6569 | #ifdef CONFIG_DYNAMIC_FTRACE | 6856 | #ifdef CONFIG_DYNAMIC_FTRACE |
6570 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, | 6857 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, |
6571 | &ftrace_update_tot_cnt, &tracing_dyn_info_fops); | 6858 | &ftrace_update_tot_cnt, &tracing_dyn_info_fops); |
@@ -6575,6 +6862,10 @@ static __init int tracer_init_debugfs(void) | |||
6575 | 6862 | ||
6576 | create_trace_options_dir(&global_trace); | 6863 | create_trace_options_dir(&global_trace); |
6577 | 6864 | ||
6865 | /* If the tracer was started via cmdline, create options for it here */ | ||
6866 | if (global_trace.current_trace != &nop_trace) | ||
6867 | update_tracer_options(&global_trace, global_trace.current_trace); | ||
6868 | |||
6578 | return 0; | 6869 | return 0; |
6579 | } | 6870 | } |
6580 | 6871 | ||
@@ -6888,7 +7179,7 @@ void __init trace_init(void) | |||
6888 | tracepoint_printk = 0; | 7179 | tracepoint_printk = 0; |
6889 | } | 7180 | } |
6890 | tracer_alloc_buffers(); | 7181 | tracer_alloc_buffers(); |
6891 | trace_event_init(); | 7182 | trace_event_init(); |
6892 | } | 7183 | } |
6893 | 7184 | ||
6894 | __init static int clear_boot_tracer(void) | 7185 | __init static int clear_boot_tracer(void) |
@@ -6910,5 +7201,5 @@ __init static int clear_boot_tracer(void) | |||
6910 | return 0; | 7201 | return 0; |
6911 | } | 7202 | } |
6912 | 7203 | ||
6913 | fs_initcall(tracer_init_debugfs); | 7204 | fs_initcall(tracer_init_tracefs); |
6914 | late_initcall(clear_boot_tracer); | 7205 | late_initcall(clear_boot_tracer); |
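The trace.c hunks above stop creating a plain debugfs "tracing" directory and instead register it as an automount point that mounts tracefs on first access, so tools that still traverse debugfs/tracing keep working. A minimal userspace sketch of what that buys such a tool, assuming the usual /sys/kernel/tracing and /sys/kernel/debug/tracing locations; the paths and helper below are illustrative, not part of the patch:

/*
 * Illustrative only: with the automount above, both the native tracefs
 * mount point and the legacy debugfs/tracing path are expected to
 * expose the same files, so a tool can probe either.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

static int open_tracing_file(const char *name)
{
	static const char *bases[] = {
		"/sys/kernel/tracing/",        /* tracefs mounted directly */
		"/sys/kernel/debug/tracing/",  /* legacy path, served via the automount */
	};
	char path[256];
	int fd, i;

	for (i = 0; i < 2; i++) {
		snprintf(path, sizeof(path), "%s%s", bases[i], name);
		fd = open(path, O_RDONLY);
		if (fd >= 0)
			return fd;
	}
	return -1;
}

int main(void)
{
	int fd = open_tracing_file("trace");

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* ... read trace data from fd here ... */
	close(fd);
	return 0;
}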
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index dd8205a35760..d2612016de94 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -334,7 +334,7 @@ struct tracer_flags { | |||
334 | 334 | ||
335 | 335 | ||
336 | /** | 336 | /** |
337 | * struct tracer - a specific tracer and its callbacks to interact with debugfs | 337 | * struct tracer - a specific tracer and its callbacks to interact with tracefs |
338 | * @name: the name chosen to select it on the available_tracers file | 338 | * @name: the name chosen to select it on the available_tracers file |
339 | * @init: called when one switches to this tracer (echo name > current_tracer) | 339 | * @init: called when one switches to this tracer (echo name > current_tracer) |
340 | * @reset: called when one switches to another tracer | 340 | * @reset: called when one switches to another tracer |
@@ -1309,8 +1309,10 @@ static inline void init_ftrace_syscalls(void) { } | |||
1309 | 1309 | ||
1310 | #ifdef CONFIG_EVENT_TRACING | 1310 | #ifdef CONFIG_EVENT_TRACING |
1311 | void trace_event_init(void); | 1311 | void trace_event_init(void); |
1312 | void trace_event_enum_update(struct trace_enum_map **map, int len); | ||
1312 | #else | 1313 | #else |
1313 | static inline void __init trace_event_init(void) { } | 1314 | static inline void __init trace_event_init(void) { } |
1315 | static inline void trace_event_enum_update(struct trace_enum_map **map, int len) { } | ||
1314 | #endif | 1316 | #endif |
1315 | 1317 | ||
1316 | extern struct trace_iterator *tracepoint_print_iter; | 1318 | extern struct trace_iterator *tracepoint_print_iter; |
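The trace.h hunk follows the usual compile-out pattern: a real prototype under CONFIG_EVENT_TRACING and an empty static inline stub otherwise, so callers never need #ifdef guards of their own. A small standalone sketch of that pattern, using a made-up FEATURE_ENUM_MAP switch rather than a real kernel config option:

#include <stdio.h>

#define FEATURE_ENUM_MAP 1	/* flip to 0 to compile the empty stub instead */

#if FEATURE_ENUM_MAP
static void enum_update(const char *name, long value)
{
	printf("updating %s -> %ld\n", name, value);
}
#else
/* stub: callers compile and link unchanged when the feature is off */
static inline void enum_update(const char *name, long value) { }
#endif

int main(void)
{
	enum_update("RUNNING", 0);	/* builds either way */
	return 0;
}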
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e2d027ac66a2..ee7b94a4810a 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h | |||
@@ -223,7 +223,7 @@ FTRACE_ENTRY(bprint, bprint_entry, | |||
223 | __dynamic_array( u32, buf ) | 223 | __dynamic_array( u32, buf ) |
224 | ), | 224 | ), |
225 | 225 | ||
226 | F_printk("%pf: %s", | 226 | F_printk("%ps: %s", |
227 | (void *)__entry->ip, __entry->fmt), | 227 | (void *)__entry->ip, __entry->fmt), |
228 | 228 | ||
229 | FILTER_OTHER | 229 | FILTER_OTHER |
@@ -238,7 +238,7 @@ FTRACE_ENTRY(print, print_entry, | |||
238 | __dynamic_array( char, buf ) | 238 | __dynamic_array( char, buf ) |
239 | ), | 239 | ), |
240 | 240 | ||
241 | F_printk("%pf: %s", | 241 | F_printk("%ps: %s", |
242 | (void *)__entry->ip, __entry->buf), | 242 | (void *)__entry->ip, __entry->buf), |
243 | 243 | ||
244 | FILTER_OTHER | 244 | FILTER_OTHER |
@@ -253,7 +253,7 @@ FTRACE_ENTRY(bputs, bputs_entry, | |||
253 | __field( const char *, str ) | 253 | __field( const char *, str ) |
254 | ), | 254 | ), |
255 | 255 | ||
256 | F_printk("%pf: %s", | 256 | F_printk("%ps: %s", |
257 | (void *)__entry->ip, __entry->str), | 257 | (void *)__entry->ip, __entry->str), |
258 | 258 | ||
259 | FILTER_OTHER | 259 | FILTER_OTHER |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index db54dda10ccc..7da1dfeb322e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -13,7 +13,7 @@ | |||
13 | #include <linux/workqueue.h> | 13 | #include <linux/workqueue.h> |
14 | #include <linux/spinlock.h> | 14 | #include <linux/spinlock.h> |
15 | #include <linux/kthread.h> | 15 | #include <linux/kthread.h> |
16 | #include <linux/debugfs.h> | 16 | #include <linux/tracefs.h> |
17 | #include <linux/uaccess.h> | 17 | #include <linux/uaccess.h> |
18 | #include <linux/module.h> | 18 | #include <linux/module.h> |
19 | #include <linux/ctype.h> | 19 | #include <linux/ctype.h> |
@@ -480,7 +480,7 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir) | |||
480 | return; | 480 | return; |
481 | 481 | ||
482 | if (!--dir->nr_events) { | 482 | if (!--dir->nr_events) { |
483 | debugfs_remove_recursive(dir->entry); | 483 | tracefs_remove_recursive(dir->entry); |
484 | list_del(&dir->list); | 484 | list_del(&dir->list); |
485 | __put_system_dir(dir); | 485 | __put_system_dir(dir); |
486 | } | 486 | } |
@@ -499,7 +499,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file) | |||
499 | } | 499 | } |
500 | spin_unlock(&dir->d_lock); | 500 | spin_unlock(&dir->d_lock); |
501 | 501 | ||
502 | debugfs_remove_recursive(dir); | 502 | tracefs_remove_recursive(dir); |
503 | } | 503 | } |
504 | 504 | ||
505 | list_del(&file->list); | 505 | list_del(&file->list); |
@@ -1526,7 +1526,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, | |||
1526 | } else | 1526 | } else |
1527 | __get_system(system); | 1527 | __get_system(system); |
1528 | 1528 | ||
1529 | dir->entry = debugfs_create_dir(name, parent); | 1529 | dir->entry = tracefs_create_dir(name, parent); |
1530 | if (!dir->entry) { | 1530 | if (!dir->entry) { |
1531 | pr_warn("Failed to create system directory %s\n", name); | 1531 | pr_warn("Failed to create system directory %s\n", name); |
1532 | __put_system(system); | 1532 | __put_system(system); |
@@ -1539,12 +1539,12 @@ event_subsystem_dir(struct trace_array *tr, const char *name, | |||
1539 | dir->subsystem = system; | 1539 | dir->subsystem = system; |
1540 | file->system = dir; | 1540 | file->system = dir; |
1541 | 1541 | ||
1542 | entry = debugfs_create_file("filter", 0644, dir->entry, dir, | 1542 | entry = tracefs_create_file("filter", 0644, dir->entry, dir, |
1543 | &ftrace_subsystem_filter_fops); | 1543 | &ftrace_subsystem_filter_fops); |
1544 | if (!entry) { | 1544 | if (!entry) { |
1545 | kfree(system->filter); | 1545 | kfree(system->filter); |
1546 | system->filter = NULL; | 1546 | system->filter = NULL; |
1547 | pr_warn("Could not create debugfs '%s/filter' entry\n", name); | 1547 | pr_warn("Could not create tracefs '%s/filter' entry\n", name); |
1548 | } | 1548 | } |
1549 | 1549 | ||
1550 | trace_create_file("enable", 0644, dir->entry, dir, | 1550 | trace_create_file("enable", 0644, dir->entry, dir, |
@@ -1585,9 +1585,9 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) | |||
1585 | d_events = parent; | 1585 | d_events = parent; |
1586 | 1586 | ||
1587 | name = ftrace_event_name(call); | 1587 | name = ftrace_event_name(call); |
1588 | file->dir = debugfs_create_dir(name, d_events); | 1588 | file->dir = tracefs_create_dir(name, d_events); |
1589 | if (!file->dir) { | 1589 | if (!file->dir) { |
1590 | pr_warn("Could not create debugfs '%s' directory\n", name); | 1590 | pr_warn("Could not create tracefs '%s' directory\n", name); |
1591 | return -1; | 1591 | return -1; |
1592 | } | 1592 | } |
1593 | 1593 | ||
@@ -1704,6 +1704,125 @@ __register_event(struct ftrace_event_call *call, struct module *mod) | |||
1704 | return 0; | 1704 | return 0; |
1705 | } | 1705 | } |
1706 | 1706 | ||
1707 | static char *enum_replace(char *ptr, struct trace_enum_map *map, int len) | ||
1708 | { | ||
1709 | int rlen; | ||
1710 | int elen; | ||
1711 | |||
1712 | /* Find the length of the enum value as a string */ | ||
1713 | elen = snprintf(ptr, 0, "%ld", map->enum_value); | ||
1714 | /* Make sure there's enough room to replace the string with the value */ | ||
1715 | if (len < elen) | ||
1716 | return NULL; | ||
1717 | |||
1718 | snprintf(ptr, elen + 1, "%ld", map->enum_value); | ||
1719 | |||
1720 | /* Get the rest of the string of ptr */ | ||
1721 | rlen = strlen(ptr + len); | ||
1722 | memmove(ptr + elen, ptr + len, rlen); | ||
1723 | /* Make sure we end the new string */ | ||
1724 | ptr[elen + rlen] = 0; | ||
1725 | |||
1726 | return ptr + elen; | ||
1727 | } | ||
1728 | |||
1729 | static void update_event_printk(struct ftrace_event_call *call, | ||
1730 | struct trace_enum_map *map) | ||
1731 | { | ||
1732 | char *ptr; | ||
1733 | int quote = 0; | ||
1734 | int len = strlen(map->enum_string); | ||
1735 | |||
1736 | for (ptr = call->print_fmt; *ptr; ptr++) { | ||
1737 | if (*ptr == '\\') { | ||
1738 | ptr++; | ||
1739 | /* paranoid */ | ||
1740 | if (!*ptr) | ||
1741 | break; | ||
1742 | continue; | ||
1743 | } | ||
1744 | if (*ptr == '"') { | ||
1745 | quote ^= 1; | ||
1746 | continue; | ||
1747 | } | ||
1748 | if (quote) | ||
1749 | continue; | ||
1750 | if (isdigit(*ptr)) { | ||
1751 | /* skip numbers */ | ||
1752 | do { | ||
1753 | ptr++; | ||
1754 | /* Check for alpha chars like ULL */ | ||
1755 | } while (isalnum(*ptr)); | ||
1756 | /* | ||
1757 | * A number must have some kind of delimiter after | ||
1758 | * it, and we can ignore that too. | ||
1759 | */ | ||
1760 | continue; | ||
1761 | } | ||
1762 | if (isalpha(*ptr) || *ptr == '_') { | ||
1763 | if (strncmp(map->enum_string, ptr, len) == 0 && | ||
1764 | !isalnum(ptr[len]) && ptr[len] != '_') { | ||
1765 | ptr = enum_replace(ptr, map, len); | ||
1766 | /* Hmm, enum string smaller than value */ | ||
1767 | if (WARN_ON_ONCE(!ptr)) | ||
1768 | return; | ||
1769 | /* | ||
1770 | * No need to decrement here, as enum_replace() | ||
1771 | * returns the pointer to the character past | ||
1772 | * the enum, and two enums cannot be placed | ||
1773 | * back to back without something in between. | ||
1774 | * We can skip that something in between. | ||
1775 | */ | ||
1776 | continue; | ||
1777 | } | ||
1778 | skip_more: | ||
1779 | do { | ||
1780 | ptr++; | ||
1781 | } while (isalnum(*ptr) || *ptr == '_'); | ||
1782 | /* | ||
1783 | * If what comes after this variable is a '.' or | ||
1784 | * '->' then we can continue to ignore that string. | ||
1785 | */ | ||
1786 | if (*ptr == '.' || (ptr[0] == '-' && ptr[1] == '>')) { | ||
1787 | ptr += *ptr == '.' ? 1 : 2; | ||
1788 | goto skip_more; | ||
1789 | } | ||
1790 | /* | ||
1791 | * Once again, we can skip the delimiter that came | ||
1792 | * after the string. | ||
1793 | */ | ||
1794 | continue; | ||
1795 | } | ||
1796 | } | ||
1797 | } | ||
1798 | |||
1799 | void trace_event_enum_update(struct trace_enum_map **map, int len) | ||
1800 | { | ||
1801 | struct ftrace_event_call *call, *p; | ||
1802 | const char *last_system = NULL; | ||
1803 | int last_i; | ||
1804 | int i; | ||
1805 | |||
1806 | down_write(&trace_event_sem); | ||
1807 | list_for_each_entry_safe(call, p, &ftrace_events, list) { | ||
1808 | /* events are usually grouped together with systems */ | ||
1809 | if (!last_system || call->class->system != last_system) { | ||
1810 | last_i = 0; | ||
1811 | last_system = call->class->system; | ||
1812 | } | ||
1813 | |||
1814 | for (i = last_i; i < len; i++) { | ||
1815 | if (call->class->system == map[i]->system) { | ||
1816 | /* Save the first system if need be */ | ||
1817 | if (!last_i) | ||
1818 | last_i = i; | ||
1819 | update_event_printk(call, map[i]); | ||
1820 | } | ||
1821 | } | ||
1822 | } | ||
1823 | up_write(&trace_event_sem); | ||
1824 | } | ||
1825 | |||
1707 | static struct ftrace_event_file * | 1826 | static struct ftrace_event_file * |
1708 | trace_create_new_event(struct ftrace_event_call *call, | 1827 | trace_create_new_event(struct ftrace_event_call *call, |
1709 | struct trace_array *tr) | 1828 | struct trace_array *tr) |
@@ -1915,7 +2034,7 @@ static int trace_module_notify(struct notifier_block *self, | |||
1915 | 2034 | ||
1916 | static struct notifier_block trace_module_nb = { | 2035 | static struct notifier_block trace_module_nb = { |
1917 | .notifier_call = trace_module_notify, | 2036 | .notifier_call = trace_module_notify, |
1918 | .priority = 0, | 2037 | .priority = 1, /* higher than trace.c module notify */ |
1919 | }; | 2038 | }; |
1920 | #endif /* CONFIG_MODULES */ | 2039 | #endif /* CONFIG_MODULES */ |
1921 | 2040 | ||
@@ -2228,7 +2347,7 @@ static inline int register_event_cmds(void) { return 0; } | |||
2228 | /* | 2347 | /* |
2229 | * The top level array has already had its ftrace_event_file | 2348 | * The top level array has already had its ftrace_event_file |
2230 | * descriptors created in order to allow for early events to | 2349 | * descriptors created in order to allow for early events to |
2231 | * be recorded. This function is called after the debugfs has been | 2350 | * be recorded. This function is called after the tracefs has been |
2232 | * initialized, and we now have to create the files associated | 2351 | * initialized, and we now have to create the files associated |
2233 | * to the events. | 2352 | * to the events. |
2234 | */ | 2353 | */ |
@@ -2311,16 +2430,16 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) | |||
2311 | struct dentry *d_events; | 2430 | struct dentry *d_events; |
2312 | struct dentry *entry; | 2431 | struct dentry *entry; |
2313 | 2432 | ||
2314 | entry = debugfs_create_file("set_event", 0644, parent, | 2433 | entry = tracefs_create_file("set_event", 0644, parent, |
2315 | tr, &ftrace_set_event_fops); | 2434 | tr, &ftrace_set_event_fops); |
2316 | if (!entry) { | 2435 | if (!entry) { |
2317 | pr_warn("Could not create debugfs 'set_event' entry\n"); | 2436 | pr_warn("Could not create tracefs 'set_event' entry\n"); |
2318 | return -ENOMEM; | 2437 | return -ENOMEM; |
2319 | } | 2438 | } |
2320 | 2439 | ||
2321 | d_events = debugfs_create_dir("events", parent); | 2440 | d_events = tracefs_create_dir("events", parent); |
2322 | if (!d_events) { | 2441 | if (!d_events) { |
2323 | pr_warn("Could not create debugfs 'events' directory\n"); | 2442 | pr_warn("Could not create tracefs 'events' directory\n"); |
2324 | return -ENOMEM; | 2443 | return -ENOMEM; |
2325 | } | 2444 | } |
2326 | 2445 | ||
@@ -2412,7 +2531,7 @@ int event_trace_del_tracer(struct trace_array *tr) | |||
2412 | 2531 | ||
2413 | down_write(&trace_event_sem); | 2532 | down_write(&trace_event_sem); |
2414 | __trace_remove_event_dirs(tr); | 2533 | __trace_remove_event_dirs(tr); |
2415 | debugfs_remove_recursive(tr->event_dir); | 2534 | tracefs_remove_recursive(tr->event_dir); |
2416 | up_write(&trace_event_sem); | 2535 | up_write(&trace_event_sem); |
2417 | 2536 | ||
2418 | tr->event_dir = NULL; | 2537 | tr->event_dir = NULL; |
@@ -2534,10 +2653,10 @@ static __init int event_trace_init(void) | |||
2534 | if (IS_ERR(d_tracer)) | 2653 | if (IS_ERR(d_tracer)) |
2535 | return 0; | 2654 | return 0; |
2536 | 2655 | ||
2537 | entry = debugfs_create_file("available_events", 0444, d_tracer, | 2656 | entry = tracefs_create_file("available_events", 0444, d_tracer, |
2538 | tr, &ftrace_avail_fops); | 2657 | tr, &ftrace_avail_fops); |
2539 | if (!entry) | 2658 | if (!entry) |
2540 | pr_warn("Could not create debugfs 'available_events' entry\n"); | 2659 | pr_warn("Could not create tracefs 'available_events' entry\n"); |
2541 | 2660 | ||
2542 | if (trace_define_common_fields()) | 2661 | if (trace_define_common_fields()) |
2543 | pr_warn("tracing: Failed to allocate common fields"); | 2662 | pr_warn("tracing: Failed to allocate common fields"); |
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 12e2b99be862..174a6a71146c 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
@@ -177,7 +177,7 @@ struct ftrace_event_call __used event_##call = { \ | |||
177 | }, \ | 177 | }, \ |
178 | .event.type = etype, \ | 178 | .event.type = etype, \ |
179 | .print_fmt = print, \ | 179 | .print_fmt = print, \ |
180 | .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \ | 180 | .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ |
181 | }; \ | 181 | }; \ |
182 | struct ftrace_event_call __used \ | 182 | struct ftrace_event_call __used \ |
183 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; | 183 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; |
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 2d25ad1526bb..9cfea4c6d314 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
@@ -6,7 +6,6 @@ | |||
6 | * is Copyright (c) Steven Rostedt <srostedt@redhat.com> | 6 | * is Copyright (c) Steven Rostedt <srostedt@redhat.com> |
7 | * | 7 | * |
8 | */ | 8 | */ |
9 | #include <linux/debugfs.h> | ||
10 | #include <linux/uaccess.h> | 9 | #include <linux/uaccess.h> |
11 | #include <linux/ftrace.h> | 10 | #include <linux/ftrace.h> |
12 | #include <linux/slab.h> | 11 | #include <linux/slab.h> |
@@ -151,7 +150,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, | |||
151 | * The curr_ret_stack is initialized to -1 and get increased | 150 | * The curr_ret_stack is initialized to -1 and get increased |
152 | * in this function. So it can be less than -1 only if it was | 151 | * in this function. So it can be less than -1 only if it was |
153 | * filtered out via ftrace_graph_notrace_addr() which can be | 152 | * filtered out via ftrace_graph_notrace_addr() which can be |
154 | * set from set_graph_notrace file in debugfs by user. | 153 | * set from set_graph_notrace file in tracefs by user. |
155 | */ | 154 | */ |
156 | if (current->curr_ret_stack < -1) | 155 | if (current->curr_ret_stack < -1) |
157 | return -EBUSY; | 156 | return -EBUSY; |
@@ -1432,7 +1431,7 @@ static const struct file_operations graph_depth_fops = { | |||
1432 | .llseek = generic_file_llseek, | 1431 | .llseek = generic_file_llseek, |
1433 | }; | 1432 | }; |
1434 | 1433 | ||
1435 | static __init int init_graph_debugfs(void) | 1434 | static __init int init_graph_tracefs(void) |
1436 | { | 1435 | { |
1437 | struct dentry *d_tracer; | 1436 | struct dentry *d_tracer; |
1438 | 1437 | ||
@@ -1445,7 +1444,7 @@ static __init int init_graph_debugfs(void) | |||
1445 | 1444 | ||
1446 | return 0; | 1445 | return 0; |
1447 | } | 1446 | } |
1448 | fs_initcall(init_graph_debugfs); | 1447 | fs_initcall(init_graph_tracefs); |
1449 | 1448 | ||
1450 | static __init int init_graph_trace(void) | 1449 | static __init int init_graph_trace(void) |
1451 | { | 1450 | { |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d73f565b4e06..9ba3f43f580e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -250,7 +250,7 @@ DEFINE_FETCH_symbol(string_size) | |||
250 | #define fetch_file_offset_string_size NULL | 250 | #define fetch_file_offset_string_size NULL |
251 | 251 | ||
252 | /* Fetch type information table */ | 252 | /* Fetch type information table */ |
253 | const struct fetch_type kprobes_fetch_type_table[] = { | 253 | static const struct fetch_type kprobes_fetch_type_table[] = { |
254 | /* Special types */ | 254 | /* Special types */ |
255 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, | 255 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, |
256 | sizeof(u32), 1, "__data_loc char[]"), | 256 | sizeof(u32), 1, "__data_loc char[]"), |
@@ -760,7 +760,8 @@ static int create_trace_kprobe(int argc, char **argv) | |||
760 | 760 | ||
761 | /* Parse fetch argument */ | 761 | /* Parse fetch argument */ |
762 | ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, | 762 | ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, |
763 | is_return, true); | 763 | is_return, true, |
764 | kprobes_fetch_type_table); | ||
764 | if (ret) { | 765 | if (ret) { |
765 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); | 766 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); |
766 | goto error; | 767 | goto error; |
@@ -1310,7 +1311,7 @@ static int unregister_kprobe_event(struct trace_kprobe *tk) | |||
1310 | return ret; | 1311 | return ret; |
1311 | } | 1312 | } |
1312 | 1313 | ||
1313 | /* Make a debugfs interface for controlling probe points */ | 1314 | /* Make a tracefs interface for controlling probe points */ |
1314 | static __init int init_kprobe_trace(void) | 1315 | static __init int init_kprobe_trace(void) |
1315 | { | 1316 | { |
1316 | struct dentry *d_tracer; | 1317 | struct dentry *d_tracer; |
@@ -1323,20 +1324,20 @@ static __init int init_kprobe_trace(void) | |||
1323 | if (IS_ERR(d_tracer)) | 1324 | if (IS_ERR(d_tracer)) |
1324 | return 0; | 1325 | return 0; |
1325 | 1326 | ||
1326 | entry = debugfs_create_file("kprobe_events", 0644, d_tracer, | 1327 | entry = tracefs_create_file("kprobe_events", 0644, d_tracer, |
1327 | NULL, &kprobe_events_ops); | 1328 | NULL, &kprobe_events_ops); |
1328 | 1329 | ||
1329 | /* Event list interface */ | 1330 | /* Event list interface */ |
1330 | if (!entry) | 1331 | if (!entry) |
1331 | pr_warning("Could not create debugfs " | 1332 | pr_warning("Could not create tracefs " |
1332 | "'kprobe_events' entry\n"); | 1333 | "'kprobe_events' entry\n"); |
1333 | 1334 | ||
1334 | /* Profile interface */ | 1335 | /* Profile interface */ |
1335 | entry = debugfs_create_file("kprobe_profile", 0444, d_tracer, | 1336 | entry = tracefs_create_file("kprobe_profile", 0444, d_tracer, |
1336 | NULL, &kprobe_profile_ops); | 1337 | NULL, &kprobe_profile_ops); |
1337 | 1338 | ||
1338 | if (!entry) | 1339 | if (!entry) |
1339 | pr_warning("Could not create debugfs " | 1340 | pr_warning("Could not create tracefs " |
1340 | "'kprobe_profile' entry\n"); | 1341 | "'kprobe_profile' entry\n"); |
1341 | return 0; | 1342 | return 0; |
1342 | } | 1343 | } |
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index b983b2fd2ca1..1769a81da8a7 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c | |||
@@ -356,17 +356,14 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, | |||
356 | 356 | ||
357 | /* Recursive argument parser */ | 357 | /* Recursive argument parser */ |
358 | static int parse_probe_arg(char *arg, const struct fetch_type *t, | 358 | static int parse_probe_arg(char *arg, const struct fetch_type *t, |
359 | struct fetch_param *f, bool is_return, bool is_kprobe) | 359 | struct fetch_param *f, bool is_return, bool is_kprobe, |
360 | const struct fetch_type *ftbl) | ||
360 | { | 361 | { |
361 | const struct fetch_type *ftbl; | ||
362 | unsigned long param; | 362 | unsigned long param; |
363 | long offset; | 363 | long offset; |
364 | char *tmp; | 364 | char *tmp; |
365 | int ret = 0; | 365 | int ret = 0; |
366 | 366 | ||
367 | ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table; | ||
368 | BUG_ON(ftbl == NULL); | ||
369 | |||
370 | switch (arg[0]) { | 367 | switch (arg[0]) { |
371 | case '$': | 368 | case '$': |
372 | ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe); | 369 | ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe); |
@@ -447,7 +444,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, | |||
447 | dprm->fetch_size = get_fetch_size_function(t, | 444 | dprm->fetch_size = get_fetch_size_function(t, |
448 | dprm->fetch, ftbl); | 445 | dprm->fetch, ftbl); |
449 | ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, | 446 | ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, |
450 | is_kprobe); | 447 | is_kprobe, ftbl); |
451 | if (ret) | 448 | if (ret) |
452 | kfree(dprm); | 449 | kfree(dprm); |
453 | else { | 450 | else { |
@@ -505,15 +502,12 @@ static int __parse_bitfield_probe_arg(const char *bf, | |||
505 | 502 | ||
506 | /* String length checking wrapper */ | 503 | /* String length checking wrapper */ |
507 | int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | 504 | int traceprobe_parse_probe_arg(char *arg, ssize_t *size, |
508 | struct probe_arg *parg, bool is_return, bool is_kprobe) | 505 | struct probe_arg *parg, bool is_return, bool is_kprobe, |
506 | const struct fetch_type *ftbl) | ||
509 | { | 507 | { |
510 | const struct fetch_type *ftbl; | ||
511 | const char *t; | 508 | const char *t; |
512 | int ret; | 509 | int ret; |
513 | 510 | ||
514 | ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table; | ||
515 | BUG_ON(ftbl == NULL); | ||
516 | |||
517 | if (strlen(arg) > MAX_ARGSTR_LEN) { | 511 | if (strlen(arg) > MAX_ARGSTR_LEN) { |
518 | pr_info("Argument is too long.: %s\n", arg); | 512 | pr_info("Argument is too long.: %s\n", arg); |
519 | return -ENOSPC; | 513 | return -ENOSPC; |
@@ -535,7 +529,8 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | |||
535 | } | 529 | } |
536 | parg->offset = *size; | 530 | parg->offset = *size; |
537 | *size += parg->type->size; | 531 | *size += parg->type->size; |
538 | ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe); | 532 | ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, |
533 | is_kprobe, ftbl); | ||
539 | 534 | ||
540 | if (ret >= 0 && t != NULL) | 535 | if (ret >= 0 && t != NULL) |
541 | ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); | 536 | ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); |
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 4f815fbce16d..ab283e146b70 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h | |||
@@ -25,7 +25,7 @@ | |||
25 | #include <linux/seq_file.h> | 25 | #include <linux/seq_file.h> |
26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
27 | #include <linux/smp.h> | 27 | #include <linux/smp.h> |
28 | #include <linux/debugfs.h> | 28 | #include <linux/tracefs.h> |
29 | #include <linux/types.h> | 29 | #include <linux/types.h> |
30 | #include <linux/string.h> | 30 | #include <linux/string.h> |
31 | #include <linux/ctype.h> | 31 | #include <linux/ctype.h> |
@@ -229,13 +229,6 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \ | |||
229 | #define FETCH_TYPE_STRING 0 | 229 | #define FETCH_TYPE_STRING 0 |
230 | #define FETCH_TYPE_STRSIZE 1 | 230 | #define FETCH_TYPE_STRSIZE 1 |
231 | 231 | ||
232 | /* | ||
233 | * Fetch type information table. | ||
234 | * It's declared as a weak symbol due to conditional compilation. | ||
235 | */ | ||
236 | extern __weak const struct fetch_type kprobes_fetch_type_table[]; | ||
237 | extern __weak const struct fetch_type uprobes_fetch_type_table[]; | ||
238 | |||
239 | #ifdef CONFIG_KPROBE_EVENT | 232 | #ifdef CONFIG_KPROBE_EVENT |
240 | struct symbol_cache; | 233 | struct symbol_cache; |
241 | unsigned long update_symbol_cache(struct symbol_cache *sc); | 234 | unsigned long update_symbol_cache(struct symbol_cache *sc); |
@@ -333,7 +326,8 @@ find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) | |||
333 | } | 326 | } |
334 | 327 | ||
335 | extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | 328 | extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, |
336 | struct probe_arg *parg, bool is_return, bool is_kprobe); | 329 | struct probe_arg *parg, bool is_return, bool is_kprobe, |
330 | const struct fetch_type *ftbl); | ||
337 | 331 | ||
338 | extern int traceprobe_conflict_field_name(const char *name, | 332 | extern int traceprobe_conflict_field_name(const char *name, |
339 | struct probe_arg *args, int narg); | 333 | struct probe_arg *args, int narg); |
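The trace_probe changes above drop the weak kprobes_fetch_type_table/uprobes_fetch_type_table symbols and instead have each caller pass its fetch-type table into traceprobe_parse_probe_arg(), making the dependency explicit in the signature and removing the BUG_ON(ftbl == NULL) check. A standalone sketch of that refactor pattern with made-up, heavily simplified types (not the kernel structures):

#include <stdio.h>
#include <string.h>

struct fetch_type {
	const char *name;
	int size;
};

/* each subsystem owns its own table and hands it to the parser */
static const struct fetch_type kprobe_types[] = {
	{ "u32", 4 }, { "u64", 8 }, { NULL, 0 },
};

static const struct fetch_type uprobe_types[] = {
	{ "string", 1 }, { "u32", 4 }, { NULL, 0 },
};

/* the table is now a parameter rather than a weak extern symbol */
static const struct fetch_type *find_fetch_type(const char *name,
						const struct fetch_type *ftbl)
{
	for (; ftbl->name; ftbl++)
		if (strcmp(ftbl->name, name) == 0)
			return ftbl;
	return NULL;
}

int main(void)
{
	const struct fetch_type *t;

	t = find_fetch_type("u64", kprobe_types);	/* kprobe-style caller */
	printf("kprobe u64 size: %d\n", t ? t->size : -1);

	t = find_fetch_type("string", uprobe_types);	/* uprobe-style caller */
	printf("uprobe string size: %d\n", t ? t->size : -1);
	return 0;
}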
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 75e19e86c954..6cf935316769 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c | |||
@@ -12,7 +12,7 @@ | |||
12 | #include <linux/list.h> | 12 | #include <linux/list.h> |
13 | #include <linux/slab.h> | 13 | #include <linux/slab.h> |
14 | #include <linux/rbtree.h> | 14 | #include <linux/rbtree.h> |
15 | #include <linux/debugfs.h> | 15 | #include <linux/tracefs.h> |
16 | #include "trace_stat.h" | 16 | #include "trace_stat.h" |
17 | #include "trace.h" | 17 | #include "trace.h" |
18 | 18 | ||
@@ -65,7 +65,7 @@ static void reset_stat_session(struct stat_session *session) | |||
65 | 65 | ||
66 | static void destroy_session(struct stat_session *session) | 66 | static void destroy_session(struct stat_session *session) |
67 | { | 67 | { |
68 | debugfs_remove(session->file); | 68 | tracefs_remove(session->file); |
69 | __reset_stat_session(session); | 69 | __reset_stat_session(session); |
70 | mutex_destroy(&session->stat_mutex); | 70 | mutex_destroy(&session->stat_mutex); |
71 | kfree(session); | 71 | kfree(session); |
@@ -279,9 +279,9 @@ static int tracing_stat_init(void) | |||
279 | if (IS_ERR(d_tracing)) | 279 | if (IS_ERR(d_tracing)) |
280 | return 0; | 280 | return 0; |
281 | 281 | ||
282 | stat_dir = debugfs_create_dir("trace_stat", d_tracing); | 282 | stat_dir = tracefs_create_dir("trace_stat", d_tracing); |
283 | if (!stat_dir) | 283 | if (!stat_dir) |
284 | pr_warning("Could not create debugfs " | 284 | pr_warning("Could not create tracefs " |
285 | "'trace_stat' entry\n"); | 285 | "'trace_stat' entry\n"); |
286 | return 0; | 286 | return 0; |
287 | } | 287 | } |
@@ -291,7 +291,7 @@ static int init_stat_file(struct stat_session *session) | |||
291 | if (!stat_dir && tracing_stat_init()) | 291 | if (!stat_dir && tracing_stat_init()) |
292 | return -ENODEV; | 292 | return -ENODEV; |
293 | 293 | ||
294 | session->file = debugfs_create_file(session->ts->name, 0644, | 294 | session->file = tracefs_create_file(session->ts->name, 0644, |
295 | stat_dir, | 295 | stat_dir, |
296 | session, &tracing_stat_fops); | 296 | session, &tracing_stat_fops); |
297 | if (!session->file) | 297 | if (!session->file) |
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 7dc1c8abecd6..74865465e0b7 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
@@ -196,7 +196,7 @@ DEFINE_FETCH_file_offset(string) | |||
196 | DEFINE_FETCH_file_offset(string_size) | 196 | DEFINE_FETCH_file_offset(string_size) |
197 | 197 | ||
198 | /* Fetch type information table */ | 198 | /* Fetch type information table */ |
199 | const struct fetch_type uprobes_fetch_type_table[] = { | 199 | static const struct fetch_type uprobes_fetch_type_table[] = { |
200 | /* Special types */ | 200 | /* Special types */ |
201 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, | 201 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, |
202 | sizeof(u32), 1, "__data_loc char[]"), | 202 | sizeof(u32), 1, "__data_loc char[]"), |
@@ -535,7 +535,8 @@ static int create_trace_uprobe(int argc, char **argv) | |||
535 | 535 | ||
536 | /* Parse fetch argument */ | 536 | /* Parse fetch argument */ |
537 | ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, | 537 | ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, |
538 | is_return, false); | 538 | is_return, false, |
539 | uprobes_fetch_type_table); | ||
539 | if (ret) { | 540 | if (ret) { |
540 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); | 541 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); |
541 | goto error; | 542 | goto error; |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 41ff75b478c6..586ad91300b0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -159,6 +159,7 @@ struct worker_pool { | |||
159 | 159 | ||
160 | /* see manage_workers() for details on the two manager mutexes */ | 160 | /* see manage_workers() for details on the two manager mutexes */ |
161 | struct mutex manager_arb; /* manager arbitration */ | 161 | struct mutex manager_arb; /* manager arbitration */ |
162 | struct worker *manager; /* L: purely informational */ | ||
162 | struct mutex attach_mutex; /* attach/detach exclusion */ | 163 | struct mutex attach_mutex; /* attach/detach exclusion */ |
163 | struct list_head workers; /* A: attached workers */ | 164 | struct list_head workers; /* A: attached workers */ |
164 | struct completion *detach_completion; /* all workers detached */ | 165 | struct completion *detach_completion; /* all workers detached */ |
@@ -230,7 +231,7 @@ struct wq_device; | |||
230 | */ | 231 | */ |
231 | struct workqueue_struct { | 232 | struct workqueue_struct { |
232 | struct list_head pwqs; /* WR: all pwqs of this wq */ | 233 | struct list_head pwqs; /* WR: all pwqs of this wq */ |
233 | struct list_head list; /* PL: list of all workqueues */ | 234 | struct list_head list; /* PR: list of all workqueues */ |
234 | 235 | ||
235 | struct mutex mutex; /* protects this wq */ | 236 | struct mutex mutex; /* protects this wq */ |
236 | int work_color; /* WQ: current work color */ | 237 | int work_color; /* WQ: current work color */ |
@@ -257,6 +258,13 @@ struct workqueue_struct { | |||
257 | #endif | 258 | #endif |
258 | char name[WQ_NAME_LEN]; /* I: workqueue name */ | 259 | char name[WQ_NAME_LEN]; /* I: workqueue name */ |
259 | 260 | ||
261 | /* | ||
262 | * Destruction of workqueue_struct is sched-RCU protected to allow | ||
263 | * walking the workqueues list without grabbing wq_pool_mutex. | ||
264 | * This is used to dump all workqueues from sysrq. | ||
265 | */ | ||
266 | struct rcu_head rcu; | ||
267 | |||
260 | /* hot fields used during command issue, aligned to cacheline */ | 268 | /* hot fields used during command issue, aligned to cacheline */ |
261 | unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ | 269 | unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ |
262 | struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ | 270 | struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ |
@@ -288,7 +296,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; | |||
288 | static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ | 296 | static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ |
289 | static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ | 297 | static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ |
290 | 298 | ||
291 | static LIST_HEAD(workqueues); /* PL: list of all workqueues */ | 299 | static LIST_HEAD(workqueues); /* PR: list of all workqueues */ |
292 | static bool workqueue_freezing; /* PL: have wqs started freezing? */ | 300 | static bool workqueue_freezing; /* PL: have wqs started freezing? */ |
293 | 301 | ||
294 | /* the per-cpu worker pools */ | 302 | /* the per-cpu worker pools */ |
@@ -324,6 +332,7 @@ EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); | |||
324 | static int worker_thread(void *__worker); | 332 | static int worker_thread(void *__worker); |
325 | static void copy_workqueue_attrs(struct workqueue_attrs *to, | 333 | static void copy_workqueue_attrs(struct workqueue_attrs *to, |
326 | const struct workqueue_attrs *from); | 334 | const struct workqueue_attrs *from); |
335 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq); | ||
327 | 336 | ||
328 | #define CREATE_TRACE_POINTS | 337 | #define CREATE_TRACE_POINTS |
329 | #include <trace/events/workqueue.h> | 338 | #include <trace/events/workqueue.h> |
@@ -1911,9 +1920,11 @@ static bool manage_workers(struct worker *worker) | |||
1911 | */ | 1920 | */ |
1912 | if (!mutex_trylock(&pool->manager_arb)) | 1921 | if (!mutex_trylock(&pool->manager_arb)) |
1913 | return false; | 1922 | return false; |
1923 | pool->manager = worker; | ||
1914 | 1924 | ||
1915 | maybe_create_worker(pool); | 1925 | maybe_create_worker(pool); |
1916 | 1926 | ||
1927 | pool->manager = NULL; | ||
1917 | mutex_unlock(&pool->manager_arb); | 1928 | mutex_unlock(&pool->manager_arb); |
1918 | return true; | 1929 | return true; |
1919 | } | 1930 | } |
@@ -2303,6 +2314,7 @@ repeat: | |||
2303 | struct wq_barrier { | 2314 | struct wq_barrier { |
2304 | struct work_struct work; | 2315 | struct work_struct work; |
2305 | struct completion done; | 2316 | struct completion done; |
2317 | struct task_struct *task; /* purely informational */ | ||
2306 | }; | 2318 | }; |
2307 | 2319 | ||
2308 | static void wq_barrier_func(struct work_struct *work) | 2320 | static void wq_barrier_func(struct work_struct *work) |
@@ -2351,6 +2363,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, | |||
2351 | INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); | 2363 | INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); |
2352 | __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); | 2364 | __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); |
2353 | init_completion(&barr->done); | 2365 | init_completion(&barr->done); |
2366 | barr->task = current; | ||
2354 | 2367 | ||
2355 | /* | 2368 | /* |
2356 | * If @target is currently being executed, schedule the | 2369 | * If @target is currently being executed, schedule the |
@@ -2989,323 +3002,6 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew) | |||
2989 | } | 3002 | } |
2990 | EXPORT_SYMBOL_GPL(execute_in_process_context); | 3003 | EXPORT_SYMBOL_GPL(execute_in_process_context); |
2991 | 3004 | ||
2992 | #ifdef CONFIG_SYSFS | ||
2993 | /* | ||
2994 | * Workqueues with WQ_SYSFS flag set is visible to userland via | ||
2995 | * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the | ||
2996 | * following attributes. | ||
2997 | * | ||
2998 | * per_cpu RO bool : whether the workqueue is per-cpu or unbound | ||
2999 | * max_active RW int : maximum number of in-flight work items | ||
3000 | * | ||
3001 | * Unbound workqueues have the following extra attributes. | ||
3002 | * | ||
3003 | * id RO int : the associated pool ID | ||
3004 | * nice RW int : nice value of the workers | ||
3005 | * cpumask RW mask : bitmask of allowed CPUs for the workers | ||
3006 | */ | ||
3007 | struct wq_device { | ||
3008 | struct workqueue_struct *wq; | ||
3009 | struct device dev; | ||
3010 | }; | ||
3011 | |||
3012 | static struct workqueue_struct *dev_to_wq(struct device *dev) | ||
3013 | { | ||
3014 | struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); | ||
3015 | |||
3016 | return wq_dev->wq; | ||
3017 | } | ||
3018 | |||
3019 | static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, | ||
3020 | char *buf) | ||
3021 | { | ||
3022 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3023 | |||
3024 | return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); | ||
3025 | } | ||
3026 | static DEVICE_ATTR_RO(per_cpu); | ||
3027 | |||
3028 | static ssize_t max_active_show(struct device *dev, | ||
3029 | struct device_attribute *attr, char *buf) | ||
3030 | { | ||
3031 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3032 | |||
3033 | return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); | ||
3034 | } | ||
3035 | |||
3036 | static ssize_t max_active_store(struct device *dev, | ||
3037 | struct device_attribute *attr, const char *buf, | ||
3038 | size_t count) | ||
3039 | { | ||
3040 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3041 | int val; | ||
3042 | |||
3043 | if (sscanf(buf, "%d", &val) != 1 || val <= 0) | ||
3044 | return -EINVAL; | ||
3045 | |||
3046 | workqueue_set_max_active(wq, val); | ||
3047 | return count; | ||
3048 | } | ||
3049 | static DEVICE_ATTR_RW(max_active); | ||
3050 | |||
3051 | static struct attribute *wq_sysfs_attrs[] = { | ||
3052 | &dev_attr_per_cpu.attr, | ||
3053 | &dev_attr_max_active.attr, | ||
3054 | NULL, | ||
3055 | }; | ||
3056 | ATTRIBUTE_GROUPS(wq_sysfs); | ||
3057 | |||
3058 | static ssize_t wq_pool_ids_show(struct device *dev, | ||
3059 | struct device_attribute *attr, char *buf) | ||
3060 | { | ||
3061 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3062 | const char *delim = ""; | ||
3063 | int node, written = 0; | ||
3064 | |||
3065 | rcu_read_lock_sched(); | ||
3066 | for_each_node(node) { | ||
3067 | written += scnprintf(buf + written, PAGE_SIZE - written, | ||
3068 | "%s%d:%d", delim, node, | ||
3069 | unbound_pwq_by_node(wq, node)->pool->id); | ||
3070 | delim = " "; | ||
3071 | } | ||
3072 | written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); | ||
3073 | rcu_read_unlock_sched(); | ||
3074 | |||
3075 | return written; | ||
3076 | } | ||
3077 | |||
3078 | static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, | ||
3079 | char *buf) | ||
3080 | { | ||
3081 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3082 | int written; | ||
3083 | |||
3084 | mutex_lock(&wq->mutex); | ||
3085 | written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); | ||
3086 | mutex_unlock(&wq->mutex); | ||
3087 | |||
3088 | return written; | ||
3089 | } | ||
3090 | |||
3091 | /* prepare workqueue_attrs for sysfs store operations */ | ||
3092 | static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) | ||
3093 | { | ||
3094 | struct workqueue_attrs *attrs; | ||
3095 | |||
3096 | attrs = alloc_workqueue_attrs(GFP_KERNEL); | ||
3097 | if (!attrs) | ||
3098 | return NULL; | ||
3099 | |||
3100 | mutex_lock(&wq->mutex); | ||
3101 | copy_workqueue_attrs(attrs, wq->unbound_attrs); | ||
3102 | mutex_unlock(&wq->mutex); | ||
3103 | return attrs; | ||
3104 | } | ||
3105 | |||
3106 | static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, | ||
3107 | const char *buf, size_t count) | ||
3108 | { | ||
3109 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3110 | struct workqueue_attrs *attrs; | ||
3111 | int ret; | ||
3112 | |||
3113 | attrs = wq_sysfs_prep_attrs(wq); | ||
3114 | if (!attrs) | ||
3115 | return -ENOMEM; | ||
3116 | |||
3117 | if (sscanf(buf, "%d", &attrs->nice) == 1 && | ||
3118 | attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) | ||
3119 | ret = apply_workqueue_attrs(wq, attrs); | ||
3120 | else | ||
3121 | ret = -EINVAL; | ||
3122 | |||
3123 | free_workqueue_attrs(attrs); | ||
3124 | return ret ?: count; | ||
3125 | } | ||
3126 | |||
3127 | static ssize_t wq_cpumask_show(struct device *dev, | ||
3128 | struct device_attribute *attr, char *buf) | ||
3129 | { | ||
3130 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3131 | int written; | ||
3132 | |||
3133 | mutex_lock(&wq->mutex); | ||
3134 | written = scnprintf(buf, PAGE_SIZE, "%*pb\n", | ||
3135 | cpumask_pr_args(wq->unbound_attrs->cpumask)); | ||
3136 | mutex_unlock(&wq->mutex); | ||
3137 | return written; | ||
3138 | } | ||
3139 | |||
3140 | static ssize_t wq_cpumask_store(struct device *dev, | ||
3141 | struct device_attribute *attr, | ||
3142 | const char *buf, size_t count) | ||
3143 | { | ||
3144 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3145 | struct workqueue_attrs *attrs; | ||
3146 | int ret; | ||
3147 | |||
3148 | attrs = wq_sysfs_prep_attrs(wq); | ||
3149 | if (!attrs) | ||
3150 | return -ENOMEM; | ||
3151 | |||
3152 | ret = cpumask_parse(buf, attrs->cpumask); | ||
3153 | if (!ret) | ||
3154 | ret = apply_workqueue_attrs(wq, attrs); | ||
3155 | |||
3156 | free_workqueue_attrs(attrs); | ||
3157 | return ret ?: count; | ||
3158 | } | ||
3159 | |||
3160 | static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, | ||
3161 | char *buf) | ||
3162 | { | ||
3163 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3164 | int written; | ||
3165 | |||
3166 | mutex_lock(&wq->mutex); | ||
3167 | written = scnprintf(buf, PAGE_SIZE, "%d\n", | ||
3168 | !wq->unbound_attrs->no_numa); | ||
3169 | mutex_unlock(&wq->mutex); | ||
3170 | |||
3171 | return written; | ||
3172 | } | ||
3173 | |||
3174 | static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, | ||
3175 | const char *buf, size_t count) | ||
3176 | { | ||
3177 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3178 | struct workqueue_attrs *attrs; | ||
3179 | int v, ret; | ||
3180 | |||
3181 | attrs = wq_sysfs_prep_attrs(wq); | ||
3182 | if (!attrs) | ||
3183 | return -ENOMEM; | ||
3184 | |||
3185 | ret = -EINVAL; | ||
3186 | if (sscanf(buf, "%d", &v) == 1) { | ||
3187 | attrs->no_numa = !v; | ||
3188 | ret = apply_workqueue_attrs(wq, attrs); | ||
3189 | } | ||
3190 | |||
3191 | free_workqueue_attrs(attrs); | ||
3192 | return ret ?: count; | ||
3193 | } | ||
3194 | |||
3195 | static struct device_attribute wq_sysfs_unbound_attrs[] = { | ||
3196 | __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), | ||
3197 | __ATTR(nice, 0644, wq_nice_show, wq_nice_store), | ||
3198 | __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), | ||
3199 | __ATTR(numa, 0644, wq_numa_show, wq_numa_store), | ||
3200 | __ATTR_NULL, | ||
3201 | }; | ||
3202 | |||
3203 | static struct bus_type wq_subsys = { | ||
3204 | .name = "workqueue", | ||
3205 | .dev_groups = wq_sysfs_groups, | ||
3206 | }; | ||
3207 | |||
3208 | static int __init wq_sysfs_init(void) | ||
3209 | { | ||
3210 | return subsys_virtual_register(&wq_subsys, NULL); | ||
3211 | } | ||
3212 | core_initcall(wq_sysfs_init); | ||
3213 | |||
3214 | static void wq_device_release(struct device *dev) | ||
3215 | { | ||
3216 | struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); | ||
3217 | |||
3218 | kfree(wq_dev); | ||
3219 | } | ||
3220 | |||
3221 | /** | ||
3222 | * workqueue_sysfs_register - make a workqueue visible in sysfs | ||
3223 | * @wq: the workqueue to register | ||
3224 | * | ||
3225 | * Expose @wq in sysfs under /sys/bus/workqueue/devices. | ||
3226 | * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set, | ||
3227 | * which is the preferred method. | ||
3228 | * | ||
3229 | * A workqueue user should use this function directly iff it wants to apply | ||
3230 | * workqueue_attrs before making the workqueue visible in sysfs; otherwise, | ||
3231 | * apply_workqueue_attrs() may race against userland updating the | ||
3232 | * attributes. | ||
3233 | * | ||
3234 | * Return: 0 on success, -errno on failure. | ||
3235 | */ | ||
3236 | int workqueue_sysfs_register(struct workqueue_struct *wq) | ||
3237 | { | ||
3238 | struct wq_device *wq_dev; | ||
3239 | int ret; | ||
3240 | |||
3241 | /* | ||
3242 | * Adjusting max_active or creating new pwqs by applying | ||
3243 | * attributes breaks the ordering guarantee. Disallow exposing ordered | ||
3244 | * workqueues. | ||
3245 | */ | ||
3246 | if (WARN_ON(wq->flags & __WQ_ORDERED)) | ||
3247 | return -EINVAL; | ||
3248 | |||
3249 | wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); | ||
3250 | if (!wq_dev) | ||
3251 | return -ENOMEM; | ||
3252 | |||
3253 | wq_dev->wq = wq; | ||
3254 | wq_dev->dev.bus = &wq_subsys; | ||
3255 | wq_dev->dev.init_name = wq->name; | ||
3256 | wq_dev->dev.release = wq_device_release; | ||
3257 | |||
3258 | /* | ||
3259 | * unbound_attrs are created separately. Suppress uevent until | ||
3260 | * everything is ready. | ||
3261 | */ | ||
3262 | dev_set_uevent_suppress(&wq_dev->dev, true); | ||
3263 | |||
3264 | ret = device_register(&wq_dev->dev); | ||
3265 | if (ret) { | ||
3266 | kfree(wq_dev); | ||
3267 | wq->wq_dev = NULL; | ||
3268 | return ret; | ||
3269 | } | ||
3270 | |||
3271 | if (wq->flags & WQ_UNBOUND) { | ||
3272 | struct device_attribute *attr; | ||
3273 | |||
3274 | for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { | ||
3275 | ret = device_create_file(&wq_dev->dev, attr); | ||
3276 | if (ret) { | ||
3277 | device_unregister(&wq_dev->dev); | ||
3278 | wq->wq_dev = NULL; | ||
3279 | return ret; | ||
3280 | } | ||
3281 | } | ||
3282 | } | ||
3283 | |||
3284 | dev_set_uevent_suppress(&wq_dev->dev, false); | ||
3285 | kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); | ||
3286 | return 0; | ||
3287 | } | ||
3288 | |||
3289 | /** | ||
3290 | * workqueue_sysfs_unregister - undo workqueue_sysfs_register() | ||
3291 | * @wq: the workqueue to unregister | ||
3292 | * | ||
3293 | * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. | ||
3294 | */ | ||
3295 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq) | ||
3296 | { | ||
3297 | struct wq_device *wq_dev = wq->wq_dev; | ||
3298 | |||
3299 | if (!wq->wq_dev) | ||
3300 | return; | ||
3301 | |||
3302 | wq->wq_dev = NULL; | ||
3303 | device_unregister(&wq_dev->dev); | ||
3304 | } | ||
3305 | #else /* CONFIG_SYSFS */ | ||
3306 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } | ||
3307 | #endif /* CONFIG_SYSFS */ | ||
3308 | |||
3309 | /** | 3005 | /** |
3310 | * free_workqueue_attrs - free a workqueue_attrs | 3006 | * free_workqueue_attrs - free a workqueue_attrs |
3311 | * @attrs: workqueue_attrs to free | 3007 | * @attrs: workqueue_attrs to free |
@@ -3424,6 +3120,20 @@ static int init_worker_pool(struct worker_pool *pool) | |||
3424 | return 0; | 3120 | return 0; |
3425 | } | 3121 | } |
3426 | 3122 | ||
3123 | static void rcu_free_wq(struct rcu_head *rcu) | ||
3124 | { | ||
3125 | struct workqueue_struct *wq = | ||
3126 | container_of(rcu, struct workqueue_struct, rcu); | ||
3127 | |||
3128 | if (!(wq->flags & WQ_UNBOUND)) | ||
3129 | free_percpu(wq->cpu_pwqs); | ||
3130 | else | ||
3131 | free_workqueue_attrs(wq->unbound_attrs); | ||
3132 | |||
3133 | kfree(wq->rescuer); | ||
3134 | kfree(wq); | ||
3135 | } | ||
3136 | |||
3427 | static void rcu_free_pool(struct rcu_head *rcu) | 3137 | static void rcu_free_pool(struct rcu_head *rcu) |
3428 | { | 3138 | { |
3429 | struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); | 3139 | struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); |
@@ -3601,12 +3311,10 @@ static void pwq_unbound_release_workfn(struct work_struct *work) | |||
3601 | 3311 | ||
3602 | /* | 3312 | /* |
3603 | * If we're the last pwq going away, @wq is already dead and no one | 3313 | * If we're the last pwq going away, @wq is already dead and no one |
3604 | * is gonna access it anymore. Free it. | 3314 | * is gonna access it anymore. Schedule RCU free. |
3605 | */ | 3315 | */ |
3606 | if (is_last) { | 3316 | if (is_last) |
3607 | free_workqueue_attrs(wq->unbound_attrs); | 3317 | call_rcu_sched(&wq->rcu, rcu_free_wq); |
3608 | kfree(wq); | ||
3609 | } | ||
3610 | } | 3318 | } |
3611 | 3319 | ||
3612 | /** | 3320 | /** |
@@ -4143,7 +3851,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, | |||
4143 | pwq_adjust_max_active(pwq); | 3851 | pwq_adjust_max_active(pwq); |
4144 | mutex_unlock(&wq->mutex); | 3852 | mutex_unlock(&wq->mutex); |
4145 | 3853 | ||
4146 | list_add(&wq->list, &workqueues); | 3854 | list_add_tail_rcu(&wq->list, &workqueues); |
4147 | 3855 | ||
4148 | mutex_unlock(&wq_pool_mutex); | 3856 | mutex_unlock(&wq_pool_mutex); |
4149 | 3857 | ||
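The switch to list_add_tail_rcu() here, together with the list_del_rcu() in destroy_workqueue() below and the call_rcu_sched() deferrals through rcu_free_wq(), is the usual sched-RCU publish/retract pattern for the global workqueues list. A generic sketch of both sides (illustrative only, reusing the names from this patch, error handling omitted):

	/* Writer side: publish/retract under wq_pool_mutex, free after a grace period. */
	mutex_lock(&wq_pool_mutex);
	list_add_tail_rcu(&wq->list, &workqueues);	/* publish */
	mutex_unlock(&wq_pool_mutex);

	mutex_lock(&wq_pool_mutex);
	list_del_rcu(&wq->list);			/* retract */
	mutex_unlock(&wq_pool_mutex);
	call_rcu_sched(&wq->rcu, rcu_free_wq);		/* free once sched-RCU readers are done */

	/* Reader side, as used by show_workqueue_state() further down. */
	rcu_read_lock_sched();
	list_for_each_entry_rcu(wq, &workqueues, list)
		pr_info("workqueue %s\n", wq->name);	/* @wq cannot be freed under us here */
	rcu_read_unlock_sched();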
@@ -4199,24 +3907,20 @@ void destroy_workqueue(struct workqueue_struct *wq) | |||
4199 | * flushing is complete in case freeze races us. | 3907 | * flushing is complete in case freeze races us. |
4200 | */ | 3908 | */ |
4201 | mutex_lock(&wq_pool_mutex); | 3909 | mutex_lock(&wq_pool_mutex); |
4202 | list_del_init(&wq->list); | 3910 | list_del_rcu(&wq->list); |
4203 | mutex_unlock(&wq_pool_mutex); | 3911 | mutex_unlock(&wq_pool_mutex); |
4204 | 3912 | ||
4205 | workqueue_sysfs_unregister(wq); | 3913 | workqueue_sysfs_unregister(wq); |
4206 | 3914 | ||
4207 | if (wq->rescuer) { | 3915 | if (wq->rescuer) |
4208 | kthread_stop(wq->rescuer->task); | 3916 | kthread_stop(wq->rescuer->task); |
4209 | kfree(wq->rescuer); | ||
4210 | wq->rescuer = NULL; | ||
4211 | } | ||
4212 | 3917 | ||
4213 | if (!(wq->flags & WQ_UNBOUND)) { | 3918 | if (!(wq->flags & WQ_UNBOUND)) { |
4214 | /* | 3919 | /* |
4215 | * The base ref is never dropped on per-cpu pwqs. Directly | 3920 | * The base ref is never dropped on per-cpu pwqs. Directly |
4216 | * free the pwqs and wq. | 3921 | * schedule RCU free. |
4217 | */ | 3922 | */ |
4218 | free_percpu(wq->cpu_pwqs); | 3923 | call_rcu_sched(&wq->rcu, rcu_free_wq); |
4219 | kfree(wq); | ||
4220 | } else { | 3924 | } else { |
4221 | /* | 3925 | /* |
4222 | * We're the sole accessor of @wq at this point. Directly | 3926 | * We're the sole accessor of @wq at this point. Directly |
@@ -4437,6 +4141,166 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) | |||
4437 | } | 4141 | } |
4438 | } | 4142 | } |
4439 | 4143 | ||
4144 | static void pr_cont_pool_info(struct worker_pool *pool) | ||
4145 | { | ||
4146 | pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask); | ||
4147 | if (pool->node != NUMA_NO_NODE) | ||
4148 | pr_cont(" node=%d", pool->node); | ||
4149 | pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice); | ||
4150 | } | ||
4151 | |||
4152 | static void pr_cont_work(bool comma, struct work_struct *work) | ||
4153 | { | ||
4154 | if (work->func == wq_barrier_func) { | ||
4155 | struct wq_barrier *barr; | ||
4156 | |||
4157 | barr = container_of(work, struct wq_barrier, work); | ||
4158 | |||
4159 | pr_cont("%s BAR(%d)", comma ? "," : "", | ||
4160 | task_pid_nr(barr->task)); | ||
4161 | } else { | ||
4162 | pr_cont("%s %pf", comma ? "," : "", work->func); | ||
4163 | } | ||
4164 | } | ||
4165 | |||
4166 | static void show_pwq(struct pool_workqueue *pwq) | ||
4167 | { | ||
4168 | struct worker_pool *pool = pwq->pool; | ||
4169 | struct work_struct *work; | ||
4170 | struct worker *worker; | ||
4171 | bool has_in_flight = false, has_pending = false; | ||
4172 | int bkt; | ||
4173 | |||
4174 | pr_info(" pwq %d:", pool->id); | ||
4175 | pr_cont_pool_info(pool); | ||
4176 | |||
4177 | pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active, | ||
4178 | !list_empty(&pwq->mayday_node) ? " MAYDAY" : ""); | ||
4179 | |||
4180 | hash_for_each(pool->busy_hash, bkt, worker, hentry) { | ||
4181 | if (worker->current_pwq == pwq) { | ||
4182 | has_in_flight = true; | ||
4183 | break; | ||
4184 | } | ||
4185 | } | ||
4186 | if (has_in_flight) { | ||
4187 | bool comma = false; | ||
4188 | |||
4189 | pr_info(" in-flight:"); | ||
4190 | hash_for_each(pool->busy_hash, bkt, worker, hentry) { | ||
4191 | if (worker->current_pwq != pwq) | ||
4192 | continue; | ||
4193 | |||
4194 | pr_cont("%s %d%s:%pf", comma ? "," : "", | ||
4195 | task_pid_nr(worker->task), | ||
4196 | worker == pwq->wq->rescuer ? "(RESCUER)" : "", | ||
4197 | worker->current_func); | ||
4198 | list_for_each_entry(work, &worker->scheduled, entry) | ||
4199 | pr_cont_work(false, work); | ||
4200 | comma = true; | ||
4201 | } | ||
4202 | pr_cont("\n"); | ||
4203 | } | ||
4204 | |||
4205 | list_for_each_entry(work, &pool->worklist, entry) { | ||
4206 | if (get_work_pwq(work) == pwq) { | ||
4207 | has_pending = true; | ||
4208 | break; | ||
4209 | } | ||
4210 | } | ||
4211 | if (has_pending) { | ||
4212 | bool comma = false; | ||
4213 | |||
4214 | pr_info(" pending:"); | ||
4215 | list_for_each_entry(work, &pool->worklist, entry) { | ||
4216 | if (get_work_pwq(work) != pwq) | ||
4217 | continue; | ||
4218 | |||
4219 | pr_cont_work(comma, work); | ||
4220 | comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); | ||
4221 | } | ||
4222 | pr_cont("\n"); | ||
4223 | } | ||
4224 | |||
4225 | if (!list_empty(&pwq->delayed_works)) { | ||
4226 | bool comma = false; | ||
4227 | |||
4228 | pr_info(" delayed:"); | ||
4229 | list_for_each_entry(work, &pwq->delayed_works, entry) { | ||
4230 | pr_cont_work(comma, work); | ||
4231 | comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); | ||
4232 | } | ||
4233 | pr_cont("\n"); | ||
4234 | } | ||
4235 | } | ||
4236 | |||
4237 | /** | ||
4238 | * show_workqueue_state - dump workqueue state | ||
4239 | * | ||
4240 | * Called from a sysrq handler and prints out all busy workqueues and | ||
4241 | * pools. | ||
4242 | */ | ||
4243 | void show_workqueue_state(void) | ||
4244 | { | ||
4245 | struct workqueue_struct *wq; | ||
4246 | struct worker_pool *pool; | ||
4247 | unsigned long flags; | ||
4248 | int pi; | ||
4249 | |||
4250 | rcu_read_lock_sched(); | ||
4251 | |||
4252 | pr_info("Showing busy workqueues and worker pools:\n"); | ||
4253 | |||
4254 | list_for_each_entry_rcu(wq, &workqueues, list) { | ||
4255 | struct pool_workqueue *pwq; | ||
4256 | bool idle = true; | ||
4257 | |||
4258 | for_each_pwq(pwq, wq) { | ||
4259 | if (pwq->nr_active || !list_empty(&pwq->delayed_works)) { | ||
4260 | idle = false; | ||
4261 | break; | ||
4262 | } | ||
4263 | } | ||
4264 | if (idle) | ||
4265 | continue; | ||
4266 | |||
4267 | pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags); | ||
4268 | |||
4269 | for_each_pwq(pwq, wq) { | ||
4270 | spin_lock_irqsave(&pwq->pool->lock, flags); | ||
4271 | if (pwq->nr_active || !list_empty(&pwq->delayed_works)) | ||
4272 | show_pwq(pwq); | ||
4273 | spin_unlock_irqrestore(&pwq->pool->lock, flags); | ||
4274 | } | ||
4275 | } | ||
4276 | |||
4277 | for_each_pool(pool, pi) { | ||
4278 | struct worker *worker; | ||
4279 | bool first = true; | ||
4280 | |||
4281 | spin_lock_irqsave(&pool->lock, flags); | ||
4282 | if (pool->nr_workers == pool->nr_idle) | ||
4283 | goto next_pool; | ||
4284 | |||
4285 | pr_info("pool %d:", pool->id); | ||
4286 | pr_cont_pool_info(pool); | ||
4287 | pr_cont(" workers=%d", pool->nr_workers); | ||
4288 | if (pool->manager) | ||
4289 | pr_cont(" manager: %d", | ||
4290 | task_pid_nr(pool->manager->task)); | ||
4291 | list_for_each_entry(worker, &pool->idle_list, entry) { | ||
4292 | pr_cont(" %s%d", first ? "idle: " : "", | ||
4293 | task_pid_nr(worker->task)); | ||
4294 | first = false; | ||
4295 | } | ||
4296 | pr_cont("\n"); | ||
4297 | next_pool: | ||
4298 | spin_unlock_irqrestore(&pool->lock, flags); | ||
4299 | } | ||
4300 | |||
4301 | rcu_read_unlock_sched(); | ||
4302 | } | ||
4303 | |||
4440 | /* | 4304 | /* |
4441 | * CPU hotplug. | 4305 | * CPU hotplug. |
4442 | * | 4306 | * |
@@ -4834,6 +4698,323 @@ out_unlock: | |||
4834 | } | 4698 | } |
4835 | #endif /* CONFIG_FREEZER */ | 4699 | #endif /* CONFIG_FREEZER */ |
4836 | 4700 | ||
4701 | #ifdef CONFIG_SYSFS | ||
4702 | /* | ||
4703 | * Workqueues with the WQ_SYSFS flag set are visible to userland via | ||
4704 | * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the | ||
4705 | * following attributes. | ||
4706 | * | ||
4707 | * per_cpu RO bool : whether the workqueue is per-cpu or unbound | ||
4708 | * max_active RW int : maximum number of in-flight work items | ||
4709 | * | ||
4710 | * Unbound workqueues have the following extra attributes. | ||
4711 | * | ||
4712 | * pool_ids RO int : the associated pool IDs for each node | ||
4713 | * nice RW int : nice value of the workers | ||
4714 | * cpumask RW mask : bitmask of allowed CPUs for the workers | ||
4715 | */ | ||
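For context, exposing a workqueue under this hierarchy only requires passing WQ_SYSFS at allocation time. The fragment below is a hypothetical driver init (name and flags are illustrative, not part of this patch):

	#include <linux/init.h>
	#include <linux/workqueue.h>

	static struct workqueue_struct *mydrv_wq;	/* illustrative */

	static int __init mydrv_init(void)
	{
		/*
		 * Appears as /sys/bus/workqueue/devices/mydrv_wq with the per_cpu
		 * and max_active attributes, plus the unbound-only ones above.
		 */
		mydrv_wq = alloc_workqueue("mydrv_wq", WQ_UNBOUND | WQ_SYSFS, 0);
		return mydrv_wq ? 0 : -ENOMEM;
	}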
4716 | struct wq_device { | ||
4717 | struct workqueue_struct *wq; | ||
4718 | struct device dev; | ||
4719 | }; | ||
4720 | |||
4721 | static struct workqueue_struct *dev_to_wq(struct device *dev) | ||
4722 | { | ||
4723 | struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); | ||
4724 | |||
4725 | return wq_dev->wq; | ||
4726 | } | ||
4727 | |||
4728 | static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, | ||
4729 | char *buf) | ||
4730 | { | ||
4731 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4732 | |||
4733 | return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); | ||
4734 | } | ||
4735 | static DEVICE_ATTR_RO(per_cpu); | ||
4736 | |||
4737 | static ssize_t max_active_show(struct device *dev, | ||
4738 | struct device_attribute *attr, char *buf) | ||
4739 | { | ||
4740 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4741 | |||
4742 | return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); | ||
4743 | } | ||
4744 | |||
4745 | static ssize_t max_active_store(struct device *dev, | ||
4746 | struct device_attribute *attr, const char *buf, | ||
4747 | size_t count) | ||
4748 | { | ||
4749 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4750 | int val; | ||
4751 | |||
4752 | if (sscanf(buf, "%d", &val) != 1 || val <= 0) | ||
4753 | return -EINVAL; | ||
4754 | |||
4755 | workqueue_set_max_active(wq, val); | ||
4756 | return count; | ||
4757 | } | ||
4758 | static DEVICE_ATTR_RW(max_active); | ||
4759 | |||
4760 | static struct attribute *wq_sysfs_attrs[] = { | ||
4761 | &dev_attr_per_cpu.attr, | ||
4762 | &dev_attr_max_active.attr, | ||
4763 | NULL, | ||
4764 | }; | ||
4765 | ATTRIBUTE_GROUPS(wq_sysfs); | ||
4766 | |||
4767 | static ssize_t wq_pool_ids_show(struct device *dev, | ||
4768 | struct device_attribute *attr, char *buf) | ||
4769 | { | ||
4770 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4771 | const char *delim = ""; | ||
4772 | int node, written = 0; | ||
4773 | |||
4774 | rcu_read_lock_sched(); | ||
4775 | for_each_node(node) { | ||
4776 | written += scnprintf(buf + written, PAGE_SIZE - written, | ||
4777 | "%s%d:%d", delim, node, | ||
4778 | unbound_pwq_by_node(wq, node)->pool->id); | ||
4779 | delim = " "; | ||
4780 | } | ||
4781 | written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); | ||
4782 | rcu_read_unlock_sched(); | ||
4783 | |||
4784 | return written; | ||
4785 | } | ||
4786 | |||
4787 | static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, | ||
4788 | char *buf) | ||
4789 | { | ||
4790 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4791 | int written; | ||
4792 | |||
4793 | mutex_lock(&wq->mutex); | ||
4794 | written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); | ||
4795 | mutex_unlock(&wq->mutex); | ||
4796 | |||
4797 | return written; | ||
4798 | } | ||
4799 | |||
4800 | /* prepare workqueue_attrs for sysfs store operations */ | ||
4801 | static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) | ||
4802 | { | ||
4803 | struct workqueue_attrs *attrs; | ||
4804 | |||
4805 | attrs = alloc_workqueue_attrs(GFP_KERNEL); | ||
4806 | if (!attrs) | ||
4807 | return NULL; | ||
4808 | |||
4809 | mutex_lock(&wq->mutex); | ||
4810 | copy_workqueue_attrs(attrs, wq->unbound_attrs); | ||
4811 | mutex_unlock(&wq->mutex); | ||
4812 | return attrs; | ||
4813 | } | ||
4814 | |||
4815 | static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, | ||
4816 | const char *buf, size_t count) | ||
4817 | { | ||
4818 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4819 | struct workqueue_attrs *attrs; | ||
4820 | int ret; | ||
4821 | |||
4822 | attrs = wq_sysfs_prep_attrs(wq); | ||
4823 | if (!attrs) | ||
4824 | return -ENOMEM; | ||
4825 | |||
4826 | if (sscanf(buf, "%d", &attrs->nice) == 1 && | ||
4827 | attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) | ||
4828 | ret = apply_workqueue_attrs(wq, attrs); | ||
4829 | else | ||
4830 | ret = -EINVAL; | ||
4831 | |||
4832 | free_workqueue_attrs(attrs); | ||
4833 | return ret ?: count; | ||
4834 | } | ||
4835 | |||
4836 | static ssize_t wq_cpumask_show(struct device *dev, | ||
4837 | struct device_attribute *attr, char *buf) | ||
4838 | { | ||
4839 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4840 | int written; | ||
4841 | |||
4842 | mutex_lock(&wq->mutex); | ||
4843 | written = scnprintf(buf, PAGE_SIZE, "%*pb\n", | ||
4844 | cpumask_pr_args(wq->unbound_attrs->cpumask)); | ||
4845 | mutex_unlock(&wq->mutex); | ||
4846 | return written; | ||
4847 | } | ||
4848 | |||
4849 | static ssize_t wq_cpumask_store(struct device *dev, | ||
4850 | struct device_attribute *attr, | ||
4851 | const char *buf, size_t count) | ||
4852 | { | ||
4853 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4854 | struct workqueue_attrs *attrs; | ||
4855 | int ret; | ||
4856 | |||
4857 | attrs = wq_sysfs_prep_attrs(wq); | ||
4858 | if (!attrs) | ||
4859 | return -ENOMEM; | ||
4860 | |||
4861 | ret = cpumask_parse(buf, attrs->cpumask); | ||
4862 | if (!ret) | ||
4863 | ret = apply_workqueue_attrs(wq, attrs); | ||
4864 | |||
4865 | free_workqueue_attrs(attrs); | ||
4866 | return ret ?: count; | ||
4867 | } | ||
4868 | |||
4869 | static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, | ||
4870 | char *buf) | ||
4871 | { | ||
4872 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4873 | int written; | ||
4874 | |||
4875 | mutex_lock(&wq->mutex); | ||
4876 | written = scnprintf(buf, PAGE_SIZE, "%d\n", | ||
4877 | !wq->unbound_attrs->no_numa); | ||
4878 | mutex_unlock(&wq->mutex); | ||
4879 | |||
4880 | return written; | ||
4881 | } | ||
4882 | |||
4883 | static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, | ||
4884 | const char *buf, size_t count) | ||
4885 | { | ||
4886 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4887 | struct workqueue_attrs *attrs; | ||
4888 | int v, ret; | ||
4889 | |||
4890 | attrs = wq_sysfs_prep_attrs(wq); | ||
4891 | if (!attrs) | ||
4892 | return -ENOMEM; | ||
4893 | |||
4894 | ret = -EINVAL; | ||
4895 | if (sscanf(buf, "%d", &v) == 1) { | ||
4896 | attrs->no_numa = !v; | ||
4897 | ret = apply_workqueue_attrs(wq, attrs); | ||
4898 | } | ||
4899 | |||
4900 | free_workqueue_attrs(attrs); | ||
4901 | return ret ?: count; | ||
4902 | } | ||
4903 | |||
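The nice, cpumask and numa stores above all run the same copy-modify-apply-free cycle on a workqueue_attrs. Kernel-side code can drive unbound attributes the same way without going through sysfs; a minimal sketch, assuming a hypothetical helper and the GFP-taking alloc_workqueue_attrs() of this kernel version:

	/* Hypothetical: pin a driver's unbound workqueue to CPU0 at nice -5. */
	static int mydrv_tune_wq(struct workqueue_struct *wq)
	{
		struct workqueue_attrs *attrs;
		int ret;

		attrs = alloc_workqueue_attrs(GFP_KERNEL);
		if (!attrs)
			return -ENOMEM;

		attrs->nice = -5;	/* must stay within MIN_NICE..MAX_NICE, cf. wq_nice_store() */
		cpumask_copy(attrs->cpumask, cpumask_of(0));
		ret = apply_workqueue_attrs(wq, attrs);	/* same call the sysfs stores end up in */
		free_workqueue_attrs(attrs);
		return ret;
	}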
4904 | static struct device_attribute wq_sysfs_unbound_attrs[] = { | ||
4905 | __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), | ||
4906 | __ATTR(nice, 0644, wq_nice_show, wq_nice_store), | ||
4907 | __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), | ||
4908 | __ATTR(numa, 0644, wq_numa_show, wq_numa_store), | ||
4909 | __ATTR_NULL, | ||
4910 | }; | ||
4911 | |||
4912 | static struct bus_type wq_subsys = { | ||
4913 | .name = "workqueue", | ||
4914 | .dev_groups = wq_sysfs_groups, | ||
4915 | }; | ||
4916 | |||
4917 | static int __init wq_sysfs_init(void) | ||
4918 | { | ||
4919 | return subsys_virtual_register(&wq_subsys, NULL); | ||
4920 | } | ||
4921 | core_initcall(wq_sysfs_init); | ||
4922 | |||
4923 | static void wq_device_release(struct device *dev) | ||
4924 | { | ||
4925 | struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); | ||
4926 | |||
4927 | kfree(wq_dev); | ||
4928 | } | ||
4929 | |||
4930 | /** | ||
4931 | * workqueue_sysfs_register - make a workqueue visible in sysfs | ||
4932 | * @wq: the workqueue to register | ||
4933 | * | ||
4934 | * Expose @wq in sysfs under /sys/bus/workqueue/devices. | ||
4935 | * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set | ||
4936 | * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set, | ||
4937 | * | ||
4938 | * A workqueue user should use this function directly iff it wants to apply | ||
4939 | * workqueue_attrs before making the workqueue visible in sysfs; otherwise, | ||
4940 | * apply_workqueue_attrs() may race against userland updating the | ||
4941 | * attributes. | ||
4942 | * | ||
4943 | * Return: 0 on success, -errno on failure. | ||
4944 | */ | ||
4945 | int workqueue_sysfs_register(struct workqueue_struct *wq) | ||
4946 | { | ||
4947 | struct wq_device *wq_dev; | ||
4948 | int ret; | ||
4949 | |||
4950 | /* | ||
4951 | * Adjusting max_active or creating new pwqs by applying | ||
4952 | * attributes breaks the ordering guarantee. Disallow exposing ordered | ||
4953 | * workqueues. | ||
4954 | */ | ||
4955 | if (WARN_ON(wq->flags & __WQ_ORDERED)) | ||
4956 | return -EINVAL; | ||
4957 | |||
4958 | wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); | ||
4959 | if (!wq_dev) | ||
4960 | return -ENOMEM; | ||
4961 | |||
4962 | wq_dev->wq = wq; | ||
4963 | wq_dev->dev.bus = &wq_subsys; | ||
4964 | wq_dev->dev.init_name = wq->name; | ||
4965 | wq_dev->dev.release = wq_device_release; | ||
4966 | |||
4967 | /* | ||
4968 | * unbound_attrs are created separately. Suppress uevent until | ||
4969 | * everything is ready. | ||
4970 | */ | ||
4971 | dev_set_uevent_suppress(&wq_dev->dev, true); | ||
4972 | |||
4973 | ret = device_register(&wq_dev->dev); | ||
4974 | if (ret) { | ||
4975 | kfree(wq_dev); | ||
4976 | wq->wq_dev = NULL; | ||
4977 | return ret; | ||
4978 | } | ||
4979 | |||
4980 | if (wq->flags & WQ_UNBOUND) { | ||
4981 | struct device_attribute *attr; | ||
4982 | |||
4983 | for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { | ||
4984 | ret = device_create_file(&wq_dev->dev, attr); | ||
4985 | if (ret) { | ||
4986 | device_unregister(&wq_dev->dev); | ||
4987 | wq->wq_dev = NULL; | ||
4988 | return ret; | ||
4989 | } | ||
4990 | } | ||
4991 | } | ||
4992 | |||
4993 | dev_set_uevent_suppress(&wq_dev->dev, false); | ||
4994 | kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); | ||
4995 | return 0; | ||
4996 | } | ||
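Per the comment above, the direct call is only for users that must apply attributes before the workqueue becomes visible; under that assumption the sequence is roughly the following (illustrative sketch, error handling trimmed, attrs prepared by the caller):

	/* Allocate without WQ_SYSFS so nothing is exposed to userland yet. */
	wq = alloc_workqueue("mydrv_wq", WQ_UNBOUND, 0);
	if (!wq)
		return -ENOMEM;

	ret = apply_workqueue_attrs(wq, attrs);		/* cannot race a sysfs write yet */
	if (!ret)
		ret = workqueue_sysfs_register(wq);	/* now make it visible */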
4997 | |||
4998 | /** | ||
4999 | * workqueue_sysfs_unregister - undo workqueue_sysfs_register() | ||
5000 | * @wq: the workqueue to unregister | ||
5001 | * | ||
5002 | * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. | ||
5003 | */ | ||
5004 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq) | ||
5005 | { | ||
5006 | struct wq_device *wq_dev = wq->wq_dev; | ||
5007 | |||
5008 | if (!wq->wq_dev) | ||
5009 | return; | ||
5010 | |||
5011 | wq->wq_dev = NULL; | ||
5012 | device_unregister(&wq_dev->dev); | ||
5013 | } | ||
5014 | #else /* CONFIG_SYSFS */ | ||
5015 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } | ||
5016 | #endif /* CONFIG_SYSFS */ | ||
5017 | |||
4837 | static void __init wq_numa_init(void) | 5018 | static void __init wq_numa_init(void) |
4838 | { | 5019 | { |
4839 | cpumask_var_t *tbl; | 5020 | cpumask_var_t *tbl; |