Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cpuset.c                      |  9
-rw-r--r--  kernel/events/core.c                 | 12
-rw-r--r--  kernel/irq/manage.c                  |  7
-rw-r--r--  kernel/irq/pm.c                      |  7
-rw-r--r--  kernel/livepatch/core.c              | 33
-rw-r--r--  kernel/locking/lockdep.c             | 81
-rw-r--r--  kernel/module.c                      | 12
-rw-r--r--  kernel/printk/console_cmdline.h      |  2
-rw-r--r--  kernel/printk/printk.c               |  1
-rw-r--r--  kernel/sched/core.c                  |  2
-rw-r--r--  kernel/sched/fair.c                  |  8
-rw-r--r--  kernel/sched/idle.c                  | 54
-rw-r--r--  kernel/time/tick-broadcast-hrtimer.c | 11
-rw-r--r--  kernel/trace/ftrace.c                | 40
-rw-r--r--  kernel/workqueue.c                   | 56
15 files changed, 250 insertions(+), 85 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1d1fe9361d29..fc7f4748d34a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -548,9 +548,6 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
 
 	rcu_read_lock();
 	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
-		if (cp == root_cs)
-			continue;
-
 		/* skip the whole subtree if @cp doesn't have any CPU */
 		if (cpumask_empty(cp->cpus_allowed)) {
 			pos_css = css_rightmost_descendant(pos_css);
@@ -873,7 +870,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 		 * If it becomes empty, inherit the effective mask of the
 		 * parent, which is guaranteed to have some CPUs.
 		 */
-		if (cpumask_empty(new_cpus))
+		if (cgroup_on_dfl(cp->css.cgroup) && cpumask_empty(new_cpus))
 			cpumask_copy(new_cpus, parent->effective_cpus);
 
 		/* Skip the whole subtree if the cpumask remains the same. */
@@ -1129,7 +1126,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
 		 * If it becomes empty, inherit the effective mask of the
 		 * parent, which is guaranteed to have some MEMs.
 		 */
-		if (nodes_empty(*new_mems))
+		if (cgroup_on_dfl(cp->css.cgroup) && nodes_empty(*new_mems))
 			*new_mems = parent->effective_mems;
 
 		/* Skip the whole subtree if the nodemask remains the same. */
@@ -1979,7 +1976,9 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 
 	spin_lock_irq(&callback_lock);
 	cs->mems_allowed = parent->mems_allowed;
+	cs->effective_mems = parent->mems_allowed;
 	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
+	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
 	spin_unlock_irq(&callback_lock);
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f04daabfd1cf..2fabc0627165 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3591,7 +3591,7 @@ static void put_event(struct perf_event *event)
 	ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
 	WARN_ON_ONCE(ctx->parent_ctx);
 	perf_remove_from_context(event, true);
-	mutex_unlock(&ctx->mutex);
+	perf_event_ctx_unlock(event, ctx);
 
 	_free_event(event);
 }
@@ -4574,6 +4574,13 @@ static void perf_pending_event(struct irq_work *entry)
 {
 	struct perf_event *event = container_of(entry,
 			struct perf_event, pending);
+	int rctx;
+
+	rctx = perf_swevent_get_recursion_context();
+	/*
+	 * If we 'fail' here, that's OK, it means recursion is already disabled
+	 * and we won't recurse 'further'.
+	 */
 
 	if (event->pending_disable) {
 		event->pending_disable = 0;
@@ -4584,6 +4591,9 @@ static void perf_pending_event(struct irq_work *entry)
 		event->pending_wakeup = 0;
 		perf_event_wakeup(event);
 	}
+
+	if (rctx >= 0)
+		perf_swevent_put_recursion_context(rctx);
 }
 
 /*
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 196a06fbc122..886d09e691d5 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1474,8 +1474,13 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
 	 * otherwise we'll have trouble later trying to figure out
 	 * which interrupt is which (messes up the interrupt freeing
 	 * logic etc).
+	 *
+	 * Also IRQF_COND_SUSPEND only makes sense for shared interrupts and
+	 * it cannot be set along with IRQF_NO_SUSPEND.
 	 */
-	if ((irqflags & IRQF_SHARED) && !dev_id)
+	if (((irqflags & IRQF_SHARED) && !dev_id) ||
+	    (!(irqflags & IRQF_SHARED) && (irqflags & IRQF_COND_SUSPEND)) ||
+	    ((irqflags & IRQF_NO_SUSPEND) && (irqflags & IRQF_COND_SUSPEND)))
 		return -EINVAL;
 
 	desc = irq_to_desc(irq);
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 3ca532592704..5204a6d1b985 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -43,9 +43,12 @@ void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action)
 
 	if (action->flags & IRQF_NO_SUSPEND)
 		desc->no_suspend_depth++;
+	else if (action->flags & IRQF_COND_SUSPEND)
+		desc->cond_suspend_depth++;
 
 	WARN_ON_ONCE(desc->no_suspend_depth &&
-		     desc->no_suspend_depth != desc->nr_actions);
+		     (desc->no_suspend_depth +
+		      desc->cond_suspend_depth) != desc->nr_actions);
 }
 
 /*
@@ -61,6 +64,8 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action)
 
 	if (action->flags & IRQF_NO_SUSPEND)
 		desc->no_suspend_depth--;
+	else if (action->flags & IRQF_COND_SUSPEND)
+		desc->cond_suspend_depth--;
 }
 
 static bool suspend_device_irq(struct irq_desc *desc, int irq)
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 782172f073c5..3f9f1d6b4c2e 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -89,16 +89,28 @@ static bool klp_is_object_loaded(struct klp_object *obj)
 /* sets obj->mod if object is not vmlinux and module is found */
 static void klp_find_object_module(struct klp_object *obj)
 {
+	struct module *mod;
+
 	if (!klp_is_module(obj))
 		return;
 
 	mutex_lock(&module_mutex);
 	/*
-	 * We don't need to take a reference on the module here because we have
-	 * the klp_mutex, which is also taken by the module notifier. This
-	 * prevents any module from unloading until we release the klp_mutex.
+	 * We do not want to block removal of patched modules and therefore
+	 * we do not take a reference here. The patches are removed by
+	 * a going module handler instead.
+	 */
+	mod = find_module(obj->name);
+	/*
+	 * Do not mess work of the module coming and going notifiers.
+	 * Note that the patch might still be needed before the going handler
+	 * is called. Module functions can be called even in the GOING state
+	 * until mod->exit() finishes. This is especially important for
+	 * patches that modify semantic of the functions.
 	 */
-	obj->mod = find_module(obj->name);
+	if (mod && mod->klp_alive)
+		obj->mod = mod;
+
 	mutex_unlock(&module_mutex);
 }
 
@@ -248,11 +260,12 @@ static int klp_find_external_symbol(struct module *pmod, const char *name,
 	/* first, check if it's an exported symbol */
 	preempt_disable();
 	sym = find_symbol(name, NULL, NULL, true, true);
-	preempt_enable();
 	if (sym) {
 		*addr = sym->value;
+		preempt_enable();
 		return 0;
 	}
+	preempt_enable();
 
 	/* otherwise check if it's in another .o within the patch module */
 	return klp_find_object_symbol(pmod->name, name, addr);
@@ -766,6 +779,7 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
 		return -EINVAL;
 
 	obj->state = KLP_DISABLED;
+	obj->mod = NULL;
 
 	klp_find_object_module(obj);
 
@@ -960,6 +974,15 @@ static int klp_module_notify(struct notifier_block *nb, unsigned long action,
 
 	mutex_lock(&klp_mutex);
 
+	/*
+	 * Each module has to know that the notifier has been called.
+	 * We never know what module will get patched by a new patch.
+	 */
+	if (action == MODULE_STATE_COMING)
+		mod->klp_alive = true;
+	else /* MODULE_STATE_GOING */
+		mod->klp_alive = false;
+
 	list_for_each_entry(patch, &klp_patches, list) {
 		for (obj = patch->objs; obj->funcs; obj++) {
 			if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 88d0d4420ad2..ba77ab5f64dd 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -633,7 +633,7 @@ static int count_matching_names(struct lock_class *new_class)
 	if (!new_class->name)
 		return 0;
 
-	list_for_each_entry(class, &all_lock_classes, lock_entry) {
+	list_for_each_entry_rcu(class, &all_lock_classes, lock_entry) {
 		if (new_class->key - new_class->subclass == class->key)
 			return class->name_version;
 		if (class->name && !strcmp(class->name, new_class->name))
@@ -700,10 +700,12 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 	hash_head = classhashentry(key);
 
 	/*
-	 * We can walk the hash lockfree, because the hash only
-	 * grows, and we are careful when adding entries to the end:
+	 * We do an RCU walk of the hash, see lockdep_free_key_range().
 	 */
-	list_for_each_entry(class, hash_head, hash_entry) {
+	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+		return NULL;
+
+	list_for_each_entry_rcu(class, hash_head, hash_entry) {
 		if (class->key == key) {
 			/*
 			 * Huh! same key, different name? Did someone trample
@@ -728,7 +730,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	struct lockdep_subclass_key *key;
 	struct list_head *hash_head;
 	struct lock_class *class;
-	unsigned long flags;
+
+	DEBUG_LOCKS_WARN_ON(!irqs_disabled());
 
 	class = look_up_lock_class(lock, subclass);
 	if (likely(class))
@@ -750,28 +753,26 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	key = lock->key->subkeys + subclass;
 	hash_head = classhashentry(key);
 
-	raw_local_irq_save(flags);
 	if (!graph_lock()) {
-		raw_local_irq_restore(flags);
 		return NULL;
 	}
 	/*
 	 * We have to do the hash-walk again, to avoid races
 	 * with another CPU:
 	 */
-	list_for_each_entry(class, hash_head, hash_entry)
+	list_for_each_entry_rcu(class, hash_head, hash_entry) {
 		if (class->key == key)
 			goto out_unlock_set;
+	}
+
 	/*
 	 * Allocate a new key from the static array, and add it to
 	 * the hash:
 	 */
 	if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
 		if (!debug_locks_off_graph_unlock()) {
-			raw_local_irq_restore(flags);
 			return NULL;
 		}
-		raw_local_irq_restore(flags);
 
 		print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");
 		dump_stack();
@@ -798,7 +799,6 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 
 	if (verbose(class)) {
 		graph_unlock();
-		raw_local_irq_restore(flags);
 
 		printk("\nnew class %p: %s", class->key, class->name);
 		if (class->name_version > 1)
@@ -806,15 +806,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 		printk("\n");
 		dump_stack();
 
-		raw_local_irq_save(flags);
 		if (!graph_lock()) {
-			raw_local_irq_restore(flags);
 			return NULL;
 		}
 	}
 out_unlock_set:
 	graph_unlock();
-	raw_local_irq_restore(flags);
 
 out_set_class_cache:
 	if (!subclass || force)
@@ -870,11 +867,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
 	entry->distance = distance;
 	entry->trace = *trace;
 	/*
-	 * Since we never remove from the dependency list, the list can
-	 * be walked lockless by other CPUs, it's only allocation
-	 * that must be protected by the spinlock. But this also means
-	 * we must make new entries visible only once writes to the
-	 * entry become visible - hence the RCU op:
+	 * Both allocation and removal are done under the graph lock; but
+	 * iteration is under RCU-sched; see look_up_lock_class() and
+	 * lockdep_free_key_range().
 	 */
 	list_add_tail_rcu(&entry->entry, head);
 
@@ -1025,7 +1020,9 @@ static int __bfs(struct lock_list *source_entry,
 		else
 			head = &lock->class->locks_before;
 
-		list_for_each_entry(entry, head, entry) {
+		DEBUG_LOCKS_WARN_ON(!irqs_disabled());
+
+		list_for_each_entry_rcu(entry, head, entry) {
 			if (!lock_accessed(entry)) {
 				unsigned int cq_depth;
 				mark_lock_accessed(entry, lock);
@@ -2022,7 +2019,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
 	 * We can walk it lock-free, because entries only get added
 	 * to the hash:
 	 */
-	list_for_each_entry(chain, hash_head, entry) {
+	list_for_each_entry_rcu(chain, hash_head, entry) {
 		if (chain->chain_key == chain_key) {
 cache_hit:
 			debug_atomic_inc(chain_lookup_hits);
@@ -2996,8 +2993,18 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
 	if (unlikely(!debug_locks))
 		return;
 
-	if (subclass)
+	if (subclass) {
+		unsigned long flags;
+
+		if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion))
+			return;
+
+		raw_local_irq_save(flags);
+		current->lockdep_recursion = 1;
 		register_lock_class(lock, subclass, 1);
+		current->lockdep_recursion = 0;
+		raw_local_irq_restore(flags);
+	}
 }
 EXPORT_SYMBOL_GPL(lockdep_init_map);
 
@@ -3887,9 +3894,17 @@ static inline int within(const void *addr, void *start, unsigned long size)
 	return addr >= start && addr < start + size;
 }
 
+/*
+ * Used in module.c to remove lock classes from memory that is going to be
+ * freed; and possibly re-used by other modules.
+ *
+ * We will have had one sync_sched() before getting here, so we're guaranteed
+ * nobody will look up these exact classes -- they're properly dead but still
+ * allocated.
+ */
 void lockdep_free_key_range(void *start, unsigned long size)
 {
-	struct lock_class *class, *next;
+	struct lock_class *class;
 	struct list_head *head;
 	unsigned long flags;
 	int i;
@@ -3905,7 +3920,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
 		head = classhash_table + i;
 		if (list_empty(head))
 			continue;
-		list_for_each_entry_safe(class, next, head, hash_entry) {
+		list_for_each_entry_rcu(class, head, hash_entry) {
 			if (within(class->key, start, size))
 				zap_class(class);
 			else if (within(class->name, start, size))
@@ -3916,11 +3931,25 @@ void lockdep_free_key_range(void *start, unsigned long size)
 	if (locked)
 		graph_unlock();
 	raw_local_irq_restore(flags);
+
+	/*
+	 * Wait for any possible iterators from look_up_lock_class() to pass
+	 * before continuing to free the memory they refer to.
+	 *
+	 * sync_sched() is sufficient because the read-side is IRQ disable.
+	 */
+	synchronize_sched();
+
+	/*
+	 * XXX at this point we could return the resources to the pool;
+	 * instead we leak them. We would need to change to bitmap allocators
+	 * instead of the linear allocators we have now.
+	 */
 }
 
 void lockdep_reset_lock(struct lockdep_map *lock)
 {
-	struct lock_class *class, *next;
+	struct lock_class *class;
 	struct list_head *head;
 	unsigned long flags;
 	int i, j;
@@ -3948,7 +3977,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
 		head = classhash_table + i;
 		if (list_empty(head))
 			continue;
-		list_for_each_entry_safe(class, next, head, hash_entry) {
+		list_for_each_entry_rcu(class, head, hash_entry) {
 			int match = 0;
 
 			for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
diff --git a/kernel/module.c b/kernel/module.c
index b34813f725e9..99fdf94efce8 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -56,7 +56,6 @@
 #include <linux/async.h>
 #include <linux/percpu.h>
 #include <linux/kmemleak.h>
-#include <linux/kasan.h>
 #include <linux/jump_label.h>
 #include <linux/pfn.h>
 #include <linux/bsearch.h>
@@ -1814,7 +1813,6 @@ static void unset_module_init_ro_nx(struct module *mod) { }
 void __weak module_memfree(void *module_region)
 {
 	vfree(module_region);
-	kasan_module_free(module_region);
 }
 
 void __weak module_arch_cleanup(struct module *mod)
@@ -1867,7 +1865,7 @@ static void free_module(struct module *mod)
 	kfree(mod->args);
 	percpu_modfree(mod);
 
-	/* Free lock-classes: */
+	/* Free lock-classes; relies on the preceding sync_rcu(). */
 	lockdep_free_key_range(mod->module_core, mod->core_size);
 
 	/* Finally, free the core (containing the module structure) */
@@ -2313,11 +2311,13 @@ static void layout_symtab(struct module *mod, struct load_info *info)
 	info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
 	info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
 	mod->core_size += strtab_size;
+	mod->core_size = debug_align(mod->core_size);
 
 	/* Put string table section at end of init part of module. */
 	strsect->sh_flags |= SHF_ALLOC;
 	strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
 					 info->index.str) | INIT_OFFSET_MASK;
+	mod->init_size = debug_align(mod->init_size);
 	pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
 }
 
@@ -3349,9 +3349,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
 	module_bug_cleanup(mod);
 	mutex_unlock(&module_mutex);
 
-	/* Free lock-classes: */
-	lockdep_free_key_range(mod->module_core, mod->core_size);
-
 	/* we can't deallocate the module until we clear memory protection */
 	unset_module_init_ro_nx(mod);
 	unset_module_core_ro_nx(mod);
@@ -3375,6 +3372,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
 	synchronize_rcu();
 	mutex_unlock(&module_mutex);
  free_module:
+	/* Free lock-classes; relies on the preceding sync_rcu() */
+	lockdep_free_key_range(mod->module_core, mod->core_size);
+
 	module_deallocate(mod, info);
  free_copy:
 	free_copy(info);
diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h
index cbd69d842341..2ca4a8b5fe57 100644
--- a/kernel/printk/console_cmdline.h
+++ b/kernel/printk/console_cmdline.h
@@ -3,7 +3,7 @@
 
 struct console_cmdline
 {
-	char name[8];		/* Name of the driver */
+	char name[16];		/* Name of the driver */
 	int index;		/* Minor dev. to use */
 	char *options;		/* Options for the driver */
 #ifdef CONFIG_A11Y_BRAILLE_CONSOLE
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 01cfd69c54c6..bb0635bd74f2 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2464,6 +2464,7 @@ void register_console(struct console *newcon)
 	for (i = 0, c = console_cmdline;
 	     i < MAX_CMDLINECONSOLES && c->name[0];
 	     i++, c++) {
+		BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name));
 		if (strcmp(c->name, newcon->name) != 0)
 			continue;
 		if (newcon->index >= 0 &&
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f0f831e8a345..62671f53202a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3034,6 +3034,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	} else {
 		if (dl_prio(oldprio))
 			p->dl.dl_boosted = 0;
+		if (rt_prio(oldprio))
+			p->rt.timeout = 0;
 		p->sched_class = &fair_sched_class;
 	}
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7ce18f3c097a..bcfe32088b37 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1609,9 +1609,11 @@ static void update_task_scan_period(struct task_struct *p,
 	/*
 	 * If there were no record hinting faults then either the task is
 	 * completely idle or all activity is areas that are not of interest
-	 * to automatic numa balancing. Scan slower
+	 * to automatic numa balancing. Related to that, if there were failed
+	 * migration then it implies we are migrating too quickly or the local
+	 * node is overloaded. In either case, scan slower
 	 */
-	if (local + shared == 0) {
+	if (local + shared == 0 || p->numa_faults_locality[2]) {
 		p->numa_scan_period = min(p->numa_scan_period_max,
 			p->numa_scan_period << 1);
 
@@ -2080,6 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 
 	if (migrated)
 		p->numa_pages_migrated += pages;
+	if (flags & TNF_MIGRATE_FAIL)
+		p->numa_faults_locality[2] += pages;
 
 	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
 	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 94b2d7b88a27..80014a178342 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -82,6 +82,7 @@ static void cpuidle_idle_call(void)
 	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
 	int next_state, entered_state;
 	unsigned int broadcast;
+	bool reflect;
 
 	/*
 	 * Check if the idle task must be rescheduled. If it is the
@@ -105,6 +106,9 @@ static void cpuidle_idle_call(void)
 	 */
 	rcu_idle_enter();
 
+	if (cpuidle_not_available(drv, dev))
+		goto use_default;
+
 	/*
 	 * Suspend-to-idle ("freeze") is a system state in which all user space
 	 * has been frozen, all I/O devices have been suspended and the only
@@ -115,30 +119,24 @@ static void cpuidle_idle_call(void)
 	 * until a proper wakeup interrupt happens.
 	 */
 	if (idle_should_freeze()) {
-		cpuidle_enter_freeze();
-		local_irq_enable();
-		goto exit_idle;
-	}
+		entered_state = cpuidle_enter_freeze(drv, dev);
+		if (entered_state >= 0) {
+			local_irq_enable();
+			goto exit_idle;
+		}
 
-	/*
-	 * Ask the cpuidle framework to choose a convenient idle state.
-	 * Fall back to the default arch idle method on errors.
-	 */
-	next_state = cpuidle_select(drv, dev);
-	if (next_state < 0) {
-use_default:
-		/*
-		 * We can't use the cpuidle framework, let's use the default
-		 * idle routine.
-		 */
-		if (current_clr_polling_and_test())
-			local_irq_enable();
-		else
-			arch_cpu_idle();
-
-		goto exit_idle;
-	}
-
+		reflect = false;
+		next_state = cpuidle_find_deepest_state(drv, dev);
+	} else {
+		reflect = true;
+		/*
+		 * Ask the cpuidle framework to choose a convenient idle state.
+		 */
+		next_state = cpuidle_select(drv, dev);
+	}
+	/* Fall back to the default arch idle method on errors. */
+	if (next_state < 0)
+		goto use_default;
 
 	/*
 	 * The idle task must be scheduled, it is pointless to
@@ -183,7 +181,8 @@ use_default:
 	/*
 	 * Give the governor an opportunity to reflect on the outcome
 	 */
-	cpuidle_reflect(dev, entered_state);
+	if (reflect)
+		cpuidle_reflect(dev, entered_state);
 
 exit_idle:
 	__current_set_polling();
@@ -196,6 +195,19 @@ exit_idle:
 
 	rcu_idle_exit();
 	start_critical_timings();
+	return;
+
+use_default:
+	/*
+	 * We can't use the cpuidle framework, let's use the default
+	 * idle routine.
+	 */
+	if (current_clr_polling_and_test())
+		local_irq_enable();
+	else
+		arch_cpu_idle();
+
+	goto exit_idle;
 }
 
 /*
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index eb682d5c697c..6aac4beedbbe 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -49,6 +49,7 @@ static void bc_set_mode(enum clock_event_mode mode,
  */
 static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
 {
+	int bc_moved;
 	/*
 	 * We try to cancel the timer first. If the callback is on
 	 * flight on some other cpu then we let it handle it. If we
@@ -60,9 +61,15 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
 	 * restart the timer because we are in the callback, but we
 	 * can set the expiry time and let the callback return
 	 * HRTIMER_RESTART.
+	 *
+	 * Since we are in the idle loop at this point and because
+	 * hrtimer_{start/cancel} functions call into tracing,
+	 * calls to these functions must be bound within RCU_NONIDLE.
 	 */
-	if (hrtimer_try_to_cancel(&bctimer) >= 0) {
-		hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED);
+	RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ?
+		!hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) :
+			0);
+	if (bc_moved) {
 		/* Bind the "device" to the cpu */
 		bc->bound_on = smp_processor_id();
 	} else if (bc->bound_on == smp_processor_id()) {
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 45e5cb143d17..4f228024055b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1059,6 +1059,12 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
 
 static struct pid * const ftrace_swapper_pid = &init_struct_pid;
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int ftrace_graph_active;
+#else
+# define ftrace_graph_active 0
+#endif
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 static struct ftrace_ops *removed_ops;
@@ -2041,8 +2047,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
 		if (!ftrace_rec_count(rec))
 			rec->flags = 0;
 		else
-			/* Just disable the record (keep REGS state) */
-			rec->flags &= ~FTRACE_FL_ENABLED;
+			/*
+			 * Just disable the record, but keep the ops TRAMP
+			 * and REGS states. The _EN flags must be disabled though.
+			 */
+			rec->flags &= ~(FTRACE_FL_ENABLED | FTRACE_FL_TRAMP_EN |
+					FTRACE_FL_REGS_EN);
 	}
 
 	return FTRACE_UPDATE_MAKE_NOP;
@@ -2688,24 +2698,36 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
 
 static void ftrace_startup_sysctl(void)
 {
+	int command;
+
 	if (unlikely(ftrace_disabled))
 		return;
 
 	/* Force update next time */
 	saved_ftrace_func = NULL;
 	/* ftrace_start_up is true if we want ftrace running */
-	if (ftrace_start_up)
-		ftrace_run_update_code(FTRACE_UPDATE_CALLS);
+	if (ftrace_start_up) {
+		command = FTRACE_UPDATE_CALLS;
+		if (ftrace_graph_active)
+			command |= FTRACE_START_FUNC_RET;
+		ftrace_startup_enable(command);
+	}
 }
 
 static void ftrace_shutdown_sysctl(void)
 {
+	int command;
+
 	if (unlikely(ftrace_disabled))
 		return;
 
 	/* ftrace_start_up is true if ftrace is running */
-	if (ftrace_start_up)
-		ftrace_run_update_code(FTRACE_DISABLE_CALLS);
+	if (ftrace_start_up) {
+		command = FTRACE_DISABLE_CALLS;
+		if (ftrace_graph_active)
+			command |= FTRACE_STOP_FUNC_RET;
+		ftrace_run_update_code(command);
+	}
 }
 
 static cycle_t ftrace_update_time;
@@ -5558,12 +5580,12 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 
 	if (ftrace_enabled) {
 
-		ftrace_startup_sysctl();
-
 		/* we are starting ftrace again */
 		if (ftrace_ops_list != &ftrace_list_end)
 			update_ftrace_function();
 
+		ftrace_startup_sysctl();
+
 	} else {
 		/* stopping ftrace calls (just send to ftrace_stub) */
 		ftrace_trace_function = ftrace_stub;
@@ -5590,8 +5612,6 @@ static struct ftrace_ops graph_ops = {
 	ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
 };
 
-static int ftrace_graph_active;
-
 int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
 {
 	return 0;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f28849394791..41ff75b478c6 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2728,19 +2728,57 @@ bool flush_work(struct work_struct *work)
 }
 EXPORT_SYMBOL_GPL(flush_work);
 
+struct cwt_wait {
+	wait_queue_t		wait;
+	struct work_struct	*work;
+};
+
+static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait);
+
+	if (cwait->work != key)
+		return 0;
+	return autoremove_wake_function(wait, mode, sync, key);
+}
+
 static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
 {
+	static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq);
 	unsigned long flags;
 	int ret;
 
 	do {
 		ret = try_to_grab_pending(work, is_dwork, &flags);
 		/*
-		 * If someone else is canceling, wait for the same event it
-		 * would be waiting for before retrying.
+		 * If someone else is already canceling, wait for it to
+		 * finish. flush_work() doesn't work for PREEMPT_NONE
+		 * because we may get scheduled between @work's completion
+		 * and the other canceling task resuming and clearing
+		 * CANCELING - flush_work() will return false immediately
+		 * as @work is no longer busy, try_to_grab_pending() will
+		 * return -ENOENT as @work is still being canceled and the
+		 * other canceling task won't be able to clear CANCELING as
+		 * we're hogging the CPU.
+		 *
+		 * Let's wait for completion using a waitqueue. As this
+		 * may lead to the thundering herd problem, use a custom
+		 * wake function which matches @work along with exclusive
+		 * wait and wakeup.
 		 */
-		if (unlikely(ret == -ENOENT))
-			flush_work(work);
+		if (unlikely(ret == -ENOENT)) {
+			struct cwt_wait cwait;
+
+			init_wait(&cwait.wait);
+			cwait.wait.func = cwt_wakefn;
+			cwait.work = work;
+
+			prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait,
+						  TASK_UNINTERRUPTIBLE);
+			if (work_is_canceling(work))
+				schedule();
+			finish_wait(&cancel_waitq, &cwait.wait);
+		}
 	} while (unlikely(ret < 0));
 
 	/* tell other tasks trying to grab @work to back off */
@@ -2749,6 +2787,16 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
 
 	flush_work(work);
 	clear_work_data(work);
+
+	/*
+	 * Paired with prepare_to_wait() above so that either
+	 * waitqueue_active() is visible here or !work_is_canceling() is
+	 * visible there.
+	 */
+	smp_mb();
+	if (waitqueue_active(&cancel_waitq))
+		__wake_up(&cancel_waitq, TASK_NORMAL, 1, work);
+
 	return ret;
 }
 