Diffstat (limited to 'kernel')
 kernel/cgroup.c           | 175
 kernel/cpuset.c           | 162
 kernel/irq_work.c         |   4
 kernel/kprobes.c          |  18
 kernel/sys_ni.c           |   2
 kernel/time/clocksource.c |   2
 kernel/time/tick-sched.c  |   2
 kernel/trace/trace.c      |   2
 kernel/workqueue.c        |  30
 9 files changed, 220 insertions(+), 177 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 136eceadeed1..bb263d0caab3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -277,6 +277,10 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
         if (!(cgrp->root->subsys_mask & (1 << ss->id)))
                 return NULL;
 
+        /*
+         * This function is used while updating css associations and thus
+         * can't test the csses directly. Use ->child_subsys_mask.
+         */
         while (cgroup_parent(cgrp) &&
                !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
                 cgrp = cgroup_parent(cgrp);
@@ -284,6 +288,39 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
         return cgroup_css(cgrp, ss);
 }
 
+/**
+ * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get the effective css of @cgrp for @ss. The effective css is
+ * defined as the matching css of the nearest ancestor including self which
+ * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
+ * the root css is returned, so this function always returns a valid css.
+ * The returned css must be put using css_put().
+ */
+struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
+                                             struct cgroup_subsys *ss)
+{
+        struct cgroup_subsys_state *css;
+
+        rcu_read_lock();
+
+        do {
+                css = cgroup_css(cgrp, ss);
+
+                if (css && css_tryget_online(css))
+                        goto out_unlock;
+                cgrp = cgroup_parent(cgrp);
+        } while (cgrp);
+
+        css = init_css_set.subsys[ss->id];
+        css_get(css);
+out_unlock:
+        rcu_read_unlock();
+        return css;
+}
+
 /* convenient tests for these bits */
 static inline bool cgroup_is_dead(const struct cgroup *cgrp)
 {
@@ -1019,31 +1056,30 @@ static void cgroup_put(struct cgroup *cgrp)
 }
 
 /**
- * cgroup_refresh_child_subsys_mask - update child_subsys_mask
+ * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
  * @cgrp: the target cgroup
+ * @subtree_control: the new subtree_control mask to consider
  *
  * On the default hierarchy, a subsystem may request other subsystems to be
  * enabled together through its ->depends_on mask. In such cases, more
  * subsystems than specified in "cgroup.subtree_control" may be enabled.
  *
- * This function determines which subsystems need to be enabled given the
- * current @cgrp->subtree_control and records it in
- * @cgrp->child_subsys_mask. The resulting mask is always a superset of
- * @cgrp->subtree_control and follows the usual hierarchy rules.
+ * This function calculates which subsystems need to be enabled if
+ * @subtree_control is to be applied to @cgrp. The returned mask is always
+ * a superset of @subtree_control and follows the usual hierarchy rules.
  */
-static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
+static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
+                                                  unsigned int subtree_control)
 {
         struct cgroup *parent = cgroup_parent(cgrp);
-        unsigned int cur_ss_mask = cgrp->subtree_control;
+        unsigned int cur_ss_mask = subtree_control;
         struct cgroup_subsys *ss;
         int ssid;
 
         lockdep_assert_held(&cgroup_mutex);
 
-        if (!cgroup_on_dfl(cgrp)) {
-                cgrp->child_subsys_mask = cur_ss_mask;
-                return;
-        }
+        if (!cgroup_on_dfl(cgrp))
+                return cur_ss_mask;
 
         while (true) {
                 unsigned int new_ss_mask = cur_ss_mask;
@@ -1067,7 +1103,20 @@ static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
                 cur_ss_mask = new_ss_mask;
         }
 
-        cgrp->child_subsys_mask = cur_ss_mask;
+        return cur_ss_mask;
+}
+
+/**
+ * cgroup_refresh_child_subsys_mask - update child_subsys_mask
+ * @cgrp: the target cgroup
+ *
+ * Update @cgrp->child_subsys_mask according to the current
+ * @cgrp->subtree_control using cgroup_calc_child_subsys_mask().
+ */
+static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
+{
+        cgrp->child_subsys_mask =
+                cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control);
 }
 
 /**
@@ -2641,7 +2690,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
                                             loff_t off)
 {
         unsigned int enable = 0, disable = 0;
-        unsigned int css_enable, css_disable, old_ctrl, new_ctrl;
+        unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
         struct cgroup *cgrp, *child;
         struct cgroup_subsys *ss;
         char *tok;
@@ -2693,36 +2742,6 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
                                 ret = -ENOENT;
                                 goto out_unlock;
                         }
-
-                        /*
-                         * @ss is already enabled through dependency and
-                         * we'll just make it visible. Skip draining.
-                         */
-                        if (cgrp->child_subsys_mask & (1 << ssid))
-                                continue;
-
-                        /*
-                         * Because css offlining is asynchronous, userland
-                         * might try to re-enable the same controller while
-                         * the previous instance is still around. In such
-                         * cases, wait till it's gone using offline_waitq.
-                         */
-                        cgroup_for_each_live_child(child, cgrp) {
-                                DEFINE_WAIT(wait);
-
-                                if (!cgroup_css(child, ss))
-                                        continue;
-
-                                cgroup_get(child);
-                                prepare_to_wait(&child->offline_waitq, &wait,
-                                                TASK_UNINTERRUPTIBLE);
-                                cgroup_kn_unlock(of->kn);
-                                schedule();
-                                finish_wait(&child->offline_waitq, &wait);
-                                cgroup_put(child);
-
-                                return restart_syscall();
-                        }
                 } else if (disable & (1 << ssid)) {
                         if (!(cgrp->subtree_control & (1 << ssid))) {
                                 disable &= ~(1 << ssid);
@@ -2758,19 +2777,48 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
          * subsystems than specified may need to be enabled or disabled
          * depending on subsystem dependencies.
          */
-        cgrp->subtree_control |= enable;
-        cgrp->subtree_control &= ~disable;
+        old_sc = cgrp->subtree_control;
+        old_ss = cgrp->child_subsys_mask;
+        new_sc = (old_sc | enable) & ~disable;
+        new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc);
 
-        old_ctrl = cgrp->child_subsys_mask;
-        cgroup_refresh_child_subsys_mask(cgrp);
-        new_ctrl = cgrp->child_subsys_mask;
-
-        css_enable = ~old_ctrl & new_ctrl;
-        css_disable = old_ctrl & ~new_ctrl;
+        css_enable = ~old_ss & new_ss;
+        css_disable = old_ss & ~new_ss;
         enable |= css_enable;
         disable |= css_disable;
 
         /*
+         * Because css offlining is asynchronous, userland might try to
+         * re-enable the same controller while the previous instance is
+         * still around. In such cases, wait till it's gone using
+         * offline_waitq.
+         */
+        for_each_subsys(ss, ssid) {
+                if (!(css_enable & (1 << ssid)))
+                        continue;
+
+                cgroup_for_each_live_child(child, cgrp) {
+                        DEFINE_WAIT(wait);
+
+                        if (!cgroup_css(child, ss))
+                                continue;
+
+                        cgroup_get(child);
+                        prepare_to_wait(&child->offline_waitq, &wait,
+                                        TASK_UNINTERRUPTIBLE);
+                        cgroup_kn_unlock(of->kn);
+                        schedule();
+                        finish_wait(&child->offline_waitq, &wait);
+                        cgroup_put(child);
+
+                        return restart_syscall();
+                }
+        }
+
+        cgrp->subtree_control = new_sc;
+        cgrp->child_subsys_mask = new_ss;
+
+        /*
          * Create new csses or make the existing ones visible. A css is
          * created invisible if it's being implicitly enabled through
          * dependency. An invisible css is made visible when the userland
@@ -2825,6 +2873,24 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
                 }
         }
 
+        /*
+         * The effective csses of all the descendants (excluding @cgrp) may
+         * have changed. Subsystems can optionally subscribe to this event
+         * by implementing ->css_e_css_changed() which is invoked if any of
+         * the effective csses seen from the css's cgroup may have changed.
+         */
+        for_each_subsys(ss, ssid) {
+                struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss);
+                struct cgroup_subsys_state *css;
+
+                if (!ss->css_e_css_changed || !this_css)
+                        continue;
+
+                css_for_each_descendant_pre(css, this_css)
+                        if (css != this_css)
+                                ss->css_e_css_changed(css);
+        }
+
         kernfs_activate(cgrp->kn);
         ret = 0;
 out_unlock:
@@ -2832,9 +2898,8 @@ out_unlock:
         return ret ?: nbytes;
 
 err_undo_css:
-        cgrp->subtree_control &= ~enable;
-        cgrp->subtree_control |= disable;
-        cgroup_refresh_child_subsys_mask(cgrp);
+        cgrp->subtree_control = old_sc;
+        cgrp->child_subsys_mask = old_ss;
 
         for_each_subsys(ss, ssid) {
                 if (!(enable & (1 << ssid)))
@@ -4370,6 +4435,8 @@ static void css_release_work_fn(struct work_struct *work)
         if (ss) {
                 /* css release path */
                 cgroup_idr_remove(&ss->css_idr, css->id);
+                if (ss->css_released)
+                        ss->css_released(css);
         } else {
                 /* cgroup release path */
                 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
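
For reference, here is a minimal sketch of how a controller might use the new cgroup_get_e_css() helper together with the ->css_released() and ->css_e_css_changed() callbacks wired up above. It assumes the matching declarations from the cgroup headers in this series; the my_ss_* names are invented for illustration and are not part of the patch.

/* Illustrative only; my_ss_* is an invented controller, not part of this patch. */
#include <linux/cgroup.h>
#include <linux/printk.h>

static void my_ss_css_released(struct cgroup_subsys_state *css)
{
        /* Invoked from css_release_work_fn() after the css id is removed. */
        pr_debug("my_ss: css released\n");
}

static void my_ss_css_e_css_changed(struct cgroup_subsys_state *css)
{
        /*
         * An effective css visible from @css's cgroup may have changed,
         * e.g. after a "cgroup.subtree_control" write in an ancestor.
         * Re-resolve it and drop the reference when done.
         */
        struct cgroup_subsys_state *e_css;

        e_css = cgroup_get_e_css(css->cgroup, css->ss);
        /* ... refresh any cached pointer to the effective css here ... */
        css_put(e_css);
}

struct cgroup_subsys my_ss_cgrp_subsys = {
        .css_released           = my_ss_css_released,
        .css_e_css_changed      = my_ss_css_e_css_changed,
        /* .css_alloc, .css_free, etc. omitted for brevity */
};

Because cgroup_get_e_css() falls back to the root css when @ss is not mounted on @cgrp's hierarchy, the caller always gets a valid, referenced css and only has to remember the css_put().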
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 723cfc9d0ad7..64b257f6bca2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -248,34 +248,34 @@ static struct cpuset top_cpuset = {
                 if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
 
 /*
- * There are two global mutexes guarding cpuset structures - cpuset_mutex
- * and callback_mutex. The latter may nest inside the former. We also
- * require taking task_lock() when dereferencing a task's cpuset pointer.
- * See "The task_lock() exception", at the end of this comment.
+ * There are two global locks guarding cpuset structures - cpuset_mutex and
+ * callback_lock. We also require taking task_lock() when dereferencing a
+ * task's cpuset pointer. See "The task_lock() exception", at the end of this
+ * comment.
  *
- * A task must hold both mutexes to modify cpusets. If a task holds
+ * A task must hold both locks to modify cpusets. If a task holds
  * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
- * is the only task able to also acquire callback_mutex and be able to
+ * is the only task able to also acquire callback_lock and be able to
  * modify cpusets. It can perform various checks on the cpuset structure
  * first, knowing nothing will change. It can also allocate memory while
  * just holding cpuset_mutex. While it is performing these checks, various
- * callback routines can briefly acquire callback_mutex to query cpusets.
- * Once it is ready to make the changes, it takes callback_mutex, blocking
+ * callback routines can briefly acquire callback_lock to query cpusets.
+ * Once it is ready to make the changes, it takes callback_lock, blocking
  * everyone else.
  *
  * Calls to the kernel memory allocator can not be made while holding
- * callback_mutex, as that would risk double tripping on callback_mutex
+ * callback_lock, as that would risk double tripping on callback_lock
  * from one of the callbacks into the cpuset code from within
  * __alloc_pages().
  *
- * If a task is only holding callback_mutex, then it has read-only
+ * If a task is only holding callback_lock, then it has read-only
  * access to cpusets.
  *
  * Now, the task_struct fields mems_allowed and mempolicy may be changed
  * by other task, we use alloc_lock in the task_struct fields to protect
  * them.
  *
- * The cpuset_common_file_read() handlers only hold callback_mutex across
+ * The cpuset_common_file_read() handlers only hold callback_lock across
  * small pieces of code, such as when reading out possibly multi-word
  * cpumasks and nodemasks.
  *
@@ -284,7 +284,7 @@ static struct cpuset top_cpuset = {
  */
 
 static DEFINE_MUTEX(cpuset_mutex);
-static DEFINE_MUTEX(callback_mutex);
+static DEFINE_SPINLOCK(callback_lock);
 
 /*
  * CPU / memory hotplug is handled asynchronously.
@@ -329,7 +329,7 @@ static struct file_system_type cpuset_fs_type = {
  * One way or another, we guarantee to return some non-empty subset
  * of cpu_online_mask.
  *
- * Call with callback_mutex held.
+ * Call with callback_lock or cpuset_mutex held.
  */
 static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
 {
@@ -347,7 +347,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
  * One way or another, we guarantee to return some non-empty subset
  * of node_states[N_MEMORY].
  *
- * Call with callback_mutex held.
+ * Call with callback_lock or cpuset_mutex held.
  */
 static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
 {
@@ -359,7 +359,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
 /*
  * update task's spread flag if cpuset's page/slab spread flag is set
  *
- * Called with callback_mutex/cpuset_mutex held
+ * Call with callback_lock or cpuset_mutex held.
  */
 static void cpuset_update_task_spread_flag(struct cpuset *cs,
                                            struct task_struct *tsk)
@@ -886,9 +886,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
                         continue;
                 rcu_read_unlock();
 
-                mutex_lock(&callback_mutex);
+                spin_lock_irq(&callback_lock);
                 cpumask_copy(cp->effective_cpus, new_cpus);
-                mutex_unlock(&callback_mutex);
+                spin_unlock_irq(&callback_lock);
 
                 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
                         !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -953,9 +953,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
         if (retval < 0)
                 return retval;
 
-        mutex_lock(&callback_mutex);
+        spin_lock_irq(&callback_lock);
         cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
-        mutex_unlock(&callback_mutex);
+        spin_unlock_irq(&callback_lock);
 
         /* use trialcs->cpus_allowed as a temp variable */
         update_cpumasks_hier(cs, trialcs->cpus_allowed);
@@ -1142,9 +1142,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
                         continue;
                 rcu_read_unlock();
 
-                mutex_lock(&callback_mutex);
+                spin_lock_irq(&callback_lock);
                 cp->effective_mems = *new_mems;
-                mutex_unlock(&callback_mutex);
+                spin_unlock_irq(&callback_lock);
 
                 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
                         !nodes_equal(cp->mems_allowed, cp->effective_mems));
@@ -1165,7 +1165,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
  * mempolicies and if the cpuset is marked 'memory_migrate',
  * migrate the tasks pages to the new memory.
  *
- * Call with cpuset_mutex held. May take callback_mutex during call.
+ * Call with cpuset_mutex held. May take callback_lock during call.
  * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
  * lock each such tasks mm->mmap_sem, scan its vma's and rebind
  * their mempolicies to the cpusets new mems_allowed.
@@ -1212,9 +1212,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
         if (retval < 0)
                 goto done;
 
-        mutex_lock(&callback_mutex);
+        spin_lock_irq(&callback_lock);
         cs->mems_allowed = trialcs->mems_allowed;
-        mutex_unlock(&callback_mutex);
+        spin_unlock_irq(&callback_lock);
 
         /* use trialcs->mems_allowed as a temp variable */
         update_nodemasks_hier(cs, &cs->mems_allowed);
@@ -1305,9 +1305,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
         spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
                         || (is_spread_page(cs) != is_spread_page(trialcs)));
 
-        mutex_lock(&callback_mutex);
+        spin_lock_irq(&callback_lock);
         cs->flags = trialcs->flags;
-        mutex_unlock(&callback_mutex);
+        spin_unlock_irq(&callback_lock);
 
         if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
                 rebuild_sched_domains_locked();
@@ -1714,7 +1714,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
         count = seq_get_buf(sf, &buf);
         s = buf;
 
-        mutex_lock(&callback_mutex);
+        spin_lock_irq(&callback_lock);
 
         switch (type) {
         case FILE_CPULIST:
@@ -1741,7 +1741,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
                 seq_commit(sf, -1);
         }
 out_unlock:
-        mutex_unlock(&callback_mutex);
+        spin_unlock_irq(&callback_lock);
         return ret;
 }
 
@@ -1958,12 +1958,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 
         cpuset_inc();
 
-        mutex_lock(&callback_mutex);
+        spin_lock_irq(&callback_lock);
         if (cgroup_on_dfl(cs->css.cgroup)) {
                 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
                 cs->effective_mems = parent->effective_mems;
         }
-        mutex_unlock(&callback_mutex);
+        spin_unlock_irq(&callback_lock);
 
         if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
                 goto out_unlock;
@@ -1990,10 +1990,10 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
         }
         rcu_read_unlock();
 
-        mutex_lock(&callback_mutex);
+        spin_lock_irq(&callback_lock);
         cs->mems_allowed = parent->mems_allowed;
         cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
-        mutex_unlock(&callback_mutex);
+        spin_unlock_irq(&callback_lock);
 out_unlock:
         mutex_unlock(&cpuset_mutex);
         return 0;
@@ -2032,7 +2032,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
 static void cpuset_bind(struct cgroup_subsys_state *root_css)
 {
         mutex_lock(&cpuset_mutex);
-        mutex_lock(&callback_mutex);
+        spin_lock_irq(&callback_lock);
 
         if (cgroup_on_dfl(root_css->cgroup)) {
                 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
@@ -2043,7 +2043,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
                 top_cpuset.mems_allowed = top_cpuset.effective_mems;
         }
 
-        mutex_unlock(&callback_mutex);
+        spin_unlock_irq(&callback_lock);
         mutex_unlock(&cpuset_mutex);
 }
 
@@ -2128,12 +2128,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
 {
         bool is_empty;
 
-        mutex_lock(&callback_mutex);
+        spin_lock_irq(&callback_lock);
         cpumask_copy(cs->cpus_allowed, new_cpus);
         cpumask_copy(cs->effective_cpus, new_cpus);
         cs->mems_allowed = *new_mems;
         cs->effective_mems = *new_mems;
-        mutex_unlock(&callback_mutex);
+        spin_unlock_irq(&callback_lock);
 
         /*
          * Don't call update_tasks_cpumask() if the cpuset becomes empty,
@@ -2170,10 +2170,10 @@ hotplug_update_tasks(struct cpuset *cs,
         if (nodes_empty(*new_mems))
                 *new_mems = parent_cs(cs)->effective_mems;
 
-        mutex_lock(&callback_mutex);
+        spin_lock_irq(&callback_lock);
         cpumask_copy(cs->effective_cpus, new_cpus);
         cs->effective_mems = *new_mems;
-        mutex_unlock(&callback_mutex);
+        spin_unlock_irq(&callback_lock);
 
         if (cpus_updated)
                 update_tasks_cpumask(cs);
@@ -2259,21 +2259,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 
         /* synchronize cpus_allowed to cpu_active_mask */
         if (cpus_updated) {
-                mutex_lock(&callback_mutex);
+                spin_lock_irq(&callback_lock);
                 if (!on_dfl)
                         cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
                 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
-                mutex_unlock(&callback_mutex);
+                spin_unlock_irq(&callback_lock);
                 /* we don't mess with cpumasks of tasks in top_cpuset */
         }
 
         /* synchronize mems_allowed to N_MEMORY */
         if (mems_updated) {
-                mutex_lock(&callback_mutex);
+                spin_lock_irq(&callback_lock);
                 if (!on_dfl)
                         top_cpuset.mems_allowed = new_mems;
                 top_cpuset.effective_mems = new_mems;
-                mutex_unlock(&callback_mutex);
+                spin_unlock_irq(&callback_lock);
                 update_tasks_nodemask(&top_cpuset);
         }
 
@@ -2366,11 +2366,13 @@ void __init cpuset_init_smp(void)
 
 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 {
-        mutex_lock(&callback_mutex);
+        unsigned long flags;
+
+        spin_lock_irqsave(&callback_lock, flags);
         rcu_read_lock();
         guarantee_online_cpus(task_cs(tsk), pmask);
         rcu_read_unlock();
-        mutex_unlock(&callback_mutex);
+        spin_unlock_irqrestore(&callback_lock, flags);
 }
 
 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2376void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2378void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
@@ -2416,12 +2418,13 @@ void cpuset_init_current_mems_allowed(void)
 nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
 {
         nodemask_t mask;
+        unsigned long flags;
 
-        mutex_lock(&callback_mutex);
+        spin_lock_irqsave(&callback_lock, flags);
         rcu_read_lock();
         guarantee_online_mems(task_cs(tsk), &mask);
         rcu_read_unlock();
-        mutex_unlock(&callback_mutex);
+        spin_unlock_irqrestore(&callback_lock, flags);
 
         return mask;
 }
@@ -2440,7 +2443,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 /*
  * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
  * mem_hardwall ancestor to the specified cpuset. Call holding
- * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
+ * callback_lock. If no ancestor is mem_exclusive or mem_hardwall
  * (an unusual configuration), then returns the root cpuset.
  */
 static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
@@ -2451,7 +2454,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
 }
 
 /**
- * cpuset_node_allowed_softwall - Can we allocate on a memory node?
+ * cpuset_node_allowed - Can we allocate on a memory node?
  * @node: is this an allowed node?
  * @gfp_mask: memory allocation flags
  *
@@ -2463,13 +2466,6 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
  * flag, yes.
  * Otherwise, no.
  *
- * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
- * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall()
- * might sleep, and might allow a node from an enclosing cpuset.
- *
- * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
- * cpusets, and never sleeps.
- *
  * The __GFP_THISNODE placement logic is really handled elsewhere,
  * by forcibly using a zonelist starting at a specified node, and by
  * (in get_page_from_freelist()) refusing to consider the zones for
@@ -2482,13 +2478,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
  * GFP_KERNEL allocations are not so marked, so can escape to the
  * nearest enclosing hardwalled ancestor cpuset.
  *
- * Scanning up parent cpusets requires callback_mutex. The
+ * Scanning up parent cpusets requires callback_lock. The
  * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
  * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
  * current tasks mems_allowed came up empty on the first pass over
  * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
- * cpuset are short of memory, might require taking the callback_mutex
- * mutex.
+ * cpuset are short of memory, might require taking the callback_lock.
  *
  * The first call here from mm/page_alloc:get_page_from_freelist()
  * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
@@ -2505,20 +2500,15 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
  * TIF_MEMDIE   - any node ok
  * GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
  * GFP_USER     - only nodes in current tasks mems allowed ok.
- *
- * Rule:
- *    Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
- *    pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
- *    the code that might scan up ancestor cpusets and sleep.
  */
-int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
+int __cpuset_node_allowed(int node, gfp_t gfp_mask)
 {
         struct cpuset *cs;              /* current cpuset ancestors */
         int allowed;                    /* is allocation in zone z allowed? */
+        unsigned long flags;
 
         if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
                 return 1;
-        might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
         if (node_isset(node, current->mems_allowed))
                 return 1;
         /*
@@ -2534,55 +2524,17 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
                 return 1;
 
         /* Not hardwall and node outside mems_allowed: scan up cpusets */
-        mutex_lock(&callback_mutex);
+        spin_lock_irqsave(&callback_lock, flags);
 
         rcu_read_lock();
         cs = nearest_hardwall_ancestor(task_cs(current));
         allowed = node_isset(node, cs->mems_allowed);
         rcu_read_unlock();
 
-        mutex_unlock(&callback_mutex);
+        spin_unlock_irqrestore(&callback_lock, flags);
         return allowed;
 }
 
-/*
- * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
- * @node: is this an allowed node?
- * @gfp_mask: memory allocation flags
- *
- * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
- * set, yes, we can always allocate. If node is in our task's mems_allowed,
- * yes. If the task has been OOM killed and has access to memory reserves as
- * specified by the TIF_MEMDIE flag, yes.
- * Otherwise, no.
- *
- * The __GFP_THISNODE placement logic is really handled elsewhere,
- * by forcibly using a zonelist starting at a specified node, and by
- * (in get_page_from_freelist()) refusing to consider the zones for
- * any node on the zonelist except the first. By the time any such
- * calls get to this routine, we should just shut up and say 'yes'.
- *
- * Unlike the cpuset_node_allowed_softwall() variant, above,
- * this variant requires that the node be in the current task's
- * mems_allowed or that we're in interrupt. It does not scan up the
- * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
- * It never sleeps.
- */
-int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
-{
-        if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
-                return 1;
-        if (node_isset(node, current->mems_allowed))
-                return 1;
-        /*
-         * Allow tasks that have access to memory reserves because they have
-         * been OOM killed to get memory anywhere.
-         */
-        if (unlikely(test_thread_flag(TIF_MEMDIE)))
-                return 1;
-        return 0;
-}
-
 /**
  * cpuset_mem_spread_node() - On which node to begin search for a file page
  * cpuset_slab_spread_node() - On which node to begin search for a slab page
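
The cpuset changes above replace callback_mutex with callback_lock, a spinlock, so query paths such as __cpuset_node_allowed() no longer need to sleep; callers whose interrupt state is unknown use the irqsave variants. Below is a stripped-down sketch of that locking pattern, using invented names (example_lock, example_mask, example_update, example_node_allowed) rather than the real cpuset internals.

/* Illustrative pattern only; all names here are invented for this sketch. */
#include <linux/spinlock.h>
#include <linux/nodemask.h>
#include <linux/types.h>

static DEFINE_SPINLOCK(example_lock);
static nodemask_t example_mask;

/* Updater: process context with interrupts enabled, like update_nodemask(). */
static void example_update(const nodemask_t *new_mask)
{
        spin_lock_irq(&example_lock);
        example_mask = *new_mask;
        spin_unlock_irq(&example_lock);
}

/*
 * Query helper: may be reached from contexts where the irq state is not
 * known (e.g. deep in the page allocator), so save and restore flags,
 * the same way __cpuset_node_allowed() does above.
 */
static bool example_node_allowed(int node)
{
        unsigned long flags;
        bool allowed;

        spin_lock_irqsave(&example_lock, flags);
        allowed = node_isset(node, example_mask);
        spin_unlock_irqrestore(&example_lock, flags);

        return allowed;
}

The design point is that once the lock never sleeps, the might_sleep_if() annotation and the separate hardwall-only helper removed above become unnecessary.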
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 3ab9048483fa..cbf9fb899d92 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -175,11 +175,11 @@ EXPORT_SYMBOL_GPL(irq_work_run);
 
 void irq_work_tick(void)
 {
-        struct llist_head *raised = &__get_cpu_var(raised_list);
+        struct llist_head *raised = this_cpu_ptr(&raised_list);
 
         if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
                 irq_work_run_list(raised);
-        irq_work_run_list(&__get_cpu_var(lazy_list));
+        irq_work_run_list(this_cpu_ptr(&lazy_list));
 }
 
 /*
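
__get_cpu_var() was being phased out across the tree at this point; this_cpu_ptr(&var) is the drop-in replacement when a pointer to the local CPU's instance is wanted, as in irq_work_tick() above. A small sketch of the conversion, using an invented per-cpu list:

/* Sketch of the accessor conversion; my_list and my_local_drain() are made up. */
#include <linux/percpu.h>
#include <linux/llist.h>

static DEFINE_PER_CPU(struct llist_head, my_list);

static void my_local_drain(void)
{
        /* Old style: struct llist_head *head = &__get_cpu_var(my_list); */
        struct llist_head *head = this_cpu_ptr(&my_list);
        struct llist_node *pending;

        if (llist_empty(head))
                return;

        /* Atomically detach everything queued on this CPU and walk it. */
        pending = llist_del_all(head);
        /* ... process the detached nodes in @pending ... */
}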
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 831978cebf1d..06f58309fed2 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1410,16 +1410,10 @@ static inline int check_kprobe_rereg(struct kprobe *p)
         return ret;
 }
 
-static int check_kprobe_address_safe(struct kprobe *p,
-                                     struct module **probed_mod)
+int __weak arch_check_ftrace_location(struct kprobe *p)
 {
-        int ret = 0;
         unsigned long ftrace_addr;
 
-        /*
-         * If the address is located on a ftrace nop, set the
-         * breakpoint to the following instruction.
-         */
         ftrace_addr = ftrace_location((unsigned long)p->addr);
         if (ftrace_addr) {
 #ifdef CONFIG_KPROBES_ON_FTRACE
@@ -1431,7 +1425,17 @@ static int check_kprobe_address_safe(struct kprobe *p,
                 return -EINVAL;
 #endif
         }
+        return 0;
+}
 
+static int check_kprobe_address_safe(struct kprobe *p,
+                                     struct module **probed_mod)
+{
+        int ret;
+
+        ret = arch_check_ftrace_location(p);
+        if (ret)
+                return ret;
         jump_label_lock();
         preempt_disable();
 
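
The kprobes hunks above turn the ftrace-location check into a __weak arch_check_ftrace_location() hook that an architecture can override. A sketch of what such an override might look like; this is illustrative only, and note that an override replaces the generic check entirely, so it has to make its own decision about probes that land on ftrace locations.

/* Hypothetical architecture-side override; not taken from any real arch. */
#include <linux/kprobes.h>
#include <linux/ftrace.h>

int arch_check_ftrace_location(struct kprobe *p)
{
        /*
         * This variant simply tags probes sitting on ftrace locations and
         * lets the ftrace-based kprobe machinery handle them, instead of
         * applying the generic CONFIG_KPROBES_ON_FTRACE gating above.
         */
        if (ftrace_location((unsigned long)p->addr))
                p->flags |= KPROBE_FLAG_FTRACE;
        return 0;
}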
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 02aa4185b17e..61eea02b53f5 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -169,6 +169,8 @@ cond_syscall(ppc_rtas);
 cond_syscall(sys_spu_run);
 cond_syscall(sys_spu_create);
 cond_syscall(sys_subpage_prot);
+cond_syscall(sys_s390_pci_mmio_read);
+cond_syscall(sys_s390_pci_mmio_write);
 
 /* mmu depending weak syscall entries */
 cond_syscall(sys_mprotect);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 2e949cc9c9f1..b79f39bda7e1 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -792,7 +792,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
         /* Initialize mult/shift and max_idle_ns */
         __clocksource_updatefreq_scale(cs, scale, freq);
 
-        /* Add clocksource to the clcoksource list */
+        /* Add clocksource to the clocksource list */
         mutex_lock(&clocksource_mutex);
         clocksource_enqueue(cs);
         clocksource_enqueue_watchdog(cs);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1f4356037a7d..4d54b7540585 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -235,7 +235,7 @@ void tick_nohz_full_kick(void)
         if (!tick_nohz_full_cpu(smp_processor_id()))
                 return;
 
-        irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
+        irq_work_queue(this_cpu_ptr(&nohz_full_kick_work));
 }
 
 /*
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1af4f8f2ab5d..ab76b7bcb36a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2031,7 +2031,7 @@ void trace_printk_init_buffers(void)
         pr_warning("** trace_printk() being used. Allocating extra memory.  **\n");
         pr_warning("**                                                      **\n");
         pr_warning("** This means that this is a DEBUG kernel and it is     **\n");
-        pr_warning("** unsafe for produciton use.                           **\n");
+        pr_warning("** unsafe for production use.                           **\n");
         pr_warning("**                                                      **\n");
         pr_warning("** If you see this message and you are not debugging    **\n");
         pr_warning("** the kernel, report this immediately to your vendor!  **\n");
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 09b685daee3d..6202b08f1933 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1804,8 +1804,8 @@ static void pool_mayday_timeout(unsigned long __pool)
         struct worker_pool *pool = (void *)__pool;
         struct work_struct *work;
 
-        spin_lock_irq(&wq_mayday_lock);         /* for wq->maydays */
-        spin_lock(&pool->lock);
+        spin_lock_irq(&pool->lock);
+        spin_lock(&wq_mayday_lock);             /* for wq->maydays */
 
         if (need_to_create_worker(pool)) {
                 /*
@@ -1818,8 +1818,8 @@ static void pool_mayday_timeout(unsigned long __pool)
                         send_mayday(work);
         }
 
-        spin_unlock(&pool->lock);
-        spin_unlock_irq(&wq_mayday_lock);
+        spin_unlock(&wq_mayday_lock);
+        spin_unlock_irq(&pool->lock);
 
         mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
 }
@@ -2248,12 +2248,30 @@ repeat:
                  * Slurp in all works issued via this workqueue and
                  * process'em.
                  */
-                WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
+                WARN_ON_ONCE(!list_empty(scheduled));
                 list_for_each_entry_safe(work, n, &pool->worklist, entry)
                         if (get_work_pwq(work) == pwq)
                                 move_linked_works(work, scheduled, &n);
 
-                process_scheduled_works(rescuer);
+                if (!list_empty(scheduled)) {
+                        process_scheduled_works(rescuer);
+
+                        /*
+                         * The above execution of rescued work items could
+                         * have created more to rescue through
+                         * pwq_activate_first_delayed() or chained
+                         * queueing. Let's put @pwq back on mayday list so
+                         * that such back-to-back work items, which may be
+                         * being used to relieve memory pressure, don't
+                         * incur MAYDAY_INTERVAL delay inbetween.
+                         */
+                        if (need_to_create_worker(pool)) {
+                                spin_lock(&wq_mayday_lock);
+                                get_pwq(pwq);
+                                list_move_tail(&pwq->mayday_node, &wq->maydays);
+                                spin_unlock(&wq_mayday_lock);
+                        }
+                }
 
                 /*
                  * Put the reference grabbed by send_mayday(). @pool won't