Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/arraymap.c      |   2
-rw-r--r--  kernel/bpf/devmap.c        |  10
-rw-r--r--  kernel/bpf/hashtab.c       |   4
-rw-r--r--  kernel/bpf/inode.c         |   1
-rw-r--r--  kernel/bpf/sockmap.c       |  43
-rw-r--r--  kernel/bpf/verifier.c      |  70
-rw-r--r--  kernel/cpu.c               |   5
-rw-r--r--  kernel/events/core.c       |  10
-rw-r--r--  kernel/exit.c              |   6
-rw-r--r--  kernel/fork.c              |   4
-rw-r--r--  kernel/irq/chip.c          |   2
-rw-r--r--  kernel/irq/cpuhotplug.c    |  28
-rw-r--r--  kernel/irq/generic-chip.c  |  15
-rw-r--r--  kernel/irq/manage.c        |  17
-rw-r--r--  kernel/livepatch/core.c    |  60
-rw-r--r--  kernel/locking/lockdep.c   |  48
-rw-r--r--  kernel/rcu/srcutree.c      |   2
-rw-r--r--  kernel/rcu/sync.c          |   9
-rw-r--r--  kernel/rcu/tree.c          |  18
-rw-r--r--  kernel/sched/fair.c        | 140
-rw-r--r--  kernel/sched/features.h    |   3
-rw-r--r--  kernel/sched/membarrier.c  |  34
-rw-r--r--  kernel/seccomp.c           |   2
-rw-r--r--  kernel/workqueue.c         |  37
24 files changed, 345 insertions, 225 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 98c0f00c3f5e..e2636737b69b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -98,7 +98,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();
 
 	if (array_size >= U32_MAX - PAGE_SIZE ||
-	    elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
+	    bpf_array_alloc_percpu(array)) {
 		bpf_map_area_free(array);
 		return ERR_PTR(-ENOMEM);
 	}
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index e093d9a2c4dd..e745d6a88224 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -69,7 +69,7 @@ static LIST_HEAD(dev_map_list);
 
 static u64 dev_map_bitmap_size(const union bpf_attr *attr)
 {
-	return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
+	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
 }
 
 static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
@@ -78,6 +78,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	int err = -EINVAL;
 	u64 cost;
 
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
@@ -111,8 +114,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	err = -ENOMEM;
 
 	/* A per cpu bitfield with a bit per possible net device */
-	dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr),
-					    __alignof__(unsigned long));
+	dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
+						__alignof__(unsigned long),
+						GFP_KERNEL | __GFP_NOWARN);
 	if (!dtab->flush_needed)
 		goto free_dtab;
 
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 431126f31ea3..6533f08d1238 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -317,10 +317,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		 */
 		goto free_htab;
 
-	if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
-		/* make sure the size for pcpu_alloc() is reasonable */
-		goto free_htab;
-
 	htab->elem_size = sizeof(struct htab_elem) +
 			  round_up(htab->map.key_size, 8);
 	if (percpu)
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index e833ed914358..be1dde967208 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -363,6 +363,7 @@ out:
 	putname(pname);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(bpf_obj_get_user);
 
 static void bpf_evict_inode(struct inode *inode)
 {
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 6424ce0e4969..66f00a2b27f4 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -39,6 +39,7 @@
 #include <linux/workqueue.h>
 #include <linux/list.h>
 #include <net/strparser.h>
+#include <net/tcp.h>
 
 struct bpf_stab {
 	struct bpf_map map;
@@ -92,6 +93,14 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
 	return rcu_dereference_sk_user_data(sk);
 }
 
+/* compute the linear packet data range [data, data_end) for skb when
+ * sk_skb type programs are in use.
+ */
+static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
+{
+	TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
+}
+
 static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 {
 	struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
@@ -101,12 +110,20 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 		return SK_DROP;
 
 	skb_orphan(skb);
+	/* We need to ensure that BPF metadata for maps is also cleared
+	 * when we orphan the skb so that we don't have the possibility
+	 * to reference a stale map.
+	 */
+	TCP_SKB_CB(skb)->bpf.map = NULL;
 	skb->sk = psock->sock;
-	bpf_compute_data_end(skb);
+	bpf_compute_data_end_sk_skb(skb);
+	preempt_disable();
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
+	preempt_enable();
 	skb->sk = NULL;
 
-	return rc;
+	return rc == SK_PASS ?
+		(TCP_SKB_CB(skb)->bpf.map ? SK_REDIRECT : SK_PASS) : SK_DROP;
 }
 
 static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
@@ -114,17 +131,10 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
 	struct sock *sk;
 	int rc;
 
-	/* Because we use per cpu values to feed input from sock redirect
-	 * in BPF program to do_sk_redirect_map() call we need to ensure we
-	 * are not preempted. RCU read lock is not sufficient in this case
-	 * with CONFIG_PREEMPT_RCU enabled so we must be explicit here.
-	 */
-	preempt_disable();
 	rc = smap_verdict_func(psock, skb);
 	switch (rc) {
 	case SK_REDIRECT:
-		sk = do_sk_redirect_map();
-		preempt_enable();
+		sk = do_sk_redirect_map(skb);
 		if (likely(sk)) {
 			struct smap_psock *peer = smap_psock_sk(sk);
 
@@ -141,8 +151,6 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
 	/* Fall through and free skb otherwise */
 	case SK_DROP:
 	default:
-		if (rc != SK_REDIRECT)
-			preempt_enable();
 		kfree_skb(skb);
 	}
 }
@@ -369,7 +377,7 @@ static int smap_parse_func_strparser(struct strparser *strp,
 	 * any socket yet.
 	 */
 	skb->sk = psock->sock;
-	bpf_compute_data_end(skb);
+	bpf_compute_data_end_sk_skb(skb);
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
 	skb->sk = NULL;
 	rcu_read_unlock();
@@ -487,6 +495,9 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 	int err = -EINVAL;
 	u64 cost;
 
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
@@ -840,6 +851,12 @@ static int sock_map_update_elem(struct bpf_map *map,
 		return -EINVAL;
 	}
 
+	if (skops.sk->sk_type != SOCK_STREAM ||
+	    skops.sk->sk_protocol != IPPROTO_TCP) {
+		fput(socket->file);
+		return -EOPNOTSUPP;
+	}
+
 	err = sock_map_ctx_update_elem(&skops, map, key, flags);
 	fput(socket->file);
 	return err;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b914fbe1383e..c48ca2a34b5e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -653,6 +653,10 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)
 {
 	struct bpf_verifier_state *parent = state->parent;
 
+	if (regno == BPF_REG_FP)
+		/* We don't need to worry about FP liveness because it's read-only */
+		return;
+
 	while (parent) {
 		/* if read wasn't screened by an earlier write ... */
 		if (state->regs[regno].live & REG_LIVE_WRITTEN)
@@ -1112,7 +1116,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		/* ctx accesses must be at a fixed offset, so that we can
 		 * determine what type of data were returned.
 		 */
-		if (!tnum_is_const(reg->var_off)) {
+		if (reg->off) {
+			verbose("dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n",
+				regno, reg->off, off - reg->off);
+			return -EACCES;
+		}
+		if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
 			char tn_buf[48];
 
 			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
@@ -1120,7 +1129,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 				tn_buf, off, size);
 			return -EACCES;
 		}
-		off += reg->var_off.value;
 		err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
 		if (!err && t == BPF_READ && value_regno >= 0) {
 			/* ctx access returns either a scalar, or a
@@ -2345,6 +2353,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			 * copy register state to dest reg
 			 */
 			regs[insn->dst_reg] = regs[insn->src_reg];
+			regs[insn->dst_reg].live |= REG_LIVE_WRITTEN;
 		} else {
 			/* R1 = (u32) R2 */
 			if (is_pointer_value(env, insn->src_reg)) {
@@ -2421,12 +2430,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 }
 
 static void find_good_pkt_pointers(struct bpf_verifier_state *state,
-				   struct bpf_reg_state *dst_reg)
+				   struct bpf_reg_state *dst_reg,
+				   bool range_right_open)
 {
 	struct bpf_reg_state *regs = state->regs, *reg;
+	u16 new_range;
 	int i;
 
-	if (dst_reg->off < 0)
+	if (dst_reg->off < 0 ||
+	    (dst_reg->off == 0 && range_right_open))
 		/* This doesn't give us any range */
 		return;
 
@@ -2437,9 +2449,13 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 		 */
 		return;
 
-	/* LLVM can generate four kind of checks:
+	new_range = dst_reg->off;
+	if (range_right_open)
+		new_range--;
+
+	/* Examples for register markings:
 	 *
-	 * Type 1/2:
+	 * pkt_data in dst register:
 	 *
 	 *   r2 = r3;
 	 *   r2 += 8;
@@ -2456,7 +2472,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 	 *   r2=pkt(id=n,off=8,r=0)
 	 *   r3=pkt(id=n,off=0,r=0)
 	 *
-	 * Type 3/4:
+	 * pkt_data in src register:
 	 *
 	 *   r2 = r3;
 	 *   r2 += 8;
@@ -2474,7 +2490,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 	 *   r3=pkt(id=n,off=0,r=0)
 	 *
 	 * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8)
-	 * so that range of bytes [r3, r3 + 8) is safe to access.
+	 * or r3=pkt(id=n,off=0,r=8-1), so that range of bytes [r3, r3 + 8)
+	 * and [r3, r3 + 8-1) respectively is safe to access depending on
+	 * the check.
 	 */
 
 	/* If our ids match, then we must have the same max_value. And we
@@ -2485,14 +2503,14 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 	for (i = 0; i < MAX_BPF_REG; i++)
 		if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
 			/* keep the maximum range already checked */
-			regs[i].range = max_t(u16, regs[i].range, dst_reg->off);
+			regs[i].range = max(regs[i].range, new_range);
 
 	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
 		if (state->stack_slot_type[i] != STACK_SPILL)
 			continue;
 		reg = &state->spilled_regs[i / BPF_REG_SIZE];
 		if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
-			reg->range = max_t(u16, reg->range, dst_reg->off);
+			reg->range = max(reg->range, new_range);
 	}
 }
 
@@ -2856,19 +2874,43 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
 		   dst_reg->type == PTR_TO_PACKET &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		find_good_pkt_pointers(this_branch, dst_reg);
+		/* pkt_data' > pkt_end */
+		find_good_pkt_pointers(this_branch, dst_reg, false);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
+		   dst_reg->type == PTR_TO_PACKET_END &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET) {
+		/* pkt_end > pkt_data' */
+		find_good_pkt_pointers(other_branch, &regs[insn->src_reg], true);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
 		   dst_reg->type == PTR_TO_PACKET &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		find_good_pkt_pointers(other_branch, dst_reg);
+		/* pkt_data' < pkt_end */
+		find_good_pkt_pointers(other_branch, dst_reg, true);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
+		   dst_reg->type == PTR_TO_PACKET_END &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET) {
+		/* pkt_end < pkt_data' */
+		find_good_pkt_pointers(this_branch, &regs[insn->src_reg], false);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
+		   dst_reg->type == PTR_TO_PACKET &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
+		/* pkt_data' >= pkt_end */
+		find_good_pkt_pointers(this_branch, dst_reg, true);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
 		   dst_reg->type == PTR_TO_PACKET_END &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		find_good_pkt_pointers(other_branch, &regs[insn->src_reg]);
+		/* pkt_end >= pkt_data' */
+		find_good_pkt_pointers(other_branch, &regs[insn->src_reg], false);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
+		   dst_reg->type == PTR_TO_PACKET &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
+		/* pkt_data' <= pkt_end */
+		find_good_pkt_pointers(other_branch, dst_reg, false);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
 		   dst_reg->type == PTR_TO_PACKET_END &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		find_good_pkt_pointers(this_branch, &regs[insn->src_reg]);
+		/* pkt_end <= pkt_data' */
+		find_good_pkt_pointers(this_branch, &regs[insn->src_reg], true);
 	} else if (is_pointer_value(env, insn->dst_reg)) {
 		verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
 		return -EACCES;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d851df22f5c5..04892a82f6ac 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -632,6 +632,11 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
 		__cpuhp_kick_ap(st);
 	}
 
+	/*
+	 * Clean up the leftovers so the next hotplug operation wont use stale
+	 * data.
+	 */
+	st->node = st->last = NULL;
 	return ret;
 }
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6bc21e202ae4..9d93db81fa36 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -662,7 +662,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
 	/*
 	 * Do not update time when cgroup is not active
 	 */
-	if (cgrp == event->cgrp)
+	if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
 		__update_cgrp_time(event->cgrp);
 }
 
@@ -8955,6 +8955,14 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
 
 static void free_pmu_context(struct pmu *pmu)
 {
+	/*
+	 * Static contexts such as perf_sw_context have a global lifetime
+	 * and may be shared between different PMUs. Avoid freeing them
+	 * when a single PMU is going away.
+	 */
+	if (pmu->task_ctx_nr > perf_invalid_context)
+		return;
+
 	mutex_lock(&pmus_lock);
 	free_percpu(pmu->pmu_cpu_context);
 	mutex_unlock(&pmus_lock);
diff --git a/kernel/exit.c b/kernel/exit.c
index f2cd53e92147..f6cad39f35df 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1610,6 +1610,9 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 	if (!infop)
 		return err;
 
+	if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
+		return -EFAULT;
+
 	user_access_begin();
 	unsafe_put_user(signo, &infop->si_signo, Efault);
 	unsafe_put_user(0, &infop->si_errno, Efault);
@@ -1735,6 +1738,9 @@ COMPAT_SYSCALL_DEFINE5(waitid,
 	if (!infop)
 		return err;
 
+	if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
+		return -EFAULT;
+
 	user_access_begin();
 	unsafe_put_user(signo, &infop->si_signo, Efault);
 	unsafe_put_user(0, &infop->si_errno, Efault);
diff --git a/kernel/fork.c b/kernel/fork.c
index e702cb9ffbd8..07cc743698d3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -215,6 +215,10 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 		if (!s)
 			continue;
 
+#ifdef CONFIG_DEBUG_KMEMLEAK
+		/* Clear stale pointers from reused stack. */
+		memset(s->addr, 0, THREAD_SIZE);
+#endif
 		tsk->stack_vm_area = s;
 		return s->addr;
 	}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6fc89fd93824..5a2ef92c2782 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -265,8 +265,8 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
 		irq_setup_affinity(desc);
 		break;
 	case IRQ_STARTUP_MANAGED:
+		irq_do_set_affinity(d, aff, false);
 		ret = __irq_startup(desc);
-		irq_set_affinity_locked(d, aff, false);
 		break;
 	case IRQ_STARTUP_ABORT:
 		return 0;
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 638eb9c83d9f..9eb09aef0313 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -18,8 +18,34 @@
 static inline bool irq_needs_fixup(struct irq_data *d)
 {
 	const struct cpumask *m = irq_data_get_effective_affinity_mask(d);
+	unsigned int cpu = smp_processor_id();
 
-	return cpumask_test_cpu(smp_processor_id(), m);
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+	/*
+	 * The cpumask_empty() check is a workaround for interrupt chips,
+	 * which do not implement effective affinity, but the architecture has
+	 * enabled the config switch. Use the general affinity mask instead.
+	 */
+	if (cpumask_empty(m))
+		m = irq_data_get_affinity_mask(d);
+
+	/*
+	 * Sanity check. If the mask is not empty when excluding the outgoing
+	 * CPU then it must contain at least one online CPU. The outgoing CPU
+	 * has been removed from the online mask already.
+	 */
+	if (cpumask_any_but(m, cpu) < nr_cpu_ids &&
+	    cpumask_any_and(m, cpu_online_mask) >= nr_cpu_ids) {
+		/*
+		 * If this happens then there was a missed IRQ fixup at some
+		 * point. Warn about it and enforce fixup.
+		 */
+		pr_warn("Eff. affinity %*pbl of IRQ %u contains only offline CPUs after offlining CPU %u\n",
+			cpumask_pr_args(m), d->irq, cpu);
+		return true;
+	}
+#endif
+	return cpumask_test_cpu(cpu, m);
 }
 
 static bool migrate_one_irq(struct irq_desc *desc)
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 5270a54b9fa4..c26c5bb6b491 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -135,17 +135,26 @@ void irq_gc_ack_clr_bit(struct irq_data *d)
 }
 
 /**
- * irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt
+ * irq_gc_mask_disable_and_ack_set - Mask and ack pending interrupt
  * @d: irq_data
+ *
+ * This generic implementation of the irq_mask_ack method is for chips
+ * with separate enable/disable registers instead of a single mask
+ * register and where a pending interrupt is acknowledged by setting a
+ * bit.
+ *
+ * Note: This is the only permutation currently used. Similar generic
+ * functions should be added here if other permutations are required.
  */
-void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
+void irq_gc_mask_disable_and_ack_set(struct irq_data *d)
 {
 	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
 	struct irq_chip_type *ct = irq_data_get_chip_type(d);
 	u32 mask = d->mask;
 
 	irq_gc_lock(gc);
-	irq_reg_writel(gc, mask, ct->regs.mask);
+	irq_reg_writel(gc, mask, ct->regs.disable);
+	*ct->mask_cache &= ~mask;
 	irq_reg_writel(gc, mask, ct->regs.ack);
 	irq_gc_unlock(gc);
 }
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d00132b5c325..4bff6a10ae8e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -168,6 +168,19 @@ void irq_set_thread_affinity(struct irq_desc *desc)
 		set_bit(IRQTF_AFFINITY, &action->thread_flags);
 }
 
+static void irq_validate_effective_affinity(struct irq_data *data)
+{
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+	const struct cpumask *m = irq_data_get_effective_affinity_mask(data);
+	struct irq_chip *chip = irq_data_get_irq_chip(data);
+
+	if (!cpumask_empty(m))
+		return;
+	pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n",
+		     chip->name, data->irq);
+#endif
+}
+
 int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
 			bool force)
 {
@@ -175,12 +188,16 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
 	struct irq_chip *chip = irq_data_get_irq_chip(data);
 	int ret;
 
+	if (!chip || !chip->irq_set_affinity)
+		return -EINVAL;
+
 	ret = chip->irq_set_affinity(data, mask, force);
 	switch (ret) {
 	case IRQ_SET_MASK_OK:
 	case IRQ_SET_MASK_OK_DONE:
 		cpumask_copy(desc->irq_common_data.affinity, mask);
 	case IRQ_SET_MASK_OK_NOCOPY:
+		irq_validate_effective_affinity(data);
 		irq_set_thread_affinity(desc);
 		ret = 0;
 	}
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index b9628e43c78f..bf8c8fd72589 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -830,6 +830,41 @@ int klp_register_patch(struct klp_patch *patch)
 }
 EXPORT_SYMBOL_GPL(klp_register_patch);
 
+/*
+ * Remove parts of patches that touch a given kernel module. The list of
+ * patches processed might be limited. When limit is NULL, all patches
+ * will be handled.
+ */
+static void klp_cleanup_module_patches_limited(struct module *mod,
+					       struct klp_patch *limit)
+{
+	struct klp_patch *patch;
+	struct klp_object *obj;
+
+	list_for_each_entry(patch, &klp_patches, list) {
+		if (patch == limit)
+			break;
+
+		klp_for_each_object(patch, obj) {
+			if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
+				continue;
+
+			/*
+			 * Only unpatch the module if the patch is enabled or
+			 * is in transition.
+			 */
+			if (patch->enabled || patch == klp_transition_patch) {
+				pr_notice("reverting patch '%s' on unloading module '%s'\n",
+					  patch->mod->name, obj->mod->name);
+				klp_unpatch_object(obj);
+			}
+
+			klp_free_object_loaded(obj);
+			break;
+		}
+	}
+}
+
 int klp_module_coming(struct module *mod)
 {
 	int ret;
@@ -894,7 +929,7 @@ err:
 	pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n",
 		patch->mod->name, obj->mod->name, obj->mod->name);
 	mod->klp_alive = false;
-	klp_free_object_loaded(obj);
+	klp_cleanup_module_patches_limited(mod, patch);
 	mutex_unlock(&klp_mutex);
 
 	return ret;
@@ -902,9 +937,6 @@ err:
 
 void klp_module_going(struct module *mod)
 {
-	struct klp_patch *patch;
-	struct klp_object *obj;
-
 	if (WARN_ON(mod->state != MODULE_STATE_GOING &&
 		    mod->state != MODULE_STATE_COMING))
 		return;
@@ -917,25 +949,7 @@ void klp_module_going(struct module *mod)
 	 */
 	mod->klp_alive = false;
 
-	list_for_each_entry(patch, &klp_patches, list) {
-		klp_for_each_object(patch, obj) {
-			if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
-				continue;
-
-			/*
-			 * Only unpatch the module if the patch is enabled or
-			 * is in transition.
-			 */
-			if (patch->enabled || patch == klp_transition_patch) {
-				pr_notice("reverting patch '%s' on unloading module '%s'\n",
-					  patch->mod->name, obj->mod->name);
-				klp_unpatch_object(obj);
-			}
-
-			klp_free_object_loaded(obj);
-			break;
-		}
-	}
+	klp_cleanup_module_patches_limited(mod, NULL);
 
 	mutex_unlock(&klp_mutex);
 }
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 44c8d0d17170..e36e652d996f 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1873,10 +1873,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	       struct held_lock *next, int distance, struct stack_trace *trace,
 	       int (*save)(struct stack_trace *trace))
 {
+	struct lock_list *uninitialized_var(target_entry);
 	struct lock_list *entry;
-	int ret;
 	struct lock_list this;
-	struct lock_list *uninitialized_var(target_entry);
+	int ret;
 
 	/*
 	 * Prove that the new <prev> -> <next> dependency would not
@@ -1890,8 +1890,17 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	this.class = hlock_class(next);
 	this.parent = NULL;
 	ret = check_noncircular(&this, hlock_class(prev), &target_entry);
-	if (unlikely(!ret))
+	if (unlikely(!ret)) {
+		if (!trace->entries) {
+			/*
+			 * If @save fails here, the printing might trigger
+			 * a WARN but because of the !nr_entries it should
+			 * not do bad things.
+			 */
+			save(trace);
+		}
 		return print_circular_bug(&this, target_entry, next, prev, trace);
+	}
 	else if (unlikely(ret < 0))
 		return print_bfs_bug(ret);
 
@@ -1938,7 +1947,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 		return print_bfs_bug(ret);
 
 
-	if (save && !save(trace))
+	if (!trace->entries && !save(trace))
 		return 0;
 
 	/*
@@ -1958,20 +1967,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	if (!ret)
 		return 0;
 
-	/*
-	 * Debugging printouts:
-	 */
-	if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
-		graph_unlock();
-		printk("\n new dependency: ");
-		print_lock_name(hlock_class(prev));
-		printk(KERN_CONT " => ");
-		print_lock_name(hlock_class(next));
-		printk(KERN_CONT "\n");
-		dump_stack();
-		if (!graph_lock())
-			return 0;
-	}
 	return 2;
 }
 
@@ -1986,8 +1981,12 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
 {
 	int depth = curr->lockdep_depth;
 	struct held_lock *hlock;
-	struct stack_trace trace;
-	int (*save)(struct stack_trace *trace) = save_trace;
+	struct stack_trace trace = {
+		.nr_entries = 0,
+		.max_entries = 0,
+		.entries = NULL,
+		.skip = 0,
+	};
 
 	/*
 	 * Debugging checks.
@@ -2018,18 +2017,11 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
 		 */
 		if (hlock->read != 2 && hlock->check) {
 			int ret = check_prev_add(curr, hlock, next,
-						 distance, &trace, save);
+						 distance, &trace, save_trace);
 			if (!ret)
 				return 0;
 
 			/*
-			 * Stop saving stack_trace if save_trace() was
-			 * called at least once:
-			 */
-			if (save && ret == 2)
-				save = NULL;
-
-			/*
 			 * Stop after the first non-trylock entry,
 			 * as non-trylock entries have added their
 			 * own direct dependencies already, so this
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 729a8706751d..6d5880089ff6 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -854,7 +854,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
 /**
  * call_srcu() - Queue a callback for invocation after an SRCU grace period
  * @sp: srcu_struct in queue the callback
- * @head: structure to be used for queueing the SRCU callback.
+ * @rhp: structure to be used for queueing the SRCU callback.
  * @func: function to be invoked after the SRCU grace period
  *
  * The callback function will be invoked some time after a full SRCU
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 50d1861f7759..3f943efcf61c 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -85,6 +85,9 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
 }
 
 /**
+ * rcu_sync_enter_start - Force readers onto slow path for multiple updates
+ * @rsp: Pointer to rcu_sync structure to use for synchronization
+ *
  * Must be called after rcu_sync_init() and before first use.
  *
  * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}()
@@ -142,7 +145,7 @@ void rcu_sync_enter(struct rcu_sync *rsp)
 
 /**
  * rcu_sync_func() - Callback function managing reader access to fastpath
- * @rsp: Pointer to rcu_sync structure to use for synchronization
+ * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization
  *
  * This function is passed to one of the call_rcu() functions by
  * rcu_sync_exit(), so that it is invoked after a grace period following the
@@ -158,9 +161,9 @@ void rcu_sync_enter(struct rcu_sync *rsp)
  * rcu_sync_exit(). Otherwise, set all state back to idle so that readers
  * can again use their fastpaths.
  */
-static void rcu_sync_func(struct rcu_head *rcu)
+static void rcu_sync_func(struct rcu_head *rhp)
 {
-	struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head);
+	struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head);
 	unsigned long flags;
 
 	BUG_ON(rsp->gp_state != GP_PASSED);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b0ad62b0e7b8..3e3650e94ae6 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3097,9 +3097,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
  * read-side critical sections have completed. call_rcu_sched() assumes
  * that the read-side critical sections end on enabling of preemption
  * or on voluntary preemption.
- * RCU read-side critical sections are delimited by :
- * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR
- * - anything that disables preemption.
+ * RCU read-side critical sections are delimited by:
+ *
+ * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR
+ * - anything that disables preemption.
  *
  * These may be nested.
  *
@@ -3124,11 +3125,12 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
  * handler. This means that read-side critical sections in process
  * context must not be interrupted by softirqs. This interface is to be
  * used when most of the read-side critical sections are in softirq context.
- * RCU read-side critical sections are delimited by :
- * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context.
- * OR
+ * RCU read-side critical sections are delimited by:
+ *
+ * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context, OR
  * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
- * These may be nested.
+ *
+ * These may be nested.
  *
  * See the description of call_rcu() for more detailed information on
  * memory ordering guarantees.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 70ba32e08a23..d3f3094856fe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5356,91 +5356,62 @@ static int wake_wide(struct task_struct *p)
 	return 1;
 }
 
-struct llc_stats {
-	unsigned long nr_running;
-	unsigned long load;
-	unsigned long capacity;
-	int has_capacity;
-};
+/*
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
+ *
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
+ *			  will be) idle.
+ *
+ * wake_affine_weight() - considers the weight to reflect the average
+ *			  scheduling latency of the CPUs. This seems to work
+ *			  for the overloaded case.
+ */
 
-static bool get_llc_stats(struct llc_stats *stats, int cpu)
-{
-	struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-
-	if (!sds)
-		return false;
+static bool
+wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
+		 int this_cpu, int prev_cpu, int sync)
+{
+	if (idle_cpu(this_cpu))
+		return true;
 
-	stats->nr_running = READ_ONCE(sds->nr_running);
-	stats->load = READ_ONCE(sds->load);
-	stats->capacity = READ_ONCE(sds->capacity);
-	stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu);
+	if (sync && cpu_rq(this_cpu)->nr_running == 1)
+		return true;
 
-	return true;
+	return false;
 }
 
-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- *
- * Since we're running on 'stale' values, we might in fact create an imbalance
- * but recomputing these values is expensive, as that'd mean iteration 2 cache
- * domains worth of CPUs.
- */
 static bool
-wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
-		int this_cpu, int prev_cpu, int sync)
+wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
+		   int this_cpu, int prev_cpu, int sync)
 {
-	struct llc_stats prev_stats, this_stats;
 	s64 this_eff_load, prev_eff_load;
 	unsigned long task_load;
 
-	if (!get_llc_stats(&prev_stats, prev_cpu) ||
-	    !get_llc_stats(&this_stats, this_cpu))
-		return false;
+	this_eff_load = target_load(this_cpu, sd->wake_idx);
+	prev_eff_load = source_load(prev_cpu, sd->wake_idx);
 
-	/*
-	 * If sync wakeup then subtract the (maximum possible)
-	 * effect of the currently running task from the load
-	 * of the current LLC.
-	 */
 	if (sync) {
 		unsigned long current_load = task_h_load(current);
 
-		/* in this case load hits 0 and this LLC is considered 'idle' */
-		if (current_load > this_stats.load)
+		if (current_load > this_eff_load)
 			return true;
 
-		this_stats.load -= current_load;
+		this_eff_load -= current_load;
 	}
 
-	/*
-	 * The has_capacity stuff is not SMT aware, but by trying to balance
-	 * the nr_running on both ends we try and fill the domain at equal
-	 * rates, thereby first consuming cores before siblings.
-	 */
-
-	/* if the old cache has capacity, stay there */
-	if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
-		return false;
-
-	/* if this cache has capacity, come here */
-	if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
-		return true;
-
-	/*
-	 * Check to see if we can move the load without causing too much
-	 * imbalance.
-	 */
 	task_load = task_h_load(p);
 
-	this_eff_load = 100;
-	this_eff_load *= prev_stats.capacity;
-
-	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-	prev_eff_load *= this_stats.capacity;
+	this_eff_load += task_load;
+	if (sched_feat(WA_BIAS))
+		this_eff_load *= 100;
+	this_eff_load *= capacity_of(prev_cpu);
 
-	this_eff_load *= this_stats.load + task_load;
-	prev_eff_load *= prev_stats.load - task_load;
+	prev_eff_load -= task_load;
+	if (sched_feat(WA_BIAS))
+		prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
+	prev_eff_load *= capacity_of(this_cpu);
 
 	return this_eff_load <= prev_eff_load;
 }
@@ -5449,22 +5420,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 		       int prev_cpu, int sync)
 {
 	int this_cpu = smp_processor_id();
-	bool affine;
+	bool affine = false;
 
-	/*
-	 * Default to no affine wakeups; wake_affine() should not effect a task
-	 * placement the load-balancer feels inclined to undo. The conservative
-	 * option is therefore to not move tasks when they wake up.
-	 */
-	affine = false;
+	if (sched_feat(WA_IDLE) && !affine)
+		affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
 
-	/*
-	 * If the wakeup is across cache domains, try to evaluate if movement
-	 * makes sense, otherwise rely on select_idle_siblings() to do
-	 * placement inside the cache domain.
-	 */
-	if (!cpus_share_cache(prev_cpu, this_cpu))
-		affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
+	if (sched_feat(WA_WEIGHT) && !affine)
+		affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
 
 	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
 	if (affine) {
@@ -7600,7 +7562,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
-	struct sched_domain_shared *shared = env->sd->shared;
 	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats *local = &sds->local_stat;
@@ -7672,22 +7633,6 @@ next_group:
 		if (env->dst_rq->rd->overload != overload)
 			env->dst_rq->rd->overload = overload;
 	}
-
-	if (!shared)
-		return;
-
-	/*
-	 * Since these are sums over groups they can contain some CPUs
-	 * multiple times for the NUMA domains.
-	 *
-	 * Currently only wake_affine_llc() and find_busiest_group()
-	 * uses these numbers, only the last is affected by this problem.
-	 *
-	 * XXX fix that.
-	 */
-	WRITE_ONCE(shared->nr_running, sds->total_running);
-	WRITE_ONCE(shared->load, sds->total_load);
-	WRITE_ONCE(shared->capacity, sds->total_capacity);
 }
 
 /**
@@ -8098,6 +8043,13 @@ static int should_we_balance(struct lb_env *env)
 	int cpu, balance_cpu = -1;
 
 	/*
+	 * Ensure the balancing environment is consistent; can happen
+	 * when the softirq triggers 'during' hotplug.
+	 */
+	if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
+		return 0;
+
+	/*
 	 * In the newly idle case, we will allow all the cpu's
 	 * to do the newly idle load balance.
 	 */
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index d3fb15555291..319ed0e8a347 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -81,3 +81,6 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
+SCHED_FEAT(WA_IDLE, true)
+SCHED_FEAT(WA_WEIGHT, true)
+SCHED_FEAT(WA_BIAS, true)
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index a92fddc22747..dd7908743dab 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -18,6 +18,7 @@
 #include <linux/membarrier.h>
 #include <linux/tick.h>
 #include <linux/cpumask.h>
+#include <linux/atomic.h>
 
 #include "sched.h"	/* for cpu_rq(). */
 
@@ -26,21 +27,26 @@
  * except MEMBARRIER_CMD_QUERY.
  */
 #define MEMBARRIER_CMD_BITMASK	\
-	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED)
+	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED	\
+	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)
 
 static void ipi_mb(void *info)
 {
 	smp_mb();	/* IPIs should be serializing but paranoid. */
 }
 
-static void membarrier_private_expedited(void)
+static int membarrier_private_expedited(void)
 {
 	int cpu;
 	bool fallback = false;
 	cpumask_var_t tmpmask;
 
+	if (!(atomic_read(&current->mm->membarrier_state)
+			& MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
+		return -EPERM;
+
 	if (num_online_cpus() == 1)
-		return;
+		return 0;
 
 	/*
 	 * Matches memory barriers around rq->curr modification in
@@ -94,6 +100,24 @@ static void membarrier_private_expedited(void)
 	 * rq->curr modification in scheduler.
 	 */
 	smp_mb();	/* exit from system call is not a mb */
+	return 0;
+}
+
+static void membarrier_register_private_expedited(void)
+{
+	struct task_struct *p = current;
+	struct mm_struct *mm = p->mm;
+
+	/*
+	 * We need to consider threads belonging to different thread
+	 * groups, which use the same mm. (CLONE_VM but not
+	 * CLONE_THREAD).
+	 */
+	if (atomic_read(&mm->membarrier_state)
+			& MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)
+		return;
+	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+		  &mm->membarrier_state);
 }
 
 /**
@@ -144,7 +168,9 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
 		synchronize_sched();
 		return 0;
 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
-		membarrier_private_expedited();
+		return membarrier_private_expedited();
+	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
+		membarrier_register_private_expedited();
 		return 0;
 	default:
 		return -EINVAL;
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index bb3a38005b9c..0ae832e13b97 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -473,7 +473,7 @@ static long seccomp_attach_filter(unsigned int flags,
 	return 0;
 }
 
-void __get_seccomp_filter(struct seccomp_filter *filter)
+static void __get_seccomp_filter(struct seccomp_filter *filter)
 {
 	/* Reference count is bounded by the number of total processes. */
 	refcount_inc(&filter->usage);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 64d0edf428f8..a2dccfe1acec 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -68,6 +68,7 @@ enum {
 	 * attach_mutex to avoid changing binding state while
 	 * worker_attach_to_pool() is in progress.
 	 */
+	POOL_MANAGER_ACTIVE	= 1 << 0,	/* being managed */
 	POOL_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */
 
 	/* worker flags */
@@ -165,7 +166,6 @@ struct worker_pool {
 						/* L: hash of busy workers */
 
 	/* see manage_workers() for details on the two manager mutexes */
-	struct mutex		manager_arb;	/* manager arbitration */
 	struct worker		*manager;	/* L: purely informational */
 	struct mutex		attach_mutex;	/* attach/detach exclusion */
 	struct list_head	workers;	/* A: attached workers */
@@ -299,6 +299,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
 
 static DEFINE_MUTEX(wq_pool_mutex);	/* protects pools and workqueues list */
 static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
+static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */
 
 static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
 static bool workqueue_freezing;		/* PL: have wqs started freezing? */
@@ -801,7 +802,7 @@ static bool need_to_create_worker(struct worker_pool *pool)
 /* Do we have too many workers and should some go away? */
 static bool too_many_workers(struct worker_pool *pool)
 {
-	bool managing = mutex_is_locked(&pool->manager_arb);
+	bool managing = pool->flags & POOL_MANAGER_ACTIVE;
 	int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
 	int nr_busy = pool->nr_workers - nr_idle;
 
@@ -1980,24 +1981,17 @@ static bool manage_workers(struct worker *worker)
 {
 	struct worker_pool *pool = worker->pool;
 
-	/*
-	 * Anyone who successfully grabs manager_arb wins the arbitration
-	 * and becomes the manager. mutex_trylock() on pool->manager_arb
-	 * failure while holding pool->lock reliably indicates that someone
-	 * else is managing the pool and the worker which failed trylock
-	 * can proceed to executing work items. This means that anyone
-	 * grabbing manager_arb is responsible for actually performing
-	 * manager duties. If manager_arb is grabbed and released without
-	 * actual management, the pool may stall indefinitely.
-	 */
-	if (!mutex_trylock(&pool->manager_arb))
+	if (pool->flags & POOL_MANAGER_ACTIVE)
 		return false;
+
+	pool->flags |= POOL_MANAGER_ACTIVE;
 	pool->manager = worker;
 
 	maybe_create_worker(pool);
 
 	pool->manager = NULL;
-	mutex_unlock(&pool->manager_arb);
+	pool->flags &= ~POOL_MANAGER_ACTIVE;
+	wake_up(&wq_manager_wait);
 	return true;
 }
 
@@ -3248,7 +3242,6 @@ static int init_worker_pool(struct worker_pool *pool)
 	setup_timer(&pool->mayday_timer, pool_mayday_timeout,
 		    (unsigned long)pool);
 
-	mutex_init(&pool->manager_arb);
 	mutex_init(&pool->attach_mutex);
 	INIT_LIST_HEAD(&pool->workers);
 
@@ -3318,13 +3311,15 @@ static void put_unbound_pool(struct worker_pool *pool)
 	hash_del(&pool->hash_node);
 
 	/*
-	 * Become the manager and destroy all workers. Grabbing
-	 * manager_arb prevents @pool's workers from blocking on
-	 * attach_mutex.
+	 * Become the manager and destroy all workers. This prevents
+	 * @pool's workers from blocking on attach_mutex. We're the last
+	 * manager and @pool gets freed with the flag set.
 	 */
-	mutex_lock(&pool->manager_arb);
-
 	spin_lock_irq(&pool->lock);
+	wait_event_lock_irq(wq_manager_wait,
+			    !(pool->flags & POOL_MANAGER_ACTIVE), pool->lock);
+	pool->flags |= POOL_MANAGER_ACTIVE;
+
 	while ((worker = first_idle_worker(pool)))
 		destroy_worker(worker);
 	WARN_ON(pool->nr_workers || pool->nr_idle);
@@ -3338,8 +3333,6 @@ static void put_unbound_pool(struct worker_pool *pool)
 	if (pool->detach_completion)
 		wait_for_completion(pool->detach_completion);
 
-	mutex_unlock(&pool->manager_arb);
-
 	/* shut down the timers */
 	del_timer_sync(&pool->idle_timer);
 	del_timer_sync(&pool->mayday_timer);