author	David S. Miller <davem@davemloft.net>	2017-10-22 08:36:53 -0400
committer	David S. Miller <davem@davemloft.net>	2017-10-22 08:39:14 -0400
commit	f8ddadc4db6c7b7029b6d0e0d9af24f74ad27ca2 (patch)
tree	0a6432aba336bae42313613f4c891bcfce02bd4e /kernel
parent	bdd091bab8c631bd2801af838e344fad34566410 (diff)
parent	b5ac3beb5a9f0ef0ea64cd85faf94c0dc4de0e42 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
There were quite a few overlapping sets of changes here.

Daniel's bug fix for off-by-ones in the new BPF branch instructions, along with the added allowances for "data_end > ptr + x" forms, collided with the metadata additions.

Along with those three changes came verifier test cases, which in their final form I tried to group together properly. If I had just trimmed GIT's conflict tags as-is, this would have split up the meta tests unnecessarily.

In the socketmap code, a set of preemption disabling changes overlapped with the rename of bpf_compute_data_end() to bpf_compute_data_pointers().

Changes were made to the mv88e6060.c driver set addr method which got removed in net-next.

The hyperv transport socket layer had a locking change in 'net' which overlapped with a change of socket state macro usage in 'net-next'.

Signed-off-by: David S. Miller <davem@davemloft.net>
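For readers unfamiliar with the bounds-check wording above, the fragment below is a minimal, hypothetical XDP-style sketch; it is not taken from this merge. The program name, the locally defined SEC() macro, and the header field access are illustrative assumptions. It simply shows the two check shapes involved: the classic "ptr + x > data_end" guard and the inverted "data_end > ptr + x" form that the merged verifier changes additionally recognize.

/* Illustrative sketch only -- not part of this merge. */
#include <linux/bpf.h>
#include <linux/if_ether.h>

#define SEC(name) __attribute__((section(name), used))

SEC("xdp")
int xdp_bounds_example(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;
	struct ethhdr *eth = data;

	/* Classic form: "ptr + x > data_end" -- drop short packets, after
	 * which the whole Ethernet header is provably within the packet.
	 */
	if (data + sizeof(*eth) > data_end)
		return XDP_DROP;

	/* Inverted form: "data_end > ptr + x" -- the shape the merged
	 * verifier changes also accept; it yields a right-open range, so
	 * only a conservative one-byte access is shown here.
	 */
	if (data_end > data + sizeof(*eth)) {
		if (eth->h_dest[0] & 1)	/* multicast/broadcast destination */
			return XDP_PASS;
	}

	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";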
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/arraymap.c        2
-rw-r--r--  kernel/bpf/devmap.c         10
-rw-r--r--  kernel/bpf/hashtab.c         4
-rw-r--r--  kernel/bpf/sockmap.c        28
-rw-r--r--  kernel/bpf/verifier.c       82
-rw-r--r--  kernel/events/core.c        10
-rw-r--r--  kernel/exit.c                6
-rw-r--r--  kernel/fork.c                4
-rw-r--r--  kernel/irq/chip.c            2
-rw-r--r--  kernel/irq/cpuhotplug.c     28
-rw-r--r--  kernel/irq/manage.c         17
-rw-r--r--  kernel/livepatch/core.c     60
-rw-r--r--  kernel/locking/lockdep.c    48
-rw-r--r--  kernel/rcu/srcutree.c        2
-rw-r--r--  kernel/rcu/sync.c            9
-rw-r--r--  kernel/rcu/tree.c           18
-rw-r--r--  kernel/sched/fair.c        140
-rw-r--r--  kernel/sched/features.h      3
-rw-r--r--  kernel/sched/membarrier.c   34
-rw-r--r--  kernel/seccomp.c             2
20 files changed, 308 insertions, 201 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 988c04c91e10..7c25426d3cf5 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -102,7 +102,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 		array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();
 
 	if (array_size >= U32_MAX - PAGE_SIZE ||
-	    elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
+	    bpf_array_alloc_percpu(array)) {
 		bpf_map_area_free(array);
 		return ERR_PTR(-ENOMEM);
 	}
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index e5d3de7cff2e..ebdef54bf7df 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -72,7 +72,7 @@ static LIST_HEAD(dev_map_list);
 
 static u64 dev_map_bitmap_size(const union bpf_attr *attr)
 {
-	return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
+	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
 }
 
 static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
@@ -81,6 +81,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	int err = -EINVAL;
 	u64 cost;
 
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
@@ -114,8 +117,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	err = -ENOMEM;
 
 	/* A per cpu bitfield with a bit per possible net device */
-	dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr),
-					    __alignof__(unsigned long));
+	dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
+						__alignof__(unsigned long),
+						GFP_KERNEL | __GFP_NOWARN);
 	if (!dtab->flush_needed)
 		goto free_dtab;
 
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 919955236e63..e469e05c8e83 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -318,10 +318,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		 */
 		goto free_htab;
 
-	if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
-		/* make sure the size for pcpu_alloc() is reasonable */
-		goto free_htab;
-
 	htab->elem_size = sizeof(struct htab_elem) +
 			  round_up(htab->map.key_size, 8);
 	if (percpu)
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 86ec846f2d5e..eef843c3b419 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -39,6 +39,7 @@
 #include <linux/workqueue.h>
 #include <linux/list.h>
 #include <net/strparser.h>
+#include <net/tcp.h>
 
 #define SOCK_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
@@ -104,9 +105,16 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 		return SK_DROP;
 
 	skb_orphan(skb);
+	/* We need to ensure that BPF metadata for maps is also cleared
+	 * when we orphan the skb so that we don't have the possibility
+	 * to reference a stale map.
+	 */
+	TCP_SKB_CB(skb)->bpf.map = NULL;
 	skb->sk = psock->sock;
 	bpf_compute_data_pointers(skb);
+	preempt_disable();
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
+	preempt_enable();
 	skb->sk = NULL;
 
 	return rc;
@@ -117,17 +125,10 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
 	struct sock *sk;
 	int rc;
 
-	/* Because we use per cpu values to feed input from sock redirect
-	 * in BPF program to do_sk_redirect_map() call we need to ensure we
-	 * are not preempted. RCU read lock is not sufficient in this case
-	 * with CONFIG_PREEMPT_RCU enabled so we must be explicit here.
-	 */
-	preempt_disable();
 	rc = smap_verdict_func(psock, skb);
 	switch (rc) {
 	case SK_REDIRECT:
-		sk = do_sk_redirect_map();
-		preempt_enable();
+		sk = do_sk_redirect_map(skb);
 		if (likely(sk)) {
 			struct smap_psock *peer = smap_psock_sk(sk);
 
@@ -144,8 +145,6 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
 		/* Fall through and free skb otherwise */
 	case SK_DROP:
 	default:
-		if (rc != SK_REDIRECT)
-			preempt_enable();
 		kfree_skb(skb);
 	}
 }
@@ -490,6 +489,9 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 	int err = -EINVAL;
 	u64 cost;
 
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 	    attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
@@ -843,6 +845,12 @@ static int sock_map_update_elem(struct bpf_map *map,
 		return -EINVAL;
 	}
 
+	if (skops.sk->sk_type != SOCK_STREAM ||
+	    skops.sk->sk_protocol != IPPROTO_TCP) {
+		fput(socket->file);
+		return -EOPNOTSUPP;
+	}
+
 	err = sock_map_ctx_update_elem(&skops, map, key, flags);
 	fput(socket->file);
 	return err;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 545b8c45a578..d906775e12c1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1006,7 +1006,13 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		/* ctx accesses must be at a fixed offset, so that we can
 		 * determine what type of data were returned.
 		 */
-		if (!tnum_is_const(reg->var_off)) {
+		if (reg->off) {
+			verbose(env,
+				"dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n",
+				regno, reg->off, off - reg->off);
+			return -EACCES;
+		}
+		if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
 			char tn_buf[48];
 
 			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
@@ -1015,7 +1021,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 				tn_buf, off, size);
 			return -EACCES;
 		}
-		off += reg->var_off.value;
 		err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
 		if (!err && t == BPF_READ && value_regno >= 0) {
 			/* ctx access returns either a scalar, or a
@@ -2341,12 +2346,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
 static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 				   struct bpf_reg_state *dst_reg,
-				   enum bpf_reg_type type)
+				   enum bpf_reg_type type,
+				   bool range_right_open)
 {
 	struct bpf_reg_state *regs = state->regs, *reg;
+	u16 new_range;
 	int i;
 
-	if (dst_reg->off < 0)
+	if (dst_reg->off < 0 ||
+	    (dst_reg->off == 0 && range_right_open))
 		/* This doesn't give us any range */
 		return;
 
@@ -2357,9 +2365,13 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 		 */
 		return;
 
-	/* LLVM can generate four kind of checks:
+	new_range = dst_reg->off;
+	if (range_right_open)
+		new_range--;
+
+	/* Examples for register markings:
 	 *
-	 * Type 1/2:
+	 * pkt_data in dst register:
 	 *
 	 * r2 = r3;
 	 * r2 += 8;
@@ -2376,7 +2388,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 	 *   r2=pkt(id=n,off=8,r=0)
 	 *   r3=pkt(id=n,off=0,r=0)
 	 *
-	 * Type 3/4:
+	 * pkt_data in src register:
 	 *
 	 * r2 = r3;
 	 * r2 += 8;
@@ -2394,7 +2406,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 	 *   r3=pkt(id=n,off=0,r=0)
 	 *
 	 * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8)
-	 * so that range of bytes [r3, r3 + 8) is safe to access.
+	 * or r3=pkt(id=n,off=0,r=8-1), so that range of bytes [r3, r3 + 8)
+	 * and [r3, r3 + 8-1) respectively is safe to access depending on
+	 * the check.
 	 */
 
 	/* If our ids match, then we must have the same max_value. And we
@@ -2405,14 +2419,14 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 	for (i = 0; i < MAX_BPF_REG; i++)
 		if (regs[i].type == type && regs[i].id == dst_reg->id)
 			/* keep the maximum range already checked */
-			regs[i].range = max_t(u16, regs[i].range, dst_reg->off);
+			regs[i].range = max(regs[i].range, new_range);
 
 	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
 		if (state->stack_slot_type[i] != STACK_SPILL)
 			continue;
 		reg = &state->spilled_regs[i / BPF_REG_SIZE];
 		if (reg->type == type && reg->id == dst_reg->id)
-			reg->range = max_t(u16, reg->range, dst_reg->off);
+			reg->range = max_t(u16, reg->range, new_range);
 	}
 }
 
@@ -2776,39 +2790,71 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
 		   dst_reg->type == PTR_TO_PACKET &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET);
+		/* pkt_data' > pkt_end */
+		find_good_pkt_pointers(this_branch, dst_reg,
+				       PTR_TO_PACKET, false);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
+		   dst_reg->type == PTR_TO_PACKET_END &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET) {
+		/* pkt_end > pkt_data' */
+		find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
+				       PTR_TO_PACKET, true);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
+		   dst_reg->type == PTR_TO_PACKET &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
+		/* pkt_data' < pkt_end */
+		find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET,
+				       true);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
+		   dst_reg->type == PTR_TO_PACKET_END &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET) {
+		/* pkt_end < pkt_data' */
+		find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
+				       PTR_TO_PACKET, false);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
 		   dst_reg->type == PTR_TO_PACKET &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET);
+		/* pkt_data' >= pkt_end */
+		find_good_pkt_pointers(this_branch, dst_reg,
+				       PTR_TO_PACKET, true);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
 		   dst_reg->type == PTR_TO_PACKET_END &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET) {
+		/* pkt_end >= pkt_data' */
 		find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
-				       PTR_TO_PACKET);
+				       PTR_TO_PACKET, false);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
+		   dst_reg->type == PTR_TO_PACKET &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
+		/* pkt_data' <= pkt_end */
+		find_good_pkt_pointers(other_branch, dst_reg,
+				       PTR_TO_PACKET, false);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
 		   dst_reg->type == PTR_TO_PACKET_END &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET) {
+		/* pkt_end <= pkt_data' */
 		find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
-				       PTR_TO_PACKET);
+				       PTR_TO_PACKET, true);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
 		   dst_reg->type == PTR_TO_PACKET_META &&
 		   reg_is_init_pkt_pointer(&regs[insn->src_reg], PTR_TO_PACKET)) {
-		find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET_META);
+		find_good_pkt_pointers(this_branch, dst_reg,
+				       PTR_TO_PACKET_META, false);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
 		   dst_reg->type == PTR_TO_PACKET_META &&
 		   reg_is_init_pkt_pointer(&regs[insn->src_reg], PTR_TO_PACKET)) {
-		find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET_META);
+		find_good_pkt_pointers(other_branch, dst_reg,
+				       PTR_TO_PACKET_META, false);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
 		   reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_META) {
 		find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
-				       PTR_TO_PACKET_META);
+				       PTR_TO_PACKET_META, false);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
 		   reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_META) {
 		find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
-				       PTR_TO_PACKET_META);
+				       PTR_TO_PACKET_META, false);
 	} else if (is_pointer_value(env, insn->dst_reg)) {
 		verbose(env, "R%d pointer comparison prohibited\n",
 			insn->dst_reg);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 902149f05381..31ee304a5844 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -662,7 +662,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
 	/*
 	 * Do not update time when cgroup is not active
 	 */
-	if (cgrp == event->cgrp)
+	if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
 		__update_cgrp_time(event->cgrp);
 }
 
@@ -8966,6 +8966,14 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
 
 static void free_pmu_context(struct pmu *pmu)
 {
+	/*
+	 * Static contexts such as perf_sw_context have a global lifetime
+	 * and may be shared between different PMUs. Avoid freeing them
+	 * when a single PMU is going away.
+	 */
+	if (pmu->task_ctx_nr > perf_invalid_context)
+		return;
+
 	mutex_lock(&pmus_lock);
 	free_percpu(pmu->pmu_cpu_context);
 	mutex_unlock(&pmus_lock);
diff --git a/kernel/exit.c b/kernel/exit.c
index f2cd53e92147..f6cad39f35df 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1610,6 +1610,9 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 	if (!infop)
 		return err;
 
+	if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
+		return -EFAULT;
+
 	user_access_begin();
 	unsafe_put_user(signo, &infop->si_signo, Efault);
 	unsafe_put_user(0, &infop->si_errno, Efault);
@@ -1735,6 +1738,9 @@ COMPAT_SYSCALL_DEFINE5(waitid,
 	if (!infop)
 		return err;
 
+	if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
+		return -EFAULT;
+
 	user_access_begin();
 	unsafe_put_user(signo, &infop->si_signo, Efault);
 	unsafe_put_user(0, &infop->si_errno, Efault);
diff --git a/kernel/fork.c b/kernel/fork.c
index e702cb9ffbd8..07cc743698d3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -215,6 +215,10 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 		if (!s)
 			continue;
 
+#ifdef CONFIG_DEBUG_KMEMLEAK
+		/* Clear stale pointers from reused stack. */
+		memset(s->addr, 0, THREAD_SIZE);
+#endif
 		tsk->stack_vm_area = s;
 		return s->addr;
 	}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6fc89fd93824..5a2ef92c2782 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -265,8 +265,8 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
 		irq_setup_affinity(desc);
 		break;
 	case IRQ_STARTUP_MANAGED:
+		irq_do_set_affinity(d, aff, false);
 		ret = __irq_startup(desc);
-		irq_set_affinity_locked(d, aff, false);
 		break;
 	case IRQ_STARTUP_ABORT:
 		return 0;
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 638eb9c83d9f..9eb09aef0313 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -18,8 +18,34 @@
 static inline bool irq_needs_fixup(struct irq_data *d)
 {
 	const struct cpumask *m = irq_data_get_effective_affinity_mask(d);
+	unsigned int cpu = smp_processor_id();
 
-	return cpumask_test_cpu(smp_processor_id(), m);
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+	/*
+	 * The cpumask_empty() check is a workaround for interrupt chips,
+	 * which do not implement effective affinity, but the architecture has
+	 * enabled the config switch. Use the general affinity mask instead.
+	 */
+	if (cpumask_empty(m))
+		m = irq_data_get_affinity_mask(d);
+
+	/*
+	 * Sanity check. If the mask is not empty when excluding the outgoing
+	 * CPU then it must contain at least one online CPU. The outgoing CPU
+	 * has been removed from the online mask already.
+	 */
+	if (cpumask_any_but(m, cpu) < nr_cpu_ids &&
+	    cpumask_any_and(m, cpu_online_mask) >= nr_cpu_ids) {
+		/*
+		 * If this happens then there was a missed IRQ fixup at some
+		 * point. Warn about it and enforce fixup.
+		 */
+		pr_warn("Eff. affinity %*pbl of IRQ %u contains only offline CPUs after offlining CPU %u\n",
+			cpumask_pr_args(m), d->irq, cpu);
+		return true;
+	}
+#endif
+	return cpumask_test_cpu(cpu, m);
 }
 
 static bool migrate_one_irq(struct irq_desc *desc)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d00132b5c325..4bff6a10ae8e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -168,6 +168,19 @@ void irq_set_thread_affinity(struct irq_desc *desc)
 		set_bit(IRQTF_AFFINITY, &action->thread_flags);
 }
 
+static void irq_validate_effective_affinity(struct irq_data *data)
+{
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+	const struct cpumask *m = irq_data_get_effective_affinity_mask(data);
+	struct irq_chip *chip = irq_data_get_irq_chip(data);
+
+	if (!cpumask_empty(m))
+		return;
+	pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n",
+		     chip->name, data->irq);
+#endif
+}
+
 int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
 			bool force)
 {
@@ -175,12 +188,16 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
 	struct irq_chip *chip = irq_data_get_irq_chip(data);
 	int ret;
 
+	if (!chip || !chip->irq_set_affinity)
+		return -EINVAL;
+
 	ret = chip->irq_set_affinity(data, mask, force);
 	switch (ret) {
 	case IRQ_SET_MASK_OK:
 	case IRQ_SET_MASK_OK_DONE:
 		cpumask_copy(desc->irq_common_data.affinity, mask);
 	case IRQ_SET_MASK_OK_NOCOPY:
+		irq_validate_effective_affinity(data);
 		irq_set_thread_affinity(desc);
 		ret = 0;
 	}
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index b9628e43c78f..bf8c8fd72589 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -830,6 +830,41 @@ int klp_register_patch(struct klp_patch *patch)
 }
 EXPORT_SYMBOL_GPL(klp_register_patch);
 
+/*
+ * Remove parts of patches that touch a given kernel module. The list of
+ * patches processed might be limited. When limit is NULL, all patches
+ * will be handled.
+ */
+static void klp_cleanup_module_patches_limited(struct module *mod,
+					       struct klp_patch *limit)
+{
+	struct klp_patch *patch;
+	struct klp_object *obj;
+
+	list_for_each_entry(patch, &klp_patches, list) {
+		if (patch == limit)
+			break;
+
+		klp_for_each_object(patch, obj) {
+			if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
+				continue;
+
+			/*
+			 * Only unpatch the module if the patch is enabled or
+			 * is in transition.
+			 */
+			if (patch->enabled || patch == klp_transition_patch) {
+				pr_notice("reverting patch '%s' on unloading module '%s'\n",
+					  patch->mod->name, obj->mod->name);
+				klp_unpatch_object(obj);
+			}
+
+			klp_free_object_loaded(obj);
+			break;
+		}
+	}
+}
+
 int klp_module_coming(struct module *mod)
 {
 	int ret;
@@ -894,7 +929,7 @@ err:
 	pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n",
 		patch->mod->name, obj->mod->name, obj->mod->name);
 	mod->klp_alive = false;
-	klp_free_object_loaded(obj);
+	klp_cleanup_module_patches_limited(mod, patch);
 	mutex_unlock(&klp_mutex);
 
 	return ret;
@@ -902,9 +937,6 @@ err:
 
 void klp_module_going(struct module *mod)
 {
-	struct klp_patch *patch;
-	struct klp_object *obj;
-
 	if (WARN_ON(mod->state != MODULE_STATE_GOING &&
 		    mod->state != MODULE_STATE_COMING))
 		return;
@@ -917,25 +949,7 @@ void klp_module_going(struct module *mod)
 	 */
 	mod->klp_alive = false;
 
-	list_for_each_entry(patch, &klp_patches, list) {
-		klp_for_each_object(patch, obj) {
-			if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
-				continue;
-
-			/*
-			 * Only unpatch the module if the patch is enabled or
-			 * is in transition.
-			 */
-			if (patch->enabled || patch == klp_transition_patch) {
-				pr_notice("reverting patch '%s' on unloading module '%s'\n",
-					  patch->mod->name, obj->mod->name);
-				klp_unpatch_object(obj);
-			}
-
-			klp_free_object_loaded(obj);
-			break;
-		}
-	}
+	klp_cleanup_module_patches_limited(mod, NULL);
 
 	mutex_unlock(&klp_mutex);
 }
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 44c8d0d17170..e36e652d996f 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1873,10 +1873,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 		   struct held_lock *next, int distance, struct stack_trace *trace,
 		   int (*save)(struct stack_trace *trace))
 {
+	struct lock_list *uninitialized_var(target_entry);
 	struct lock_list *entry;
-	int ret;
 	struct lock_list this;
-	struct lock_list *uninitialized_var(target_entry);
+	int ret;
 
 	/*
 	 * Prove that the new <prev> -> <next> dependency would not
@@ -1890,8 +1890,17 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	this.class = hlock_class(next);
 	this.parent = NULL;
 	ret = check_noncircular(&this, hlock_class(prev), &target_entry);
-	if (unlikely(!ret))
+	if (unlikely(!ret)) {
+		if (!trace->entries) {
+			/*
+			 * If @save fails here, the printing might trigger
+			 * a WARN but because of the !nr_entries it should
+			 * not do bad things.
+			 */
+			save(trace);
+		}
 		return print_circular_bug(&this, target_entry, next, prev, trace);
+	}
 	else if (unlikely(ret < 0))
 		return print_bfs_bug(ret);
 
@@ -1938,7 +1947,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 		return print_bfs_bug(ret);
 
 
-	if (save && !save(trace))
+	if (!trace->entries && !save(trace))
 		return 0;
 
 	/*
@@ -1958,20 +1967,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	if (!ret)
 		return 0;
 
-	/*
-	 * Debugging printouts:
-	 */
-	if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
-		graph_unlock();
-		printk("\n new dependency: ");
-		print_lock_name(hlock_class(prev));
-		printk(KERN_CONT " => ");
-		print_lock_name(hlock_class(next));
-		printk(KERN_CONT "\n");
-		dump_stack();
-		if (!graph_lock())
-			return 0;
-	}
 	return 2;
 }
 
@@ -1986,8 +1981,12 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
 {
 	int depth = curr->lockdep_depth;
 	struct held_lock *hlock;
-	struct stack_trace trace;
-	int (*save)(struct stack_trace *trace) = save_trace;
+	struct stack_trace trace = {
+		.nr_entries = 0,
+		.max_entries = 0,
+		.entries = NULL,
+		.skip = 0,
+	};
 
 	/*
 	 * Debugging checks.
@@ -2018,18 +2017,11 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
 		 */
 		if (hlock->read != 2 && hlock->check) {
 			int ret = check_prev_add(curr, hlock, next,
-						 distance, &trace, save);
+						 distance, &trace, save_trace);
 			if (!ret)
 				return 0;
 
 			/*
-			 * Stop saving stack_trace if save_trace() was
-			 * called at least once:
-			 */
-			if (save && ret == 2)
-				save = NULL;
-
-			/*
 			 * Stop after the first non-trylock entry,
 			 * as non-trylock entries have added their
 			 * own direct dependencies already, so this
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 729a8706751d..6d5880089ff6 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -854,7 +854,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
 /**
  * call_srcu() - Queue a callback for invocation after an SRCU grace period
  * @sp: srcu_struct in queue the callback
- * @head: structure to be used for queueing the SRCU callback.
+ * @rhp: structure to be used for queueing the SRCU callback.
  * @func: function to be invoked after the SRCU grace period
  *
  * The callback function will be invoked some time after a full SRCU
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 50d1861f7759..3f943efcf61c 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -85,6 +85,9 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
 }
 
 /**
+ * rcu_sync_enter_start - Force readers onto slow path for multiple updates
+ * @rsp: Pointer to rcu_sync structure to use for synchronization
+ *
  * Must be called after rcu_sync_init() and before first use.
  *
  * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}()
@@ -142,7 +145,7 @@ void rcu_sync_enter(struct rcu_sync *rsp)
 
 /**
  * rcu_sync_func() - Callback function managing reader access to fastpath
- * @rsp: Pointer to rcu_sync structure to use for synchronization
+ * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization
  *
  * This function is passed to one of the call_rcu() functions by
  * rcu_sync_exit(), so that it is invoked after a grace period following the
@@ -158,9 +161,9 @@ void rcu_sync_enter(struct rcu_sync *rsp)
  * rcu_sync_exit(). Otherwise, set all state back to idle so that readers
  * can again use their fastpaths.
  */
-static void rcu_sync_func(struct rcu_head *rcu)
+static void rcu_sync_func(struct rcu_head *rhp)
 {
-	struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head);
+	struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head);
 	unsigned long flags;
 
 	BUG_ON(rsp->gp_state != GP_PASSED);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b0ad62b0e7b8..3e3650e94ae6 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3097,9 +3097,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
  * read-side critical sections have completed. call_rcu_sched() assumes
  * that the read-side critical sections end on enabling of preemption
  * or on voluntary preemption.
- * RCU read-side critical sections are delimited by :
- * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR
- * - anything that disables preemption.
+ * RCU read-side critical sections are delimited by:
+ *
+ * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR
+ * - anything that disables preemption.
  *
  * These may be nested.
  *
@@ -3124,11 +3125,12 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
  * handler. This means that read-side critical sections in process
  * context must not be interrupted by softirqs. This interface is to be
  * used when most of the read-side critical sections are in softirq context.
- * RCU read-side critical sections are delimited by :
- * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context.
- * OR
+ * RCU read-side critical sections are delimited by:
+ *
+ * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context, OR
  * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
- * These may be nested.
+ *
+ * These may be nested.
  *
  * See the description of call_rcu() for more detailed information on
  * memory ordering guarantees.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 70ba32e08a23..d3f3094856fe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5356,91 +5356,62 @@ static int wake_wide(struct task_struct *p)
 	return 1;
 }
 
-struct llc_stats {
-	unsigned long	nr_running;
-	unsigned long	load;
-	unsigned long	capacity;
-	int		has_capacity;
-};
+/*
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
+ *
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
+ *			  will be) idle.
+ *
+ * wake_affine_weight() - considers the weight to reflect the average
+ *			  scheduling latency of the CPUs. This seems to work
+ *			  for the overloaded case.
+ */
 
-static bool get_llc_stats(struct llc_stats *stats, int cpu)
+static bool
+wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
+		 int this_cpu, int prev_cpu, int sync)
 {
-	struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-
-	if (!sds)
-		return false;
+	if (idle_cpu(this_cpu))
+		return true;
 
-	stats->nr_running	= READ_ONCE(sds->nr_running);
-	stats->load		= READ_ONCE(sds->load);
-	stats->capacity		= READ_ONCE(sds->capacity);
-	stats->has_capacity	= stats->nr_running < per_cpu(sd_llc_size, cpu);
+	if (sync && cpu_rq(this_cpu)->nr_running == 1)
+		return true;
 
-	return true;
+	return false;
 }
 
-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- *
- * Since we're running on 'stale' values, we might in fact create an imbalance
- * but recomputing these values is expensive, as that'd mean iteration 2 cache
- * domains worth of CPUs.
- */
 static bool
-wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
+wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
 		   int this_cpu, int prev_cpu, int sync)
 {
-	struct llc_stats prev_stats, this_stats;
 	s64 this_eff_load, prev_eff_load;
 	unsigned long task_load;
 
-	if (!get_llc_stats(&prev_stats, prev_cpu) ||
-	    !get_llc_stats(&this_stats, this_cpu))
-		return false;
+	this_eff_load = target_load(this_cpu, sd->wake_idx);
+	prev_eff_load = source_load(prev_cpu, sd->wake_idx);
 
-	/*
-	 * If sync wakeup then subtract the (maximum possible)
-	 * effect of the currently running task from the load
-	 * of the current LLC.
-	 */
 	if (sync) {
 		unsigned long current_load = task_h_load(current);
 
-		/* in this case load hits 0 and this LLC is considered 'idle' */
-		if (current_load > this_stats.load)
+		if (current_load > this_eff_load)
 			return true;
 
-		this_stats.load -= current_load;
+		this_eff_load -= current_load;
 	}
 
-	/*
-	 * The has_capacity stuff is not SMT aware, but by trying to balance
-	 * the nr_running on both ends we try and fill the domain at equal
-	 * rates, thereby first consuming cores before siblings.
-	 */
-
-	/* if the old cache has capacity, stay there */
-	if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
-		return false;
-
-	/* if this cache has capacity, come here */
-	if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
-		return true;
-
-	/*
-	 * Check to see if we can move the load without causing too much
-	 * imbalance.
-	 */
 	task_load = task_h_load(p);
 
-	this_eff_load = 100;
-	this_eff_load *= prev_stats.capacity;
-
-	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-	prev_eff_load *= this_stats.capacity;
+	this_eff_load += task_load;
+	if (sched_feat(WA_BIAS))
+		this_eff_load *= 100;
+	this_eff_load *= capacity_of(prev_cpu);
 
-	this_eff_load *= this_stats.load + task_load;
-	prev_eff_load *= prev_stats.load - task_load;
+	prev_eff_load -= task_load;
+	if (sched_feat(WA_BIAS))
+		prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
+	prev_eff_load *= capacity_of(this_cpu);
 
 	return this_eff_load <= prev_eff_load;
 }
@@ -5449,22 +5420,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 		       int prev_cpu, int sync)
 {
 	int this_cpu = smp_processor_id();
-	bool affine;
+	bool affine = false;
 
-	/*
-	 * Default to no affine wakeups; wake_affine() should not effect a task
-	 * placement the load-balancer feels inclined to undo. The conservative
-	 * option is therefore to not move tasks when they wake up.
-	 */
-	affine = false;
+	if (sched_feat(WA_IDLE) && !affine)
+		affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
 
-	/*
-	 * If the wakeup is across cache domains, try to evaluate if movement
-	 * makes sense, otherwise rely on select_idle_siblings() to do
-	 * placement inside the cache domain.
-	 */
-	if (!cpus_share_cache(prev_cpu, this_cpu))
-		affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
+	if (sched_feat(WA_WEIGHT) && !affine)
+		affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
 
 	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
 	if (affine) {
@@ -7600,7 +7562,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
-	struct sched_domain_shared *shared = env->sd->shared;
 	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats *local = &sds->local_stat;
@@ -7672,22 +7633,6 @@ next_group:
 		if (env->dst_rq->rd->overload != overload)
 			env->dst_rq->rd->overload = overload;
 	}
-
-	if (!shared)
-		return;
-
-	/*
-	 * Since these are sums over groups they can contain some CPUs
-	 * multiple times for the NUMA domains.
-	 *
-	 * Currently only wake_affine_llc() and find_busiest_group()
-	 * uses these numbers, only the last is affected by this problem.
-	 *
-	 * XXX fix that.
-	 */
-	WRITE_ONCE(shared->nr_running, sds->total_running);
-	WRITE_ONCE(shared->load, sds->total_load);
-	WRITE_ONCE(shared->capacity, sds->total_capacity);
 }
 
 /**
@@ -8098,6 +8043,13 @@ static int should_we_balance(struct lb_env *env)
 	int cpu, balance_cpu = -1;
 
 	/*
+	 * Ensure the balancing environment is consistent; can happen
+	 * when the softirq triggers 'during' hotplug.
+	 */
+	if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
+		return 0;
+
+	/*
 	 * In the newly idle case, we will allow all the cpu's
 	 * to do the newly idle load balance.
 	 */
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index d3fb15555291..319ed0e8a347 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -81,3 +81,6 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
+SCHED_FEAT(WA_IDLE, true)
+SCHED_FEAT(WA_WEIGHT, true)
+SCHED_FEAT(WA_BIAS, true)
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index a92fddc22747..dd7908743dab 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -18,6 +18,7 @@
 #include <linux/membarrier.h>
 #include <linux/tick.h>
 #include <linux/cpumask.h>
+#include <linux/atomic.h>
 
 #include "sched.h"	/* for cpu_rq(). */
 
@@ -26,21 +27,26 @@
  * except MEMBARRIER_CMD_QUERY.
  */
 #define MEMBARRIER_CMD_BITMASK	\
-	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED)
+	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED	\
+	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)
 
 static void ipi_mb(void *info)
 {
 	smp_mb();	/* IPIs should be serializing but paranoid. */
 }
 
-static void membarrier_private_expedited(void)
+static int membarrier_private_expedited(void)
 {
 	int cpu;
 	bool fallback = false;
 	cpumask_var_t tmpmask;
 
+	if (!(atomic_read(&current->mm->membarrier_state)
+			& MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
+		return -EPERM;
+
 	if (num_online_cpus() == 1)
-		return;
+		return 0;
 
 	/*
 	 * Matches memory barriers around rq->curr modification in
@@ -94,6 +100,24 @@ static void membarrier_private_expedited(void)
 	 * rq->curr modification in scheduler.
 	 */
 	smp_mb();	/* exit from system call is not a mb */
+	return 0;
+}
+
+static void membarrier_register_private_expedited(void)
+{
+	struct task_struct *p = current;
+	struct mm_struct *mm = p->mm;
+
+	/*
+	 * We need to consider threads belonging to different thread
+	 * groups, which use the same mm. (CLONE_VM but not
+	 * CLONE_THREAD).
+	 */
+	if (atomic_read(&mm->membarrier_state)
+			& MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)
+		return;
+	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+		  &mm->membarrier_state);
 }
 
 /**
@@ -144,7 +168,9 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
 		synchronize_sched();
 		return 0;
 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
-		membarrier_private_expedited();
+		return membarrier_private_expedited();
+	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
+		membarrier_register_private_expedited();
 		return 0;
 	default:
 		return -EINVAL;
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index bb3a38005b9c..0ae832e13b97 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -473,7 +473,7 @@ static long seccomp_attach_filter(unsigned int flags,
 	return 0;
 }
 
-void __get_seccomp_filter(struct seccomp_filter *filter)
+static void __get_seccomp_filter(struct seccomp_filter *filter)
 {
 	/* Reference count is bounded by the number of total processes. */
 	refcount_inc(&filter->usage);