author    : David S. Miller <davem@davemloft.net>  2017-10-22 08:36:53 -0400
committer : David S. Miller <davem@davemloft.net>  2017-10-22 08:39:14 -0400
commit    : f8ddadc4db6c7b7029b6d0e0d9af24f74ad27ca2
tree      : 0a6432aba336bae42313613f4c891bcfce02bd4e /kernel
parent    : bdd091bab8c631bd2801af838e344fad34566410
parent    : b5ac3beb5a9f0ef0ea64cd85faf94c0dc4de0e42
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
There were quite a few overlapping sets of changes here.
Daniel's bug fix for off-by-ones in the new BPF branch instructions,
along with the added allowances for "data_end > ptr + x" forms
collided with the metadata additions.
Along with those three changes came verifier test cases, which in
their final form I tried to group together properly. If I had just
trimmed GIT's conflict tags as-is, this would have split up the
meta tests unnecessarily.
In the socketmap code, a set of preemption disabling changes
overlapped with the rename of bpf_compute_data_end() to
bpf_compute_data_pointers().
Changes were made to the mv88e6060.c driver's set_addr method,
which got removed in net-next.
The hyperv transport socket layer had a locking change in 'net'
which overlapped with a change of socket state macro usage
in 'net-next'.
Signed-off-by: David S. Miller <davem@davemloft.net>
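To make the "data_end > ptr + x" wording above concrete, here is a minimal, hypothetical XDP snippet (the function and section names are made up; nothing below comes from the patches themselves). Both tests bound the same Ethernet-header access: the first is the long-standing "ptr + len > data_end" form, the second the reversed form whose verifier handling this merge carries.

```c
/* Hypothetical XDP example, illustration only -- not from this merge. */
#include <linux/bpf.h>
#include <linux/if_ether.h>

__attribute__((section("xdp"), used))
int xdp_bounds_example(struct xdp_md *ctx)
{
	void *data     = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;

	/* classic form: pointer plus access length compared against data_end */
	if (data + ETH_HLEN > data_end)
		return XDP_DROP;

	/* reversed form: data_end compared against the pointer; the verifier
	 * changes merged here teach find_good_pkt_pointers() about this shape
	 * and its off-by-one handling for strict vs. non-strict comparisons.
	 */
	if (data_end > data + ETH_HLEN)
		return XDP_PASS;

	return XDP_DROP;
}
```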
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/arraymap.c     |   2
-rw-r--r-- | kernel/bpf/devmap.c       |  10
-rw-r--r-- | kernel/bpf/hashtab.c      |   4
-rw-r--r-- | kernel/bpf/sockmap.c      |  28
-rw-r--r-- | kernel/bpf/verifier.c     |  82
-rw-r--r-- | kernel/events/core.c      |  10
-rw-r--r-- | kernel/exit.c             |   6
-rw-r--r-- | kernel/fork.c             |   4
-rw-r--r-- | kernel/irq/chip.c         |   2
-rw-r--r-- | kernel/irq/cpuhotplug.c   |  28
-rw-r--r-- | kernel/irq/manage.c       |  17
-rw-r--r-- | kernel/livepatch/core.c   |  60
-rw-r--r-- | kernel/locking/lockdep.c  |  48
-rw-r--r-- | kernel/rcu/srcutree.c     |   2
-rw-r--r-- | kernel/rcu/sync.c         |   9
-rw-r--r-- | kernel/rcu/tree.c         |  18
-rw-r--r-- | kernel/sched/fair.c       | 140
-rw-r--r-- | kernel/sched/features.h   |   3
-rw-r--r-- | kernel/sched/membarrier.c |  34
-rw-r--r-- | kernel/seccomp.c          |   2
20 files changed, 308 insertions(+), 201 deletions(-)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 988c04c91e10..7c25426d3cf5 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -102,7 +102,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) | |||
102 | array_size += (u64) attr->max_entries * elem_size * num_possible_cpus(); | 102 | array_size += (u64) attr->max_entries * elem_size * num_possible_cpus(); |
103 | 103 | ||
104 | if (array_size >= U32_MAX - PAGE_SIZE || | 104 | if (array_size >= U32_MAX - PAGE_SIZE || |
105 | elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) { | 105 | bpf_array_alloc_percpu(array)) { |
106 | bpf_map_area_free(array); | 106 | bpf_map_area_free(array); |
107 | return ERR_PTR(-ENOMEM); | 107 | return ERR_PTR(-ENOMEM); |
108 | } | 108 | } |
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index e5d3de7cff2e..ebdef54bf7df 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -72,7 +72,7 @@ static LIST_HEAD(dev_map_list); | |||
72 | 72 | ||
73 | static u64 dev_map_bitmap_size(const union bpf_attr *attr) | 73 | static u64 dev_map_bitmap_size(const union bpf_attr *attr) |
74 | { | 74 | { |
75 | return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); | 75 | return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long); |
76 | } | 76 | } |
77 | 77 | ||
78 | static struct bpf_map *dev_map_alloc(union bpf_attr *attr) | 78 | static struct bpf_map *dev_map_alloc(union bpf_attr *attr) |
@@ -81,6 +81,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) | |||
81 | int err = -EINVAL; | 81 | int err = -EINVAL; |
82 | u64 cost; | 82 | u64 cost; |
83 | 83 | ||
84 | if (!capable(CAP_NET_ADMIN)) | ||
85 | return ERR_PTR(-EPERM); | ||
86 | |||
84 | /* check sanity of attributes */ | 87 | /* check sanity of attributes */ |
85 | if (attr->max_entries == 0 || attr->key_size != 4 || | 88 | if (attr->max_entries == 0 || attr->key_size != 4 || |
86 | attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) | 89 | attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) |
@@ -114,8 +117,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) | |||
114 | err = -ENOMEM; | 117 | err = -ENOMEM; |
115 | 118 | ||
116 | /* A per cpu bitfield with a bit per possible net device */ | 119 | /* A per cpu bitfield with a bit per possible net device */ |
117 | dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr), | 120 | dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr), |
118 | __alignof__(unsigned long)); | 121 | __alignof__(unsigned long), |
122 | GFP_KERNEL | __GFP_NOWARN); | ||
119 | if (!dtab->flush_needed) | 123 | if (!dtab->flush_needed) |
120 | goto free_dtab; | 124 | goto free_dtab; |
121 | 125 | ||
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 919955236e63..e469e05c8e83 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -318,10 +318,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) | |||
318 | */ | 318 | */ |
319 | goto free_htab; | 319 | goto free_htab; |
320 | 320 | ||
321 | if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE) | ||
322 | /* make sure the size for pcpu_alloc() is reasonable */ | ||
323 | goto free_htab; | ||
324 | |||
325 | htab->elem_size = sizeof(struct htab_elem) + | 321 | htab->elem_size = sizeof(struct htab_elem) + |
326 | round_up(htab->map.key_size, 8); | 322 | round_up(htab->map.key_size, 8); |
327 | if (percpu) | 323 | if (percpu) |
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 86ec846f2d5e..eef843c3b419 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -39,6 +39,7 @@ | |||
39 | #include <linux/workqueue.h> | 39 | #include <linux/workqueue.h> |
40 | #include <linux/list.h> | 40 | #include <linux/list.h> |
41 | #include <net/strparser.h> | 41 | #include <net/strparser.h> |
42 | #include <net/tcp.h> | ||
42 | 43 | ||
43 | #define SOCK_CREATE_FLAG_MASK \ | 44 | #define SOCK_CREATE_FLAG_MASK \ |
44 | (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) | 45 | (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) |
@@ -104,9 +105,16 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) | |||
104 | return SK_DROP; | 105 | return SK_DROP; |
105 | 106 | ||
106 | skb_orphan(skb); | 107 | skb_orphan(skb); |
108 | /* We need to ensure that BPF metadata for maps is also cleared | ||
109 | * when we orphan the skb so that we don't have the possibility | ||
110 | * to reference a stale map. | ||
111 | */ | ||
112 | TCP_SKB_CB(skb)->bpf.map = NULL; | ||
107 | skb->sk = psock->sock; | 113 | skb->sk = psock->sock; |
108 | bpf_compute_data_pointers(skb); | 114 | bpf_compute_data_pointers(skb); |
115 | preempt_disable(); | ||
109 | rc = (*prog->bpf_func)(skb, prog->insnsi); | 116 | rc = (*prog->bpf_func)(skb, prog->insnsi); |
117 | preempt_enable(); | ||
110 | skb->sk = NULL; | 118 | skb->sk = NULL; |
111 | 119 | ||
112 | return rc; | 120 | return rc; |
@@ -117,17 +125,10 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) | |||
117 | struct sock *sk; | 125 | struct sock *sk; |
118 | int rc; | 126 | int rc; |
119 | 127 | ||
120 | /* Because we use per cpu values to feed input from sock redirect | ||
121 | * in BPF program to do_sk_redirect_map() call we need to ensure we | ||
122 | * are not preempted. RCU read lock is not sufficient in this case | ||
123 | * with CONFIG_PREEMPT_RCU enabled so we must be explicit here. | ||
124 | */ | ||
125 | preempt_disable(); | ||
126 | rc = smap_verdict_func(psock, skb); | 128 | rc = smap_verdict_func(psock, skb); |
127 | switch (rc) { | 129 | switch (rc) { |
128 | case SK_REDIRECT: | 130 | case SK_REDIRECT: |
129 | sk = do_sk_redirect_map(); | 131 | sk = do_sk_redirect_map(skb); |
130 | preempt_enable(); | ||
131 | if (likely(sk)) { | 132 | if (likely(sk)) { |
132 | struct smap_psock *peer = smap_psock_sk(sk); | 133 | struct smap_psock *peer = smap_psock_sk(sk); |
133 | 134 | ||
@@ -144,8 +145,6 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) | |||
144 | /* Fall through and free skb otherwise */ | 145 | /* Fall through and free skb otherwise */ |
145 | case SK_DROP: | 146 | case SK_DROP: |
146 | default: | 147 | default: |
147 | if (rc != SK_REDIRECT) | ||
148 | preempt_enable(); | ||
149 | kfree_skb(skb); | 148 | kfree_skb(skb); |
150 | } | 149 | } |
151 | } | 150 | } |
@@ -490,6 +489,9 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) | |||
490 | int err = -EINVAL; | 489 | int err = -EINVAL; |
491 | u64 cost; | 490 | u64 cost; |
492 | 491 | ||
492 | if (!capable(CAP_NET_ADMIN)) | ||
493 | return ERR_PTR(-EPERM); | ||
494 | |||
493 | /* check sanity of attributes */ | 495 | /* check sanity of attributes */ |
494 | if (attr->max_entries == 0 || attr->key_size != 4 || | 496 | if (attr->max_entries == 0 || attr->key_size != 4 || |
495 | attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) | 497 | attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) |
@@ -843,6 +845,12 @@ static int sock_map_update_elem(struct bpf_map *map, | |||
843 | return -EINVAL; | 845 | return -EINVAL; |
844 | } | 846 | } |
845 | 847 | ||
848 | if (skops.sk->sk_type != SOCK_STREAM || | ||
849 | skops.sk->sk_protocol != IPPROTO_TCP) { | ||
850 | fput(socket->file); | ||
851 | return -EOPNOTSUPP; | ||
852 | } | ||
853 | |||
846 | err = sock_map_ctx_update_elem(&skops, map, key, flags); | 854 | err = sock_map_ctx_update_elem(&skops, map, key, flags); |
847 | fput(socket->file); | 855 | fput(socket->file); |
848 | return err; | 856 | return err; |
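As a hedged illustration of what the sockmap hunks above mean for user space (a sketch only; none of this code is from the patch set): map creation now requires CAP_NET_ADMIN, and inserting anything other than a TCP stream socket is refused with EOPNOTSUPP.

```c
/*
 * Hypothetical user-space sketch, not part of the kernel patches above.
 * With the sockmap changes in this merge:
 *   - BPF_MAP_CREATE for a sockmap needs CAP_NET_ADMIN (else -EPERM),
 *   - only TCP stream sockets may be inserted (else -EOPNOTSUPP).
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/bpf.h>

static int sys_bpf(int cmd, union bpf_attr *attr)
{
	return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

int main(void)
{
	union bpf_attr attr;
	int map_fd, udp_fd, key = 0;

	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_SOCKMAP;
	attr.key_size    = 4;
	attr.value_size  = 4;		/* value is a socket fd */
	attr.max_entries = 2;

	map_fd = sys_bpf(BPF_MAP_CREATE, &attr);
	if (map_fd < 0) {
		perror("sockmap create (needs CAP_NET_ADMIN)");
		return 1;
	}

	/* A UDP socket is not SOCK_STREAM/IPPROTO_TCP, so the update is refused. */
	udp_fd = socket(AF_INET, SOCK_DGRAM, 0);
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key    = (unsigned long)&key;
	attr.value  = (unsigned long)&udp_fd;
	attr.flags  = BPF_ANY;
	if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr) < 0)
		printf("update rejected as expected: %s\n", strerror(errno));

	return 0;
}
```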
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 545b8c45a578..d906775e12c1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1006,7 +1006,13 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn | |||
1006 | /* ctx accesses must be at a fixed offset, so that we can | 1006 | /* ctx accesses must be at a fixed offset, so that we can |
1007 | * determine what type of data were returned. | 1007 | * determine what type of data were returned. |
1008 | */ | 1008 | */ |
1009 | if (!tnum_is_const(reg->var_off)) { | 1009 | if (reg->off) { |
1010 | verbose(env, | ||
1011 | "dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n", | ||
1012 | regno, reg->off, off - reg->off); | ||
1013 | return -EACCES; | ||
1014 | } | ||
1015 | if (!tnum_is_const(reg->var_off) || reg->var_off.value) { | ||
1010 | char tn_buf[48]; | 1016 | char tn_buf[48]; |
1011 | 1017 | ||
1012 | tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); | 1018 | tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); |
@@ -1015,7 +1021,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn | |||
1015 | tn_buf, off, size); | 1021 | tn_buf, off, size); |
1016 | return -EACCES; | 1022 | return -EACCES; |
1017 | } | 1023 | } |
1018 | off += reg->var_off.value; | ||
1019 | err = check_ctx_access(env, insn_idx, off, size, t, ®_type); | 1024 | err = check_ctx_access(env, insn_idx, off, size, t, ®_type); |
1020 | if (!err && t == BPF_READ && value_regno >= 0) { | 1025 | if (!err && t == BPF_READ && value_regno >= 0) { |
1021 | /* ctx access returns either a scalar, or a | 1026 | /* ctx access returns either a scalar, or a |
@@ -2341,12 +2346,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) | |||
2341 | 2346 | ||
2342 | static void find_good_pkt_pointers(struct bpf_verifier_state *state, | 2347 | static void find_good_pkt_pointers(struct bpf_verifier_state *state, |
2343 | struct bpf_reg_state *dst_reg, | 2348 | struct bpf_reg_state *dst_reg, |
2344 | enum bpf_reg_type type) | 2349 | enum bpf_reg_type type, |
2350 | bool range_right_open) | ||
2345 | { | 2351 | { |
2346 | struct bpf_reg_state *regs = state->regs, *reg; | 2352 | struct bpf_reg_state *regs = state->regs, *reg; |
2353 | u16 new_range; | ||
2347 | int i; | 2354 | int i; |
2348 | 2355 | ||
2349 | if (dst_reg->off < 0) | 2356 | if (dst_reg->off < 0 || |
2357 | (dst_reg->off == 0 && range_right_open)) | ||
2350 | /* This doesn't give us any range */ | 2358 | /* This doesn't give us any range */ |
2351 | return; | 2359 | return; |
2352 | 2360 | ||
@@ -2357,9 +2365,13 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, | |||
2357 | */ | 2365 | */ |
2358 | return; | 2366 | return; |
2359 | 2367 | ||
2360 | /* LLVM can generate four kind of checks: | 2368 | new_range = dst_reg->off; |
2369 | if (range_right_open) | ||
2370 | new_range--; | ||
2371 | |||
2372 | /* Examples for register markings: | ||
2361 | * | 2373 | * |
2362 | * Type 1/2: | 2374 | * pkt_data in dst register: |
2363 | * | 2375 | * |
2364 | * r2 = r3; | 2376 | * r2 = r3; |
2365 | * r2 += 8; | 2377 | * r2 += 8; |
@@ -2376,7 +2388,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, | |||
2376 | * r2=pkt(id=n,off=8,r=0) | 2388 | * r2=pkt(id=n,off=8,r=0) |
2377 | * r3=pkt(id=n,off=0,r=0) | 2389 | * r3=pkt(id=n,off=0,r=0) |
2378 | * | 2390 | * |
2379 | * Type 3/4: | 2391 | * pkt_data in src register: |
2380 | * | 2392 | * |
2381 | * r2 = r3; | 2393 | * r2 = r3; |
2382 | * r2 += 8; | 2394 | * r2 += 8; |
@@ -2394,7 +2406,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, | |||
2394 | * r3=pkt(id=n,off=0,r=0) | 2406 | * r3=pkt(id=n,off=0,r=0) |
2395 | * | 2407 | * |
2396 | * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) | 2408 | * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) |
2397 | * so that range of bytes [r3, r3 + 8) is safe to access. | 2409 | * or r3=pkt(id=n,off=0,r=8-1), so that range of bytes [r3, r3 + 8) |
2410 | * and [r3, r3 + 8-1) respectively is safe to access depending on | ||
2411 | * the check. | ||
2398 | */ | 2412 | */ |
2399 | 2413 | ||
2400 | /* If our ids match, then we must have the same max_value. And we | 2414 | /* If our ids match, then we must have the same max_value. And we |
@@ -2405,14 +2419,14 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, | |||
2405 | for (i = 0; i < MAX_BPF_REG; i++) | 2419 | for (i = 0; i < MAX_BPF_REG; i++) |
2406 | if (regs[i].type == type && regs[i].id == dst_reg->id) | 2420 | if (regs[i].type == type && regs[i].id == dst_reg->id) |
2407 | /* keep the maximum range already checked */ | 2421 | /* keep the maximum range already checked */ |
2408 | regs[i].range = max_t(u16, regs[i].range, dst_reg->off); | 2422 | regs[i].range = max(regs[i].range, new_range); |
2409 | 2423 | ||
2410 | for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { | 2424 | for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { |
2411 | if (state->stack_slot_type[i] != STACK_SPILL) | 2425 | if (state->stack_slot_type[i] != STACK_SPILL) |
2412 | continue; | 2426 | continue; |
2413 | reg = &state->spilled_regs[i / BPF_REG_SIZE]; | 2427 | reg = &state->spilled_regs[i / BPF_REG_SIZE]; |
2414 | if (reg->type == type && reg->id == dst_reg->id) | 2428 | if (reg->type == type && reg->id == dst_reg->id) |
2415 | reg->range = max_t(u16, reg->range, dst_reg->off); | 2429 | reg->range = max_t(u16, reg->range, new_range); |
2416 | } | 2430 | } |
2417 | } | 2431 | } |
2418 | 2432 | ||
@@ -2776,39 +2790,71 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, | |||
2776 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && | 2790 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && |
2777 | dst_reg->type == PTR_TO_PACKET && | 2791 | dst_reg->type == PTR_TO_PACKET && |
2778 | regs[insn->src_reg].type == PTR_TO_PACKET_END) { | 2792 | regs[insn->src_reg].type == PTR_TO_PACKET_END) { |
2779 | find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET); | 2793 | /* pkt_data' > pkt_end */ |
2794 | find_good_pkt_pointers(this_branch, dst_reg, | ||
2795 | PTR_TO_PACKET, false); | ||
2796 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && | ||
2797 | dst_reg->type == PTR_TO_PACKET_END && | ||
2798 | regs[insn->src_reg].type == PTR_TO_PACKET) { | ||
2799 | /* pkt_end > pkt_data' */ | ||
2800 | find_good_pkt_pointers(other_branch, ®s[insn->src_reg], | ||
2801 | PTR_TO_PACKET, true); | ||
2802 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && | ||
2803 | dst_reg->type == PTR_TO_PACKET && | ||
2804 | regs[insn->src_reg].type == PTR_TO_PACKET_END) { | ||
2805 | /* pkt_data' < pkt_end */ | ||
2806 | find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET, | ||
2807 | true); | ||
2780 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && | 2808 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && |
2809 | dst_reg->type == PTR_TO_PACKET_END && | ||
2810 | regs[insn->src_reg].type == PTR_TO_PACKET) { | ||
2811 | /* pkt_end < pkt_data' */ | ||
2812 | find_good_pkt_pointers(this_branch, ®s[insn->src_reg], | ||
2813 | PTR_TO_PACKET, false); | ||
2814 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && | ||
2781 | dst_reg->type == PTR_TO_PACKET && | 2815 | dst_reg->type == PTR_TO_PACKET && |
2782 | regs[insn->src_reg].type == PTR_TO_PACKET_END) { | 2816 | regs[insn->src_reg].type == PTR_TO_PACKET_END) { |
2783 | find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET); | 2817 | /* pkt_data' >= pkt_end */ |
2818 | find_good_pkt_pointers(this_branch, dst_reg, | ||
2819 | PTR_TO_PACKET, true); | ||
2784 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && | 2820 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && |
2785 | dst_reg->type == PTR_TO_PACKET_END && | 2821 | dst_reg->type == PTR_TO_PACKET_END && |
2786 | regs[insn->src_reg].type == PTR_TO_PACKET) { | 2822 | regs[insn->src_reg].type == PTR_TO_PACKET) { |
2823 | /* pkt_end >= pkt_data' */ | ||
2787 | find_good_pkt_pointers(other_branch, ®s[insn->src_reg], | 2824 | find_good_pkt_pointers(other_branch, ®s[insn->src_reg], |
2788 | PTR_TO_PACKET); | 2825 | PTR_TO_PACKET, false); |
2826 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && | ||
2827 | dst_reg->type == PTR_TO_PACKET && | ||
2828 | regs[insn->src_reg].type == PTR_TO_PACKET_END) { | ||
2829 | /* pkt_data' <= pkt_end */ | ||
2830 | find_good_pkt_pointers(other_branch, dst_reg, | ||
2831 | PTR_TO_PACKET, false); | ||
2789 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && | 2832 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && |
2790 | dst_reg->type == PTR_TO_PACKET_END && | 2833 | dst_reg->type == PTR_TO_PACKET_END && |
2791 | regs[insn->src_reg].type == PTR_TO_PACKET) { | 2834 | regs[insn->src_reg].type == PTR_TO_PACKET) { |
2835 | /* pkt_end <= pkt_data' */ | ||
2792 | find_good_pkt_pointers(this_branch, ®s[insn->src_reg], | 2836 | find_good_pkt_pointers(this_branch, ®s[insn->src_reg], |
2793 | PTR_TO_PACKET); | 2837 | PTR_TO_PACKET, true); |
2794 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && | 2838 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && |
2795 | dst_reg->type == PTR_TO_PACKET_META && | 2839 | dst_reg->type == PTR_TO_PACKET_META && |
2796 | reg_is_init_pkt_pointer(®s[insn->src_reg], PTR_TO_PACKET)) { | 2840 | reg_is_init_pkt_pointer(®s[insn->src_reg], PTR_TO_PACKET)) { |
2797 | find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET_META); | 2841 | find_good_pkt_pointers(this_branch, dst_reg, |
2842 | PTR_TO_PACKET_META, false); | ||
2798 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && | 2843 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && |
2799 | dst_reg->type == PTR_TO_PACKET_META && | 2844 | dst_reg->type == PTR_TO_PACKET_META && |
2800 | reg_is_init_pkt_pointer(®s[insn->src_reg], PTR_TO_PACKET)) { | 2845 | reg_is_init_pkt_pointer(®s[insn->src_reg], PTR_TO_PACKET)) { |
2801 | find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET_META); | 2846 | find_good_pkt_pointers(other_branch, dst_reg, |
2847 | PTR_TO_PACKET_META, false); | ||
2802 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && | 2848 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && |
2803 | reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && | 2849 | reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && |
2804 | regs[insn->src_reg].type == PTR_TO_PACKET_META) { | 2850 | regs[insn->src_reg].type == PTR_TO_PACKET_META) { |
2805 | find_good_pkt_pointers(other_branch, ®s[insn->src_reg], | 2851 | find_good_pkt_pointers(other_branch, ®s[insn->src_reg], |
2806 | PTR_TO_PACKET_META); | 2852 | PTR_TO_PACKET_META, false); |
2807 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && | 2853 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && |
2808 | reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && | 2854 | reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && |
2809 | regs[insn->src_reg].type == PTR_TO_PACKET_META) { | 2855 | regs[insn->src_reg].type == PTR_TO_PACKET_META) { |
2810 | find_good_pkt_pointers(this_branch, ®s[insn->src_reg], | 2856 | find_good_pkt_pointers(this_branch, ®s[insn->src_reg], |
2811 | PTR_TO_PACKET_META); | 2857 | PTR_TO_PACKET_META, false); |
2812 | } else if (is_pointer_value(env, insn->dst_reg)) { | 2858 | } else if (is_pointer_value(env, insn->dst_reg)) { |
2813 | verbose(env, "R%d pointer comparison prohibited\n", | 2859 | verbose(env, "R%d pointer comparison prohibited\n", |
2814 | insn->dst_reg); | 2860 | insn->dst_reg); |
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 902149f05381..31ee304a5844 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -662,7 +662,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) | |||
662 | /* | 662 | /* |
663 | * Do not update time when cgroup is not active | 663 | * Do not update time when cgroup is not active |
664 | */ | 664 | */ |
665 | if (cgrp == event->cgrp) | 665 | if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) |
666 | __update_cgrp_time(event->cgrp); | 666 | __update_cgrp_time(event->cgrp); |
667 | } | 667 | } |
668 | 668 | ||
@@ -8966,6 +8966,14 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn) | |||
8966 | 8966 | ||
8967 | static void free_pmu_context(struct pmu *pmu) | 8967 | static void free_pmu_context(struct pmu *pmu) |
8968 | { | 8968 | { |
8969 | /* | ||
8970 | * Static contexts such as perf_sw_context have a global lifetime | ||
8971 | * and may be shared between different PMUs. Avoid freeing them | ||
8972 | * when a single PMU is going away. | ||
8973 | */ | ||
8974 | if (pmu->task_ctx_nr > perf_invalid_context) | ||
8975 | return; | ||
8976 | |||
8969 | mutex_lock(&pmus_lock); | 8977 | mutex_lock(&pmus_lock); |
8970 | free_percpu(pmu->pmu_cpu_context); | 8978 | free_percpu(pmu->pmu_cpu_context); |
8971 | mutex_unlock(&pmus_lock); | 8979 | mutex_unlock(&pmus_lock); |
diff --git a/kernel/exit.c b/kernel/exit.c
index f2cd53e92147..f6cad39f35df 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1610,6 +1610,9 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, | |||
1610 | if (!infop) | 1610 | if (!infop) |
1611 | return err; | 1611 | return err; |
1612 | 1612 | ||
1613 | if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop))) | ||
1614 | return -EFAULT; | ||
1615 | |||
1613 | user_access_begin(); | 1616 | user_access_begin(); |
1614 | unsafe_put_user(signo, &infop->si_signo, Efault); | 1617 | unsafe_put_user(signo, &infop->si_signo, Efault); |
1615 | unsafe_put_user(0, &infop->si_errno, Efault); | 1618 | unsafe_put_user(0, &infop->si_errno, Efault); |
@@ -1735,6 +1738,9 @@ COMPAT_SYSCALL_DEFINE5(waitid, | |||
1735 | if (!infop) | 1738 | if (!infop) |
1736 | return err; | 1739 | return err; |
1737 | 1740 | ||
1741 | if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop))) | ||
1742 | return -EFAULT; | ||
1743 | |||
1738 | user_access_begin(); | 1744 | user_access_begin(); |
1739 | unsafe_put_user(signo, &infop->si_signo, Efault); | 1745 | unsafe_put_user(signo, &infop->si_signo, Efault); |
1740 | unsafe_put_user(0, &infop->si_errno, Efault); | 1746 | unsafe_put_user(0, &infop->si_errno, Efault); |
diff --git a/kernel/fork.c b/kernel/fork.c
index e702cb9ffbd8..07cc743698d3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -215,6 +215,10 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) | |||
215 | if (!s) | 215 | if (!s) |
216 | continue; | 216 | continue; |
217 | 217 | ||
218 | #ifdef CONFIG_DEBUG_KMEMLEAK | ||
219 | /* Clear stale pointers from reused stack. */ | ||
220 | memset(s->addr, 0, THREAD_SIZE); | ||
221 | #endif | ||
218 | tsk->stack_vm_area = s; | 222 | tsk->stack_vm_area = s; |
219 | return s->addr; | 223 | return s->addr; |
220 | } | 224 | } |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6fc89fd93824..5a2ef92c2782 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -265,8 +265,8 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force) | |||
265 | irq_setup_affinity(desc); | 265 | irq_setup_affinity(desc); |
266 | break; | 266 | break; |
267 | case IRQ_STARTUP_MANAGED: | 267 | case IRQ_STARTUP_MANAGED: |
268 | irq_do_set_affinity(d, aff, false); | ||
268 | ret = __irq_startup(desc); | 269 | ret = __irq_startup(desc); |
269 | irq_set_affinity_locked(d, aff, false); | ||
270 | break; | 270 | break; |
271 | case IRQ_STARTUP_ABORT: | 271 | case IRQ_STARTUP_ABORT: |
272 | return 0; | 272 | return 0; |
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 638eb9c83d9f..9eb09aef0313 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -18,8 +18,34 @@ | |||
18 | static inline bool irq_needs_fixup(struct irq_data *d) | 18 | static inline bool irq_needs_fixup(struct irq_data *d) |
19 | { | 19 | { |
20 | const struct cpumask *m = irq_data_get_effective_affinity_mask(d); | 20 | const struct cpumask *m = irq_data_get_effective_affinity_mask(d); |
21 | unsigned int cpu = smp_processor_id(); | ||
21 | 22 | ||
22 | return cpumask_test_cpu(smp_processor_id(), m); | 23 | #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK |
24 | /* | ||
25 | * The cpumask_empty() check is a workaround for interrupt chips, | ||
26 | * which do not implement effective affinity, but the architecture has | ||
27 | * enabled the config switch. Use the general affinity mask instead. | ||
28 | */ | ||
29 | if (cpumask_empty(m)) | ||
30 | m = irq_data_get_affinity_mask(d); | ||
31 | |||
32 | /* | ||
33 | * Sanity check. If the mask is not empty when excluding the outgoing | ||
34 | * CPU then it must contain at least one online CPU. The outgoing CPU | ||
35 | * has been removed from the online mask already. | ||
36 | */ | ||
37 | if (cpumask_any_but(m, cpu) < nr_cpu_ids && | ||
38 | cpumask_any_and(m, cpu_online_mask) >= nr_cpu_ids) { | ||
39 | /* | ||
40 | * If this happens then there was a missed IRQ fixup at some | ||
41 | * point. Warn about it and enforce fixup. | ||
42 | */ | ||
43 | pr_warn("Eff. affinity %*pbl of IRQ %u contains only offline CPUs after offlining CPU %u\n", | ||
44 | cpumask_pr_args(m), d->irq, cpu); | ||
45 | return true; | ||
46 | } | ||
47 | #endif | ||
48 | return cpumask_test_cpu(cpu, m); | ||
23 | } | 49 | } |
24 | 50 | ||
25 | static bool migrate_one_irq(struct irq_desc *desc) | 51 | static bool migrate_one_irq(struct irq_desc *desc) |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d00132b5c325..4bff6a10ae8e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -168,6 +168,19 @@ void irq_set_thread_affinity(struct irq_desc *desc) | |||
168 | set_bit(IRQTF_AFFINITY, &action->thread_flags); | 168 | set_bit(IRQTF_AFFINITY, &action->thread_flags); |
169 | } | 169 | } |
170 | 170 | ||
171 | static void irq_validate_effective_affinity(struct irq_data *data) | ||
172 | { | ||
173 | #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK | ||
174 | const struct cpumask *m = irq_data_get_effective_affinity_mask(data); | ||
175 | struct irq_chip *chip = irq_data_get_irq_chip(data); | ||
176 | |||
177 | if (!cpumask_empty(m)) | ||
178 | return; | ||
179 | pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n", | ||
180 | chip->name, data->irq); | ||
181 | #endif | ||
182 | } | ||
183 | |||
171 | int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, | 184 | int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, |
172 | bool force) | 185 | bool force) |
173 | { | 186 | { |
@@ -175,12 +188,16 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, | |||
175 | struct irq_chip *chip = irq_data_get_irq_chip(data); | 188 | struct irq_chip *chip = irq_data_get_irq_chip(data); |
176 | int ret; | 189 | int ret; |
177 | 190 | ||
191 | if (!chip || !chip->irq_set_affinity) | ||
192 | return -EINVAL; | ||
193 | |||
178 | ret = chip->irq_set_affinity(data, mask, force); | 194 | ret = chip->irq_set_affinity(data, mask, force); |
179 | switch (ret) { | 195 | switch (ret) { |
180 | case IRQ_SET_MASK_OK: | 196 | case IRQ_SET_MASK_OK: |
181 | case IRQ_SET_MASK_OK_DONE: | 197 | case IRQ_SET_MASK_OK_DONE: |
182 | cpumask_copy(desc->irq_common_data.affinity, mask); | 198 | cpumask_copy(desc->irq_common_data.affinity, mask); |
183 | case IRQ_SET_MASK_OK_NOCOPY: | 199 | case IRQ_SET_MASK_OK_NOCOPY: |
200 | irq_validate_effective_affinity(data); | ||
184 | irq_set_thread_affinity(desc); | 201 | irq_set_thread_affinity(desc); |
185 | ret = 0; | 202 | ret = 0; |
186 | } | 203 | } |
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index b9628e43c78f..bf8c8fd72589 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -830,6 +830,41 @@ int klp_register_patch(struct klp_patch *patch) | |||
830 | } | 830 | } |
831 | EXPORT_SYMBOL_GPL(klp_register_patch); | 831 | EXPORT_SYMBOL_GPL(klp_register_patch); |
832 | 832 | ||
833 | /* | ||
834 | * Remove parts of patches that touch a given kernel module. The list of | ||
835 | * patches processed might be limited. When limit is NULL, all patches | ||
836 | * will be handled. | ||
837 | */ | ||
838 | static void klp_cleanup_module_patches_limited(struct module *mod, | ||
839 | struct klp_patch *limit) | ||
840 | { | ||
841 | struct klp_patch *patch; | ||
842 | struct klp_object *obj; | ||
843 | |||
844 | list_for_each_entry(patch, &klp_patches, list) { | ||
845 | if (patch == limit) | ||
846 | break; | ||
847 | |||
848 | klp_for_each_object(patch, obj) { | ||
849 | if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) | ||
850 | continue; | ||
851 | |||
852 | /* | ||
853 | * Only unpatch the module if the patch is enabled or | ||
854 | * is in transition. | ||
855 | */ | ||
856 | if (patch->enabled || patch == klp_transition_patch) { | ||
857 | pr_notice("reverting patch '%s' on unloading module '%s'\n", | ||
858 | patch->mod->name, obj->mod->name); | ||
859 | klp_unpatch_object(obj); | ||
860 | } | ||
861 | |||
862 | klp_free_object_loaded(obj); | ||
863 | break; | ||
864 | } | ||
865 | } | ||
866 | } | ||
867 | |||
833 | int klp_module_coming(struct module *mod) | 868 | int klp_module_coming(struct module *mod) |
834 | { | 869 | { |
835 | int ret; | 870 | int ret; |
@@ -894,7 +929,7 @@ err: | |||
894 | pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n", | 929 | pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n", |
895 | patch->mod->name, obj->mod->name, obj->mod->name); | 930 | patch->mod->name, obj->mod->name, obj->mod->name); |
896 | mod->klp_alive = false; | 931 | mod->klp_alive = false; |
897 | klp_free_object_loaded(obj); | 932 | klp_cleanup_module_patches_limited(mod, patch); |
898 | mutex_unlock(&klp_mutex); | 933 | mutex_unlock(&klp_mutex); |
899 | 934 | ||
900 | return ret; | 935 | return ret; |
@@ -902,9 +937,6 @@ err: | |||
902 | 937 | ||
903 | void klp_module_going(struct module *mod) | 938 | void klp_module_going(struct module *mod) |
904 | { | 939 | { |
905 | struct klp_patch *patch; | ||
906 | struct klp_object *obj; | ||
907 | |||
908 | if (WARN_ON(mod->state != MODULE_STATE_GOING && | 940 | if (WARN_ON(mod->state != MODULE_STATE_GOING && |
909 | mod->state != MODULE_STATE_COMING)) | 941 | mod->state != MODULE_STATE_COMING)) |
910 | return; | 942 | return; |
@@ -917,25 +949,7 @@ void klp_module_going(struct module *mod) | |||
917 | */ | 949 | */ |
918 | mod->klp_alive = false; | 950 | mod->klp_alive = false; |
919 | 951 | ||
920 | list_for_each_entry(patch, &klp_patches, list) { | 952 | klp_cleanup_module_patches_limited(mod, NULL); |
921 | klp_for_each_object(patch, obj) { | ||
922 | if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) | ||
923 | continue; | ||
924 | |||
925 | /* | ||
926 | * Only unpatch the module if the patch is enabled or | ||
927 | * is in transition. | ||
928 | */ | ||
929 | if (patch->enabled || patch == klp_transition_patch) { | ||
930 | pr_notice("reverting patch '%s' on unloading module '%s'\n", | ||
931 | patch->mod->name, obj->mod->name); | ||
932 | klp_unpatch_object(obj); | ||
933 | } | ||
934 | |||
935 | klp_free_object_loaded(obj); | ||
936 | break; | ||
937 | } | ||
938 | } | ||
939 | 953 | ||
940 | mutex_unlock(&klp_mutex); | 954 | mutex_unlock(&klp_mutex); |
941 | } | 955 | } |
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 44c8d0d17170..e36e652d996f 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1873,10 +1873,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, | |||
1873 | struct held_lock *next, int distance, struct stack_trace *trace, | 1873 | struct held_lock *next, int distance, struct stack_trace *trace, |
1874 | int (*save)(struct stack_trace *trace)) | 1874 | int (*save)(struct stack_trace *trace)) |
1875 | { | 1875 | { |
1876 | struct lock_list *uninitialized_var(target_entry); | ||
1876 | struct lock_list *entry; | 1877 | struct lock_list *entry; |
1877 | int ret; | ||
1878 | struct lock_list this; | 1878 | struct lock_list this; |
1879 | struct lock_list *uninitialized_var(target_entry); | 1879 | int ret; |
1880 | 1880 | ||
1881 | /* | 1881 | /* |
1882 | * Prove that the new <prev> -> <next> dependency would not | 1882 | * Prove that the new <prev> -> <next> dependency would not |
@@ -1890,8 +1890,17 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, | |||
1890 | this.class = hlock_class(next); | 1890 | this.class = hlock_class(next); |
1891 | this.parent = NULL; | 1891 | this.parent = NULL; |
1892 | ret = check_noncircular(&this, hlock_class(prev), &target_entry); | 1892 | ret = check_noncircular(&this, hlock_class(prev), &target_entry); |
1893 | if (unlikely(!ret)) | 1893 | if (unlikely(!ret)) { |
1894 | if (!trace->entries) { | ||
1895 | /* | ||
1896 | * If @save fails here, the printing might trigger | ||
1897 | * a WARN but because of the !nr_entries it should | ||
1898 | * not do bad things. | ||
1899 | */ | ||
1900 | save(trace); | ||
1901 | } | ||
1894 | return print_circular_bug(&this, target_entry, next, prev, trace); | 1902 | return print_circular_bug(&this, target_entry, next, prev, trace); |
1903 | } | ||
1895 | else if (unlikely(ret < 0)) | 1904 | else if (unlikely(ret < 0)) |
1896 | return print_bfs_bug(ret); | 1905 | return print_bfs_bug(ret); |
1897 | 1906 | ||
@@ -1938,7 +1947,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, | |||
1938 | return print_bfs_bug(ret); | 1947 | return print_bfs_bug(ret); |
1939 | 1948 | ||
1940 | 1949 | ||
1941 | if (save && !save(trace)) | 1950 | if (!trace->entries && !save(trace)) |
1942 | return 0; | 1951 | return 0; |
1943 | 1952 | ||
1944 | /* | 1953 | /* |
@@ -1958,20 +1967,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, | |||
1958 | if (!ret) | 1967 | if (!ret) |
1959 | return 0; | 1968 | return 0; |
1960 | 1969 | ||
1961 | /* | ||
1962 | * Debugging printouts: | ||
1963 | */ | ||
1964 | if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { | ||
1965 | graph_unlock(); | ||
1966 | printk("\n new dependency: "); | ||
1967 | print_lock_name(hlock_class(prev)); | ||
1968 | printk(KERN_CONT " => "); | ||
1969 | print_lock_name(hlock_class(next)); | ||
1970 | printk(KERN_CONT "\n"); | ||
1971 | dump_stack(); | ||
1972 | if (!graph_lock()) | ||
1973 | return 0; | ||
1974 | } | ||
1975 | return 2; | 1970 | return 2; |
1976 | } | 1971 | } |
1977 | 1972 | ||
@@ -1986,8 +1981,12 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) | |||
1986 | { | 1981 | { |
1987 | int depth = curr->lockdep_depth; | 1982 | int depth = curr->lockdep_depth; |
1988 | struct held_lock *hlock; | 1983 | struct held_lock *hlock; |
1989 | struct stack_trace trace; | 1984 | struct stack_trace trace = { |
1990 | int (*save)(struct stack_trace *trace) = save_trace; | 1985 | .nr_entries = 0, |
1986 | .max_entries = 0, | ||
1987 | .entries = NULL, | ||
1988 | .skip = 0, | ||
1989 | }; | ||
1991 | 1990 | ||
1992 | /* | 1991 | /* |
1993 | * Debugging checks. | 1992 | * Debugging checks. |
@@ -2018,18 +2017,11 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) | |||
2018 | */ | 2017 | */ |
2019 | if (hlock->read != 2 && hlock->check) { | 2018 | if (hlock->read != 2 && hlock->check) { |
2020 | int ret = check_prev_add(curr, hlock, next, | 2019 | int ret = check_prev_add(curr, hlock, next, |
2021 | distance, &trace, save); | 2020 | distance, &trace, save_trace); |
2022 | if (!ret) | 2021 | if (!ret) |
2023 | return 0; | 2022 | return 0; |
2024 | 2023 | ||
2025 | /* | 2024 | /* |
2026 | * Stop saving stack_trace if save_trace() was | ||
2027 | * called at least once: | ||
2028 | */ | ||
2029 | if (save && ret == 2) | ||
2030 | save = NULL; | ||
2031 | |||
2032 | /* | ||
2033 | * Stop after the first non-trylock entry, | 2025 | * Stop after the first non-trylock entry, |
2034 | * as non-trylock entries have added their | 2026 | * as non-trylock entries have added their |
2035 | * own direct dependencies already, so this | 2027 | * own direct dependencies already, so this |
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 729a8706751d..6d5880089ff6 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -854,7 +854,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, | |||
854 | /** | 854 | /** |
855 | * call_srcu() - Queue a callback for invocation after an SRCU grace period | 855 | * call_srcu() - Queue a callback for invocation after an SRCU grace period |
856 | * @sp: srcu_struct in queue the callback | 856 | * @sp: srcu_struct in queue the callback |
857 | * @head: structure to be used for queueing the SRCU callback. | 857 | * @rhp: structure to be used for queueing the SRCU callback. |
858 | * @func: function to be invoked after the SRCU grace period | 858 | * @func: function to be invoked after the SRCU grace period |
859 | * | 859 | * |
860 | * The callback function will be invoked some time after a full SRCU | 860 | * The callback function will be invoked some time after a full SRCU |
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 50d1861f7759..3f943efcf61c 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -85,6 +85,9 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) | |||
85 | } | 85 | } |
86 | 86 | ||
87 | /** | 87 | /** |
88 | * rcu_sync_enter_start - Force readers onto slow path for multiple updates | ||
89 | * @rsp: Pointer to rcu_sync structure to use for synchronization | ||
90 | * | ||
88 | * Must be called after rcu_sync_init() and before first use. | 91 | * Must be called after rcu_sync_init() and before first use. |
89 | * | 92 | * |
90 | * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() | 93 | * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() |
@@ -142,7 +145,7 @@ void rcu_sync_enter(struct rcu_sync *rsp) | |||
142 | 145 | ||
143 | /** | 146 | /** |
144 | * rcu_sync_func() - Callback function managing reader access to fastpath | 147 | * rcu_sync_func() - Callback function managing reader access to fastpath |
145 | * @rsp: Pointer to rcu_sync structure to use for synchronization | 148 | * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization |
146 | * | 149 | * |
147 | * This function is passed to one of the call_rcu() functions by | 150 | * This function is passed to one of the call_rcu() functions by |
148 | * rcu_sync_exit(), so that it is invoked after a grace period following the | 151 | * rcu_sync_exit(), so that it is invoked after a grace period following the |
@@ -158,9 +161,9 @@ void rcu_sync_enter(struct rcu_sync *rsp) | |||
158 | * rcu_sync_exit(). Otherwise, set all state back to idle so that readers | 161 | * rcu_sync_exit(). Otherwise, set all state back to idle so that readers |
159 | * can again use their fastpaths. | 162 | * can again use their fastpaths. |
160 | */ | 163 | */ |
161 | static void rcu_sync_func(struct rcu_head *rcu) | 164 | static void rcu_sync_func(struct rcu_head *rhp) |
162 | { | 165 | { |
163 | struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head); | 166 | struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head); |
164 | unsigned long flags; | 167 | unsigned long flags; |
165 | 168 | ||
166 | BUG_ON(rsp->gp_state != GP_PASSED); | 169 | BUG_ON(rsp->gp_state != GP_PASSED); |
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b0ad62b0e7b8..3e3650e94ae6 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3097,9 +3097,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, | |||
3097 | * read-side critical sections have completed. call_rcu_sched() assumes | 3097 | * read-side critical sections have completed. call_rcu_sched() assumes |
3098 | * that the read-side critical sections end on enabling of preemption | 3098 | * that the read-side critical sections end on enabling of preemption |
3099 | * or on voluntary preemption. | 3099 | * or on voluntary preemption. |
3100 | * RCU read-side critical sections are delimited by : | 3100 | * RCU read-side critical sections are delimited by: |
3101 | * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR | 3101 | * |
3102 | * - anything that disables preemption. | 3102 | * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR |
3103 | * - anything that disables preemption. | ||
3103 | * | 3104 | * |
3104 | * These may be nested. | 3105 | * These may be nested. |
3105 | * | 3106 | * |
@@ -3124,11 +3125,12 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); | |||
3124 | * handler. This means that read-side critical sections in process | 3125 | * handler. This means that read-side critical sections in process |
3125 | * context must not be interrupted by softirqs. This interface is to be | 3126 | * context must not be interrupted by softirqs. This interface is to be |
3126 | * used when most of the read-side critical sections are in softirq context. | 3127 | * used when most of the read-side critical sections are in softirq context. |
3127 | * RCU read-side critical sections are delimited by : | 3128 | * RCU read-side critical sections are delimited by: |
3128 | * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context. | 3129 | * |
3129 | * OR | 3130 | * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context, OR |
3130 | * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context. | 3131 | * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context. |
3131 | * These may be nested. | 3132 | * |
3133 | * These may be nested. | ||
3132 | * | 3134 | * |
3133 | * See the description of call_rcu() for more detailed information on | 3135 | * See the description of call_rcu() for more detailed information on |
3134 | * memory ordering guarantees. | 3136 | * memory ordering guarantees. |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 70ba32e08a23..d3f3094856fe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5356,91 +5356,62 @@ static int wake_wide(struct task_struct *p) | |||
5356 | return 1; | 5356 | return 1; |
5357 | } | 5357 | } |
5358 | 5358 | ||
5359 | struct llc_stats { | 5359 | /* |
5360 | unsigned long nr_running; | 5360 | * The purpose of wake_affine() is to quickly determine on which CPU we can run |
5361 | unsigned long load; | 5361 | * soonest. For the purpose of speed we only consider the waking and previous |
5362 | unsigned long capacity; | 5362 | * CPU. |
5363 | int has_capacity; | 5363 | * |
5364 | }; | 5364 | * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or |
5365 | * will be) idle. | ||
5366 | * | ||
5367 | * wake_affine_weight() - considers the weight to reflect the average | ||
5368 | * scheduling latency of the CPUs. This seems to work | ||
5369 | * for the overloaded case. | ||
5370 | */ | ||
5365 | 5371 | ||
5366 | static bool get_llc_stats(struct llc_stats *stats, int cpu) | 5372 | static bool |
5373 | wake_affine_idle(struct sched_domain *sd, struct task_struct *p, | ||
5374 | int this_cpu, int prev_cpu, int sync) | ||
5367 | { | 5375 | { |
5368 | struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); | 5376 | if (idle_cpu(this_cpu)) |
5369 | 5377 | return true; | |
5370 | if (!sds) | ||
5371 | return false; | ||
5372 | 5378 | ||
5373 | stats->nr_running = READ_ONCE(sds->nr_running); | 5379 | if (sync && cpu_rq(this_cpu)->nr_running == 1) |
5374 | stats->load = READ_ONCE(sds->load); | 5380 | return true; |
5375 | stats->capacity = READ_ONCE(sds->capacity); | ||
5376 | stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu); | ||
5377 | 5381 | ||
5378 | return true; | 5382 | return false; |
5379 | } | 5383 | } |
5380 | 5384 | ||
5381 | /* | ||
5382 | * Can a task be moved from prev_cpu to this_cpu without causing a load | ||
5383 | * imbalance that would trigger the load balancer? | ||
5384 | * | ||
5385 | * Since we're running on 'stale' values, we might in fact create an imbalance | ||
5386 | * but recomputing these values is expensive, as that'd mean iteration 2 cache | ||
5387 | * domains worth of CPUs. | ||
5388 | */ | ||
5389 | static bool | 5385 | static bool |
5390 | wake_affine_llc(struct sched_domain *sd, struct task_struct *p, | 5386 | wake_affine_weight(struct sched_domain *sd, struct task_struct *p, |
5391 | int this_cpu, int prev_cpu, int sync) | 5387 | int this_cpu, int prev_cpu, int sync) |
5392 | { | 5388 | { |
5393 | struct llc_stats prev_stats, this_stats; | ||
5394 | s64 this_eff_load, prev_eff_load; | 5389 | s64 this_eff_load, prev_eff_load; |
5395 | unsigned long task_load; | 5390 | unsigned long task_load; |
5396 | 5391 | ||
5397 | if (!get_llc_stats(&prev_stats, prev_cpu) || | 5392 | this_eff_load = target_load(this_cpu, sd->wake_idx); |
5398 | !get_llc_stats(&this_stats, this_cpu)) | 5393 | prev_eff_load = source_load(prev_cpu, sd->wake_idx); |
5399 | return false; | ||
5400 | 5394 | ||
5401 | /* | ||
5402 | * If sync wakeup then subtract the (maximum possible) | ||
5403 | * effect of the currently running task from the load | ||
5404 | * of the current LLC. | ||
5405 | */ | ||
5406 | if (sync) { | 5395 | if (sync) { |
5407 | unsigned long current_load = task_h_load(current); | 5396 | unsigned long current_load = task_h_load(current); |
5408 | 5397 | ||
5409 | /* in this case load hits 0 and this LLC is considered 'idle' */ | 5398 | if (current_load > this_eff_load) |
5410 | if (current_load > this_stats.load) | ||
5411 | return true; | 5399 | return true; |
5412 | 5400 | ||
5413 | this_stats.load -= current_load; | 5401 | this_eff_load -= current_load; |
5414 | } | 5402 | } |
5415 | 5403 | ||
5416 | /* | ||
5417 | * The has_capacity stuff is not SMT aware, but by trying to balance | ||
5418 | * the nr_running on both ends we try and fill the domain at equal | ||
5419 | * rates, thereby first consuming cores before siblings. | ||
5420 | */ | ||
5421 | |||
5422 | /* if the old cache has capacity, stay there */ | ||
5423 | if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1) | ||
5424 | return false; | ||
5425 | |||
5426 | /* if this cache has capacity, come here */ | ||
5427 | if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running) | ||
5428 | return true; | ||
5429 | |||
5430 | /* | ||
5431 | * Check to see if we can move the load without causing too much | ||
5432 | * imbalance. | ||
5433 | */ | ||
5434 | task_load = task_h_load(p); | 5404 | task_load = task_h_load(p); |
5435 | 5405 | ||
5436 | this_eff_load = 100; | 5406 | this_eff_load += task_load; |
5437 | this_eff_load *= prev_stats.capacity; | 5407 | if (sched_feat(WA_BIAS)) |
5438 | 5408 | this_eff_load *= 100; | |
5439 | prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; | 5409 | this_eff_load *= capacity_of(prev_cpu); |
5440 | prev_eff_load *= this_stats.capacity; | ||
5441 | 5410 | ||
5442 | this_eff_load *= this_stats.load + task_load; | 5411 | prev_eff_load -= task_load; |
5443 | prev_eff_load *= prev_stats.load - task_load; | 5412 | if (sched_feat(WA_BIAS)) |
5413 | prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; | ||
5414 | prev_eff_load *= capacity_of(this_cpu); | ||
5444 | 5415 | ||
5445 | return this_eff_load <= prev_eff_load; | 5416 | return this_eff_load <= prev_eff_load; |
5446 | } | 5417 | } |
@@ -5449,22 +5420,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, | |||
5449 | int prev_cpu, int sync) | 5420 | int prev_cpu, int sync) |
5450 | { | 5421 | { |
5451 | int this_cpu = smp_processor_id(); | 5422 | int this_cpu = smp_processor_id(); |
5452 | bool affine; | 5423 | bool affine = false; |
5453 | 5424 | ||
5454 | /* | 5425 | if (sched_feat(WA_IDLE) && !affine) |
5455 | * Default to no affine wakeups; wake_affine() should not effect a task | 5426 | affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync); |
5456 | * placement the load-balancer feels inclined to undo. The conservative | ||
5457 | * option is therefore to not move tasks when they wake up. | ||
5458 | */ | ||
5459 | affine = false; | ||
5460 | 5427 | ||
5461 | /* | 5428 | if (sched_feat(WA_WEIGHT) && !affine) |
5462 | * If the wakeup is across cache domains, try to evaluate if movement | 5429 | affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); |
5463 | * makes sense, otherwise rely on select_idle_siblings() to do | ||
5464 | * placement inside the cache domain. | ||
5465 | */ | ||
5466 | if (!cpus_share_cache(prev_cpu, this_cpu)) | ||
5467 | affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync); | ||
5468 | 5430 | ||
5469 | schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); | 5431 | schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); |
5470 | if (affine) { | 5432 | if (affine) { |
@@ -7600,7 +7562,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) | |||
7600 | */ | 7562 | */ |
7601 | static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) | 7563 | static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) |
7602 | { | 7564 | { |
7603 | struct sched_domain_shared *shared = env->sd->shared; | ||
7604 | struct sched_domain *child = env->sd->child; | 7565 | struct sched_domain *child = env->sd->child; |
7605 | struct sched_group *sg = env->sd->groups; | 7566 | struct sched_group *sg = env->sd->groups; |
7606 | struct sg_lb_stats *local = &sds->local_stat; | 7567 | struct sg_lb_stats *local = &sds->local_stat; |
@@ -7672,22 +7633,6 @@ next_group: | |||
7672 | if (env->dst_rq->rd->overload != overload) | 7633 | if (env->dst_rq->rd->overload != overload) |
7673 | env->dst_rq->rd->overload = overload; | 7634 | env->dst_rq->rd->overload = overload; |
7674 | } | 7635 | } |
7675 | |||
7676 | if (!shared) | ||
7677 | return; | ||
7678 | |||
7679 | /* | ||
7680 | * Since these are sums over groups they can contain some CPUs | ||
7681 | * multiple times for the NUMA domains. | ||
7682 | * | ||
7683 | * Currently only wake_affine_llc() and find_busiest_group() | ||
7684 | * uses these numbers, only the last is affected by this problem. | ||
7685 | * | ||
7686 | * XXX fix that. | ||
7687 | */ | ||
7688 | WRITE_ONCE(shared->nr_running, sds->total_running); | ||
7689 | WRITE_ONCE(shared->load, sds->total_load); | ||
7690 | WRITE_ONCE(shared->capacity, sds->total_capacity); | ||
7691 | } | 7636 | } |
7692 | 7637 | ||
7693 | /** | 7638 | /** |
@@ -8098,6 +8043,13 @@ static int should_we_balance(struct lb_env *env) | |||
8098 | int cpu, balance_cpu = -1; | 8043 | int cpu, balance_cpu = -1; |
8099 | 8044 | ||
8100 | /* | 8045 | /* |
8046 | * Ensure the balancing environment is consistent; can happen | ||
8047 | * when the softirq triggers 'during' hotplug. | ||
8048 | */ | ||
8049 | if (!cpumask_test_cpu(env->dst_cpu, env->cpus)) | ||
8050 | return 0; | ||
8051 | |||
8052 | /* | ||
8101 | * In the newly idle case, we will allow all the cpu's | 8053 | * In the newly idle case, we will allow all the cpu's |
8102 | * to do the newly idle load balance. | 8054 | * to do the newly idle load balance. |
8103 | */ | 8055 | */ |
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index d3fb15555291..319ed0e8a347 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -81,3 +81,6 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true) | |||
81 | SCHED_FEAT(LB_MIN, false) | 81 | SCHED_FEAT(LB_MIN, false) |
82 | SCHED_FEAT(ATTACH_AGE_LOAD, true) | 82 | SCHED_FEAT(ATTACH_AGE_LOAD, true) |
83 | 83 | ||
84 | SCHED_FEAT(WA_IDLE, true) | ||
85 | SCHED_FEAT(WA_WEIGHT, true) | ||
86 | SCHED_FEAT(WA_BIAS, true) | ||
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index a92fddc22747..dd7908743dab 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/membarrier.h> | 18 | #include <linux/membarrier.h> |
19 | #include <linux/tick.h> | 19 | #include <linux/tick.h> |
20 | #include <linux/cpumask.h> | 20 | #include <linux/cpumask.h> |
21 | #include <linux/atomic.h> | ||
21 | 22 | ||
22 | #include "sched.h" /* for cpu_rq(). */ | 23 | #include "sched.h" /* for cpu_rq(). */ |
23 | 24 | ||
@@ -26,21 +27,26 @@ | |||
26 | * except MEMBARRIER_CMD_QUERY. | 27 | * except MEMBARRIER_CMD_QUERY. |
27 | */ | 28 | */ |
28 | #define MEMBARRIER_CMD_BITMASK \ | 29 | #define MEMBARRIER_CMD_BITMASK \ |
29 | (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED) | 30 | (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ |
31 | | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED) | ||
30 | 32 | ||
31 | static void ipi_mb(void *info) | 33 | static void ipi_mb(void *info) |
32 | { | 34 | { |
33 | smp_mb(); /* IPIs should be serializing but paranoid. */ | 35 | smp_mb(); /* IPIs should be serializing but paranoid. */ |
34 | } | 36 | } |
35 | 37 | ||
36 | static void membarrier_private_expedited(void) | 38 | static int membarrier_private_expedited(void) |
37 | { | 39 | { |
38 | int cpu; | 40 | int cpu; |
39 | bool fallback = false; | 41 | bool fallback = false; |
40 | cpumask_var_t tmpmask; | 42 | cpumask_var_t tmpmask; |
41 | 43 | ||
44 | if (!(atomic_read(¤t->mm->membarrier_state) | ||
45 | & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) | ||
46 | return -EPERM; | ||
47 | |||
42 | if (num_online_cpus() == 1) | 48 | if (num_online_cpus() == 1) |
43 | return; | 49 | return 0; |
44 | 50 | ||
45 | /* | 51 | /* |
46 | * Matches memory barriers around rq->curr modification in | 52 | * Matches memory barriers around rq->curr modification in |
@@ -94,6 +100,24 @@ static void membarrier_private_expedited(void) | |||
94 | * rq->curr modification in scheduler. | 100 | * rq->curr modification in scheduler. |
95 | */ | 101 | */ |
96 | smp_mb(); /* exit from system call is not a mb */ | 102 | smp_mb(); /* exit from system call is not a mb */ |
103 | return 0; | ||
104 | } | ||
105 | |||
106 | static void membarrier_register_private_expedited(void) | ||
107 | { | ||
108 | struct task_struct *p = current; | ||
109 | struct mm_struct *mm = p->mm; | ||
110 | |||
111 | /* | ||
112 | * We need to consider threads belonging to different thread | ||
113 | * groups, which use the same mm. (CLONE_VM but not | ||
114 | * CLONE_THREAD). | ||
115 | */ | ||
116 | if (atomic_read(&mm->membarrier_state) | ||
117 | & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY) | ||
118 | return; | ||
119 | atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, | ||
120 | &mm->membarrier_state); | ||
97 | } | 121 | } |
98 | 122 | ||
99 | /** | 123 | /** |
@@ -144,7 +168,9 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) | |||
144 | synchronize_sched(); | 168 | synchronize_sched(); |
145 | return 0; | 169 | return 0; |
146 | case MEMBARRIER_CMD_PRIVATE_EXPEDITED: | 170 | case MEMBARRIER_CMD_PRIVATE_EXPEDITED: |
147 | membarrier_private_expedited(); | 171 | return membarrier_private_expedited(); |
172 | case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: | ||
173 | membarrier_register_private_expedited(); | ||
148 | return 0; | 174 | return 0; |
149 | default: | 175 | default: |
150 | return -EINVAL; | 176 | return -EINVAL; |
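A hedged user-space sketch of the membarrier change above (illustrative only, not taken from the patch): a process must first issue the new registration command, otherwise the private expedited command now fails with EPERM.

```c
/*
 * Hypothetical user-space sketch, not taken from the patch above.
 * After this change, MEMBARRIER_CMD_PRIVATE_EXPEDITED returns -EPERM
 * unless the process registered its intent first.
 */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/membarrier.h>

static int membarrier(int cmd, int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
	/* Registration sets MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY for this mm. */
	if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0)) {
		perror("register private expedited");
		return 1;
	}

	/* Now the expedited barrier is permitted. */
	if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0)) {
		perror("private expedited membarrier");
		return 1;
	}

	return 0;
}
```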
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index bb3a38005b9c..0ae832e13b97 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -473,7 +473,7 @@ static long seccomp_attach_filter(unsigned int flags, | |||
473 | return 0; | 473 | return 0; |
474 | } | 474 | } |
475 | 475 | ||
476 | void __get_seccomp_filter(struct seccomp_filter *filter) | 476 | static void __get_seccomp_filter(struct seccomp_filter *filter) |
477 | { | 477 | { |
478 | /* Reference count is bounded by the number of total processes. */ | 478 | /* Reference count is bounded by the number of total processes. */ |
479 | refcount_inc(&filter->usage); | 479 | refcount_inc(&filter->usage); |