author	David S. Miller <davem@davemloft.net>	2017-10-22 08:36:53 -0400
committer	David S. Miller <davem@davemloft.net>	2017-10-22 08:39:14 -0400
commit	f8ddadc4db6c7b7029b6d0e0d9af24f74ad27ca2 (patch)
tree	0a6432aba336bae42313613f4c891bcfce02bd4e /kernel
parent	bdd091bab8c631bd2801af838e344fad34566410 (diff)
parent	b5ac3beb5a9f0ef0ea64cd85faf94c0dc4de0e42 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
There were quite a few overlapping sets of changes here.

Daniel's bug fix for off-by-ones in the new BPF branch instructions, along with the added allowances for "data_end > ptr + x" forms, collided with the metadata additions.

Along with those three changes came verifier test cases, which in their final form I tried to group together properly. If I had just trimmed GIT's conflict tags as-is, this would have split up the meta tests unnecessarily.

In the socketmap code, a set of preemption disabling changes overlapped with the rename of bpf_compute_data_end() to bpf_compute_data_pointers().

Changes were made to the mv88e6060.c driver set addr method which got removed in net-next.

The hyperv transport socket layer had a locking change in 'net' which overlapped with a change of socket state macro usage in 'net-next'.

Signed-off-by: David S. Miller <davem@davemloft.net>
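For readers unfamiliar with the bounds-check wording above, the fragment below is a minimal, hypothetical XDP-style sketch; it is not taken from this merge. The program name, the locally defined SEC() macro, and the header field access are illustrative assumptions. It simply shows the two check shapes involved: the classic "ptr + x > data_end" guard and the inverted "data_end > ptr + x" form that the merged verifier changes additionally recognize.

/* Illustrative sketch only -- not part of this merge. */
#include <linux/bpf.h>
#include <linux/if_ether.h>

#define SEC(name) __attribute__((section(name), used))

SEC("xdp")
int xdp_bounds_example(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;
	struct ethhdr *eth = data;

	/* Classic form: "ptr + x > data_end" -- drop short packets, after
	 * which the whole Ethernet header is provably within the packet.
	 */
	if (data + sizeof(*eth) > data_end)
		return XDP_DROP;

	/* Inverted form: "data_end > ptr + x" -- the shape the merged
	 * verifier changes also accept; it yields a right-open range, so
	 * only a conservative one-byte access is shown here.
	 */
	if (data_end > data + sizeof(*eth)) {
		if (eth->h_dest[0] & 1)	/* multicast/broadcast destination */
			return XDP_PASS;
	}

	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";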
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/arraymap.c        2
-rw-r--r--  kernel/bpf/devmap.c         10
-rw-r--r--  kernel/bpf/hashtab.c         4
-rw-r--r--  kernel/bpf/sockmap.c        28
-rw-r--r--  kernel/bpf/verifier.c       82
-rw-r--r--  kernel/events/core.c        10
-rw-r--r--  kernel/exit.c                6
-rw-r--r--  kernel/fork.c                4
-rw-r--r--  kernel/irq/chip.c            2
-rw-r--r--  kernel/irq/cpuhotplug.c     28
-rw-r--r--  kernel/irq/manage.c         17
-rw-r--r--  kernel/livepatch/core.c     60
-rw-r--r--  kernel/locking/lockdep.c    48
-rw-r--r--  kernel/rcu/srcutree.c        2
-rw-r--r--  kernel/rcu/sync.c            9
-rw-r--r--  kernel/rcu/tree.c           18
-rw-r--r--  kernel/sched/fair.c        140
-rw-r--r--  kernel/sched/features.h      3
-rw-r--r--  kernel/sched/membarrier.c   34
-rw-r--r--  kernel/seccomp.c             2
20 files changed, 308 insertions, 201 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 988c04c91e10..7c25426d3cf5 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -102,7 +102,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 		array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();
 
 	if (array_size >= U32_MAX - PAGE_SIZE ||
-	    elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
+	    bpf_array_alloc_percpu(array)) {
 		bpf_map_area_free(array);
 		return ERR_PTR(-ENOMEM);
 	}
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index e5d3de7cff2e..ebdef54bf7df 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -72,7 +72,7 @@ static LIST_HEAD(dev_map_list);
 
 static u64 dev_map_bitmap_size(const union bpf_attr *attr)
 {
-	return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
+	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
 }
 
 static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
@@ -81,6 +81,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	int err = -EINVAL;
 	u64 cost;
 
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
@@ -114,8 +117,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	err = -ENOMEM;
 
 	/* A per cpu bitfield with a bit per possible net device */
-	dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr),
-					    __alignof__(unsigned long));
+	dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
+						__alignof__(unsigned long),
+						GFP_KERNEL | __GFP_NOWARN);
 	if (!dtab->flush_needed)
 		goto free_dtab;
 
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 919955236e63..e469e05c8e83 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -318,10 +318,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		 */
 		goto free_htab;
 
-	if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
-		/* make sure the size for pcpu_alloc() is reasonable */
-		goto free_htab;
-
 	htab->elem_size = sizeof(struct htab_elem) +
 			  round_up(htab->map.key_size, 8);
 	if (percpu)
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 86ec846f2d5e..eef843c3b419 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -39,6 +39,7 @@
 #include <linux/workqueue.h>
 #include <linux/list.h>
 #include <net/strparser.h>
+#include <net/tcp.h>
 
 #define SOCK_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
@@ -104,9 +105,16 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 		return SK_DROP;
 
 	skb_orphan(skb);
+	/* We need to ensure that BPF metadata for maps is also cleared
+	 * when we orphan the skb so that we don't have the possibility
+	 * to reference a stale map.
+	 */
+	TCP_SKB_CB(skb)->bpf.map = NULL;
 	skb->sk = psock->sock;
 	bpf_compute_data_pointers(skb);
+	preempt_disable();
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
+	preempt_enable();
 	skb->sk = NULL;
 
 	return rc;
@@ -117,17 +125,10 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
 	struct sock *sk;
 	int rc;
 
-	/* Because we use per cpu values to feed input from sock redirect
-	 * in BPF program to do_sk_redirect_map() call we need to ensure we
-	 * are not preempted. RCU read lock is not sufficient in this case
-	 * with CONFIG_PREEMPT_RCU enabled so we must be explicit here.
-	 */
-	preempt_disable();
 	rc = smap_verdict_func(psock, skb);
 	switch (rc) {
 	case SK_REDIRECT:
-		sk = do_sk_redirect_map();
-		preempt_enable();
+		sk = do_sk_redirect_map(skb);
 		if (likely(sk)) {
 			struct smap_psock *peer = smap_psock_sk(sk);
 
@@ -144,8 +145,6 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
 		/* Fall through and free skb otherwise */
 	case SK_DROP:
 	default:
-		if (rc != SK_REDIRECT)
-			preempt_enable();
 		kfree_skb(skb);
 	}
 }
@@ -490,6 +489,9 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 	int err = -EINVAL;
 	u64 cost;
 
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 	    attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
@@ -843,6 +845,12 @@ static int sock_map_update_elem(struct bpf_map *map,
 		return -EINVAL;
 	}
 
+	if (skops.sk->sk_type != SOCK_STREAM ||
+	    skops.sk->sk_protocol != IPPROTO_TCP) {
+		fput(socket->file);
+		return -EOPNOTSUPP;
+	}
+
 	err = sock_map_ctx_update_elem(&skops, map, key, flags);
 	fput(socket->file);
 	return err;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 545b8c45a578..d906775e12c1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1006,7 +1006,13 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		/* ctx accesses must be at a fixed offset, so that we can
 		 * determine what type of data were returned.
 		 */
-		if (!tnum_is_const(reg->var_off)) {
+		if (reg->off) {
+			verbose(env,
+				"dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n",
+				regno, reg->off, off - reg->off);
+			return -EACCES;
+		}
+		if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
 			char tn_buf[48];
 
 			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
@@ -1015,7 +1021,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 				tn_buf, off, size);
 			return -EACCES;
 		}
-		off += reg->var_off.value;
 		err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
 		if (!err && t == BPF_READ && value_regno >= 0) {
 			/* ctx access returns either a scalar, or a
@@ -2341,12 +2346,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
 static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 				   struct bpf_reg_state *dst_reg,
-				   enum bpf_reg_type type)
+				   enum bpf_reg_type type,
+				   bool range_right_open)
 {
 	struct bpf_reg_state *regs = state->regs, *reg;
+	u16 new_range;
 	int i;
 
-	if (dst_reg->off < 0)
+	if (dst_reg->off < 0 ||
+	    (dst_reg->off == 0 && range_right_open))
 		/* This doesn't give us any range */
 		return;
 
@@ -2357,9 +2365,13 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 		 */
 		return;
 
-	/* LLVM can generate four kind of checks:
+	new_range = dst_reg->off;
+	if (range_right_open)
+		new_range--;
+
+	/* Examples for register markings:
 	 *
-	 * Type 1/2:
+	 * pkt_data in dst register:
 	 *
 	 * r2 = r3;
 	 * r2 += 8;
@@ -2376,7 +2388,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 	 *   r2=pkt(id=n,off=8,r=0)
 	 *   r3=pkt(id=n,off=0,r=0)
 	 *
-	 * Type 3/4:
+	 * pkt_data in src register:
 	 *
 	 * r2 = r3;
 	 * r2 += 8;
@@ -2394,7 +2406,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 	 *   r3=pkt(id=n,off=0,r=0)
 	 *
 	 * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8)
-	 * so that range of bytes [r3, r3 + 8) is safe to access.
+	 * or r3=pkt(id=n,off=0,r=8-1), so that range of bytes [r3, r3 + 8)
+	 * and [r3, r3 + 8-1) respectively is safe to access depending on
+	 * the check.
 	 */
 
 	/* If our ids match, then we must have the same max_value. And we
@@ -2405,14 +2419,14 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 	for (i = 0; i < MAX_BPF_REG; i++)
 		if (regs[i].type == type && regs[i].id == dst_reg->id)
 			/* keep the maximum range already checked */
-			regs[i].range = max_t(u16, regs[i].range, dst_reg->off);
+			regs[i].range = max(regs[i].range, new_range);
 
 	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
 		if (state->stack_slot_type[i] != STACK_SPILL)
 			continue;
 		reg = &state->spilled_regs[i / BPF_REG_SIZE];
 		if (reg->type == type && reg->id == dst_reg->id)
-			reg->range = max_t(u16, reg->range, dst_reg->off);
+			reg->range = max_t(u16, reg->range, new_range);
 	}
 }
 
@@ -2776,39 +2790,71 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
 		   dst_reg->type == PTR_TO_PACKET &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET);
+		/* pkt_data' > pkt_end */
+		find_good_pkt_pointers(this_branch, dst_reg,
+				       PTR_TO_PACKET, false);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
+		   dst_reg->type == PTR_TO_PACKET_END &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET) {
+		/* pkt_end > pkt_data' */
+		find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
+				       PTR_TO_PACKET, true);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
+		   dst_reg->type == PTR_TO_PACKET &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
+		/* pkt_data' < pkt_end */
+		find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET,
+				       true);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
+		   dst_reg->type == PTR_TO_PACKET_END &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET) {
+		/* pkt_end < pkt_data' */
+		find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
+				       PTR_TO_PACKET, false);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
 		   dst_reg->type == PTR_TO_PACKET &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET);
+		/* pkt_data' >= pkt_end */
+		find_good_pkt_pointers(this_branch, dst_reg,
+				       PTR_TO_PACKET, true);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
 		   dst_reg->type == PTR_TO_PACKET_END &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET) {
+		/* pkt_end >= pkt_data' */
 		find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
-				       PTR_TO_PACKET);
+				       PTR_TO_PACKET, false);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
+		   dst_reg->type == PTR_TO_PACKET &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
+		/* pkt_data' <= pkt_end */
+		find_good_pkt_pointers(other_branch, dst_reg,
+				       PTR_TO_PACKET, false);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
 		   dst_reg->type == PTR_TO_PACKET_END &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET) {
+		/* pkt_end <= pkt_data' */
 		find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
-				       PTR_TO_PACKET);
+				       PTR_TO_PACKET, true);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
 		   dst_reg->type == PTR_TO_PACKET_META &&
 		   reg_is_init_pkt_pointer(&regs[insn->src_reg], PTR_TO_PACKET)) {
-		find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET_META);
+		find_good_pkt_pointers(this_branch, dst_reg,
+				       PTR_TO_PACKET_META, false);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
 		   dst_reg->type == PTR_TO_PACKET_META &&
 		   reg_is_init_pkt_pointer(&regs[insn->src_reg], PTR_TO_PACKET)) {
-		find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET_META);
+		find_good_pkt_pointers(other_branch, dst_reg,
+				       PTR_TO_PACKET_META, false);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
 		   reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_META) {
 		find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
-				       PTR_TO_PACKET_META);
+				       PTR_TO_PACKET_META, false);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
 		   reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_META) {
 		find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
-				       PTR_TO_PACKET_META);
+				       PTR_TO_PACKET_META, false);
 	} else if (is_pointer_value(env, insn->dst_reg)) {
 		verbose(env, "R%d pointer comparison prohibited\n",
 			insn->dst_reg);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 902149f05381..31ee304a5844 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -662,7 +662,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
 	/*
 	 * Do not update time when cgroup is not active
 	 */
-	if (cgrp == event->cgrp)
+	if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
 		__update_cgrp_time(event->cgrp);
 }
 
@@ -8966,6 +8966,14 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
 
 static void free_pmu_context(struct pmu *pmu)
 {
+	/*
+	 * Static contexts such as perf_sw_context have a global lifetime
+	 * and may be shared between different PMUs. Avoid freeing them
+	 * when a single PMU is going away.
+	 */
+	if (pmu->task_ctx_nr > perf_invalid_context)
+		return;
+
 	mutex_lock(&pmus_lock);
 	free_percpu(pmu->pmu_cpu_context);
 	mutex_unlock(&pmus_lock);
diff --git a/kernel/exit.c b/kernel/exit.c
index f2cd53e92147..f6cad39f35df 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1610,6 +1610,9 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 	if (!infop)
 		return err;
 
+	if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
+		return -EFAULT;
+
 	user_access_begin();
 	unsafe_put_user(signo, &infop->si_signo, Efault);
 	unsafe_put_user(0, &infop->si_errno, Efault);
@@ -1735,6 +1738,9 @@ COMPAT_SYSCALL_DEFINE5(waitid,
 	if (!infop)
 		return err;
 
+	if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
+		return -EFAULT;
+
 	user_access_begin();
 	unsafe_put_user(signo, &infop->si_signo, Efault);
 	unsafe_put_user(0, &infop->si_errno, Efault);
diff --git a/kernel/fork.c b/kernel/fork.c
index e702cb9ffbd8..07cc743698d3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -215,6 +215,10 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 		if (!s)
 			continue;
 
+#ifdef CONFIG_DEBUG_KMEMLEAK
+		/* Clear stale pointers from reused stack. */
+		memset(s->addr, 0, THREAD_SIZE);
+#endif
 		tsk->stack_vm_area = s;
 		return s->addr;
 	}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6fc89fd93824..5a2ef92c2782 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -265,8 +265,8 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
 		irq_setup_affinity(desc);
 		break;
 	case IRQ_STARTUP_MANAGED:
+		irq_do_set_affinity(d, aff, false);
 		ret = __irq_startup(desc);
-		irq_set_affinity_locked(d, aff, false);
 		break;
 	case IRQ_STARTUP_ABORT:
 		return 0;
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 638eb9c83d9f..9eb09aef0313 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -18,8 +18,34 @@
 static inline bool irq_needs_fixup(struct irq_data *d)
 {
 	const struct cpumask *m = irq_data_get_effective_affinity_mask(d);
+	unsigned int cpu = smp_processor_id();
 
-	return cpumask_test_cpu(smp_processor_id(), m);
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+	/*
+	 * The cpumask_empty() check is a workaround for interrupt chips,
+	 * which do not implement effective affinity, but the architecture has
+	 * enabled the config switch. Use the general affinity mask instead.
+	 */
+	if (cpumask_empty(m))
+		m = irq_data_get_affinity_mask(d);
+
+	/*
+	 * Sanity check. If the mask is not empty when excluding the outgoing
+	 * CPU then it must contain at least one online CPU. The outgoing CPU
+	 * has been removed from the online mask already.
+	 */
+	if (cpumask_any_but(m, cpu) < nr_cpu_ids &&
+	    cpumask_any_and(m, cpu_online_mask) >= nr_cpu_ids) {
+		/*
+		 * If this happens then there was a missed IRQ fixup at some
+		 * point. Warn about it and enforce fixup.
+		 */
+		pr_warn("Eff. affinity %*pbl of IRQ %u contains only offline CPUs after offlining CPU %u\n",
+			cpumask_pr_args(m), d->irq, cpu);
+		return true;
+	}
+#endif
+	return cpumask_test_cpu(cpu, m);
 }
 
 static bool migrate_one_irq(struct irq_desc *desc)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d00132b5c325..4bff6a10ae8e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -168,6 +168,19 @@ void irq_set_thread_affinity(struct irq_desc *desc)
 		set_bit(IRQTF_AFFINITY, &action->thread_flags);
 }
 
+static void irq_validate_effective_affinity(struct irq_data *data)
+{
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+	const struct cpumask *m = irq_data_get_effective_affinity_mask(data);
+	struct irq_chip *chip = irq_data_get_irq_chip(data);
+
+	if (!cpumask_empty(m))
+		return;
+	pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n",
+		     chip->name, data->irq);
+#endif
+}
+
 int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
 			bool force)
 {
@@ -175,12 +188,16 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
 	struct irq_chip *chip = irq_data_get_irq_chip(data);
 	int ret;
 
+	if (!chip || !chip->irq_set_affinity)
+		return -EINVAL;
+
 	ret = chip->irq_set_affinity(data, mask, force);
 	switch (ret) {
 	case IRQ_SET_MASK_OK:
 	case IRQ_SET_MASK_OK_DONE:
 		cpumask_copy(desc->irq_common_data.affinity, mask);
 	case IRQ_SET_MASK_OK_NOCOPY:
+		irq_validate_effective_affinity(data);
 		irq_set_thread_affinity(desc);
 		ret = 0;
 	}
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index b9628e43c78f..bf8c8fd72589 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -830,6 +830,41 @@ int klp_register_patch(struct klp_patch *patch)
 }
 EXPORT_SYMBOL_GPL(klp_register_patch);
 
+/*
+ * Remove parts of patches that touch a given kernel module. The list of
+ * patches processed might be limited. When limit is NULL, all patches
+ * will be handled.
+ */
+static void klp_cleanup_module_patches_limited(struct module *mod,
+					       struct klp_patch *limit)
+{
+	struct klp_patch *patch;
+	struct klp_object *obj;
+
+	list_for_each_entry(patch, &klp_patches, list) {
+		if (patch == limit)
+			break;
+
+		klp_for_each_object(patch, obj) {
+			if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
+				continue;
+
+			/*
+			 * Only unpatch the module if the patch is enabled or
+			 * is in transition.
+			 */
+			if (patch->enabled || patch == klp_transition_patch) {
+				pr_notice("reverting patch '%s' on unloading module '%s'\n",
+					  patch->mod->name, obj->mod->name);
+				klp_unpatch_object(obj);
+			}
+
+			klp_free_object_loaded(obj);
+			break;
+		}
+	}
+}
+
 int klp_module_coming(struct module *mod)
 {
 	int ret;
@@ -894,7 +929,7 @@ err:
 	pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n",
 		patch->mod->name, obj->mod->name, obj->mod->name);
 	mod->klp_alive = false;
-	klp_free_object_loaded(obj);
+	klp_cleanup_module_patches_limited(mod, patch);
 	mutex_unlock(&klp_mutex);
 
 	return ret;
@@ -902,9 +937,6 @@ err:
 
 void klp_module_going(struct module *mod)
 {
-	struct klp_patch *patch;
-	struct klp_object *obj;
-
 	if (WARN_ON(mod->state != MODULE_STATE_GOING &&
 		    mod->state != MODULE_STATE_COMING))
 		return;
@@ -917,25 +949,7 @@ void klp_module_going(struct module *mod)
 	 */
 	mod->klp_alive = false;
 
-	list_for_each_entry(patch, &klp_patches, list) {
-		klp_for_each_object(patch, obj) {
-			if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
-				continue;
-
-			/*
-			 * Only unpatch the module if the patch is enabled or
-			 * is in transition.
-			 */
-			if (patch->enabled || patch == klp_transition_patch) {
-				pr_notice("reverting patch '%s' on unloading module '%s'\n",
-					  patch->mod->name, obj->mod->name);
-				klp_unpatch_object(obj);
-			}
-
-			klp_free_object_loaded(obj);
-			break;
-		}
-	}
+	klp_cleanup_module_patches_limited(mod, NULL);
 
 	mutex_unlock(&klp_mutex);
 }
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 44c8d0d17170..e36e652d996f 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1873,10 +1873,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 		   struct held_lock *next, int distance, struct stack_trace *trace,
 		   int (*save)(struct stack_trace *trace))
 {
+	struct lock_list *uninitialized_var(target_entry);
 	struct lock_list *entry;
-	int ret;
 	struct lock_list this;
-	struct lock_list *uninitialized_var(target_entry);
+	int ret;
 
 	/*
 	 * Prove that the new <prev> -> <next> dependency would not
@@ -1890,8 +1890,17 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	this.class = hlock_class(next);
 	this.parent = NULL;
 	ret = check_noncircular(&this, hlock_class(prev), &target_entry);
-	if (unlikely(!ret))
+	if (unlikely(!ret)) {
+		if (!trace->entries) {
+			/*
+			 * If @save fails here, the printing might trigger
+			 * a WARN but because of the !nr_entries it should
+			 * not do bad things.
+			 */
+			save(trace);
+		}
 		return print_circular_bug(&this, target_entry, next, prev, trace);
+	}
 	else if (unlikely(ret < 0))
 		return print_bfs_bug(ret);
 
@@ -1938,7 +1947,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 		return print_bfs_bug(ret);
 
 
-	if (save && !save(trace))
+	if (!trace->entries && !save(trace))
 		return 0;
 
 	/*
@@ -1958,20 +1967,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	if (!ret)
 		return 0;
 
-	/*
-	 * Debugging printouts:
-	 */
-	if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
-		graph_unlock();
-		printk("\n new dependency: ");
-		print_lock_name(hlock_class(prev));
-		printk(KERN_CONT " => ");
-		print_lock_name(hlock_class(next));
-		printk(KERN_CONT "\n");
-		dump_stack();
-		if (!graph_lock())
-			return 0;
-	}
 	return 2;
 }
 
@@ -1986,8 +1981,12 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
 {
 	int depth = curr->lockdep_depth;
 	struct held_lock *hlock;
-	struct stack_trace trace;
-	int (*save)(struct stack_trace *trace) = save_trace;
+	struct stack_trace trace = {
+		.nr_entries = 0,
+		.max_entries = 0,
+		.entries = NULL,
+		.skip = 0,
+	};
 
 	/*
 	 * Debugging checks.
@@ -2018,18 +2017,11 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
 		 */
 		if (hlock->read != 2 && hlock->check) {
 			int ret = check_prev_add(curr, hlock, next,
-						 distance, &trace, save);
+						 distance, &trace, save_trace);
 			if (!ret)
 				return 0;
 
 			/*
-			 * Stop saving stack_trace if save_trace() was
-			 * called at least once:
-			 */
-			if (save && ret == 2)
-				save = NULL;
-
-			/*
 			 * Stop after the first non-trylock entry,
 			 * as non-trylock entries have added their
 			 * own direct dependencies already, so this
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 729a8706751d..6d5880089ff6 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -854,7 +854,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
 /**
  * call_srcu() - Queue a callback for invocation after an SRCU grace period
  * @sp: srcu_struct in queue the callback
- * @head: structure to be used for queueing the SRCU callback.
+ * @rhp: structure to be used for queueing the SRCU callback.
  * @func: function to be invoked after the SRCU grace period
  *
  * The callback function will be invoked some time after a full SRCU
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 50d1861f7759..3f943efcf61c 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -85,6 +85,9 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
 }
 
 /**
+ * rcu_sync_enter_start - Force readers onto slow path for multiple updates
+ * @rsp: Pointer to rcu_sync structure to use for synchronization
+ *
  * Must be called after rcu_sync_init() and before first use.
  *
  * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}()
@@ -142,7 +145,7 @@ void rcu_sync_enter(struct rcu_sync *rsp)
 
 /**
  * rcu_sync_func() - Callback function managing reader access to fastpath
- * @rsp: Pointer to rcu_sync structure to use for synchronization
+ * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization
  *
  * This function is passed to one of the call_rcu() functions by
  * rcu_sync_exit(), so that it is invoked after a grace period following the
@@ -158,9 +161,9 @@ void rcu_sync_enter(struct rcu_sync *rsp)
  * rcu_sync_exit(). Otherwise, set all state back to idle so that readers
  * can again use their fastpaths.
  */
-static void rcu_sync_func(struct rcu_head *rcu)
+static void rcu_sync_func(struct rcu_head *rhp)
 {
-	struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head);
+	struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head);
 	unsigned long flags;
 
 	BUG_ON(rsp->gp_state != GP_PASSED);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b0ad62b0e7b8..3e3650e94ae6 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3097,9 +3097,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
  * read-side critical sections have completed. call_rcu_sched() assumes
  * that the read-side critical sections end on enabling of preemption
  * or on voluntary preemption.
- * RCU read-side critical sections are delimited by :
- * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR
- * - anything that disables preemption.
+ * RCU read-side critical sections are delimited by:
+ *
+ * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR
+ * - anything that disables preemption.
  *
  * These may be nested.
  *
@@ -3124,11 +3125,12 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
  * handler. This means that read-side critical sections in process
  * context must not be interrupted by softirqs. This interface is to be
  * used when most of the read-side critical sections are in softirq context.
- * RCU read-side critical sections are delimited by :
- * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context.
- * OR
+ * RCU read-side critical sections are delimited by:
+ *
+ * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context, OR
  * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
- * These may be nested.
+ *
+ * These may be nested.
  *
  * See the description of call_rcu() for more detailed information on
  * memory ordering guarantees.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 70ba32e08a23..d3f3094856fe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5356,91 +5356,62 @@ static int wake_wide(struct task_struct *p)
 	return 1;
 }
 
-struct llc_stats {
-	unsigned long	nr_running;
-	unsigned long	load;
-	unsigned long	capacity;
-	int		has_capacity;
-};
+/*
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
+ *
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
+ *			  will be) idle.
+ *
+ * wake_affine_weight() - considers the weight to reflect the average
+ *			  scheduling latency of the CPUs. This seems to work
+ *			  for the overloaded case.
+ */
 
-static bool get_llc_stats(struct llc_stats *stats, int cpu)
+static bool
+wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
+		 int this_cpu, int prev_cpu, int sync)
 {
-	struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-
-	if (!sds)
-		return false;
+	if (idle_cpu(this_cpu))
+		return true;
 
-	stats->nr_running	= READ_ONCE(sds->nr_running);
-	stats->load		= READ_ONCE(sds->load);
-	stats->capacity		= READ_ONCE(sds->capacity);
-	stats->has_capacity	= stats->nr_running < per_cpu(sd_llc_size, cpu);
+	if (sync && cpu_rq(this_cpu)->nr_running == 1)
+		return true;
 
-	return true;
+	return false;
 }
 
-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- *
- * Since we're running on 'stale' values, we might in fact create an imbalance
- * but recomputing these values is expensive, as that'd mean iteration 2 cache
- * domains worth of CPUs.
- */
 static bool
-wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
+wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
 		   int this_cpu, int prev_cpu, int sync)
 {
-	struct llc_stats prev_stats, this_stats;
 	s64 this_eff_load, prev_eff_load;
 	unsigned long task_load;
 
-	if (!get_llc_stats(&prev_stats, prev_cpu) ||
-	    !get_llc_stats(&this_stats, this_cpu))
-		return false;
+	this_eff_load = target_load(this_cpu, sd->wake_idx);
+	prev_eff_load = source_load(prev_cpu, sd->wake_idx);
 
-	/*
-	 * If sync wakeup then subtract the (maximum possible)
-	 * effect of the currently running task from the load
-	 * of the current LLC.
-	 */
 	if (sync) {
 		unsigned long current_load = task_h_load(current);
 
-		/* in this case load hits 0 and this LLC is considered 'idle' */
-		if (current_load > this_stats.load)
+		if (current_load > this_eff_load)
 			return true;
 
-		this_stats.load -= current_load;
+		this_eff_load -= current_load;
 	}
 
-	/*
-	 * The has_capacity stuff is not SMT aware, but by trying to balance
-	 * the nr_running on both ends we try and fill the domain at equal
-	 * rates, thereby first consuming cores before siblings.
-	 */
-
-	/* if the old cache has capacity, stay there */
-	if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
-		return false;
-
-	/* if this cache has capacity, come here */
-	if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
-		return true;
-
-	/*
-	 * Check to see if we can move the load without causing too much
-	 * imbalance.
-	 */
 	task_load = task_h_load(p);
 
-	this_eff_load = 100;
-	this_eff_load *= prev_stats.capacity;
-
-	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-	prev_eff_load *= this_stats.capacity;
+	this_eff_load += task_load;
+	if (sched_feat(WA_BIAS))
+		this_eff_load *= 100;
+	this_eff_load *= capacity_of(prev_cpu);
 
-	this_eff_load *= this_stats.load + task_load;
-	prev_eff_load *= prev_stats.load - task_load;
+	prev_eff_load -= task_load;
+	if (sched_feat(WA_BIAS))
+		prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
+	prev_eff_load *= capacity_of(this_cpu);
 
 	return this_eff_load <= prev_eff_load;
 }
@@ -5449,22 +5420,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 		       int prev_cpu, int sync)
 {
 	int this_cpu = smp_processor_id();
-	bool affine;
+	bool affine = false;
 
-	/*
-	 * Default to no affine wakeups; wake_affine() should not effect a task
-	 * placement the load-balancer feels inclined to undo. The conservative
-	 * option is therefore to not move tasks when they wake up.
-	 */
-	affine = false;
+	if (sched_feat(WA_IDLE) && !affine)
+		affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
 
-	/*
-	 * If the wakeup is across cache domains, try to evaluate if movement
-	 * makes sense, otherwise rely on select_idle_siblings() to do
-	 * placement inside the cache domain.
-	 */
-	if (!cpus_share_cache(prev_cpu, this_cpu))
-		affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
+	if (sched_feat(WA_WEIGHT) && !affine)
+		affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
 
 	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
 	if (affine) {
@@ -7600,7 +7562,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
-	struct sched_domain_shared *shared = env->sd->shared;
 	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats *local = &sds->local_stat;
@@ -7672,22 +7633,6 @@ next_group:
 		if (env->dst_rq->rd->overload != overload)
 			env->dst_rq->rd->overload = overload;
 	}
-
-	if (!shared)
-		return;
-
-	/*
-	 * Since these are sums over groups they can contain some CPUs
-	 * multiple times for the NUMA domains.
-	 *
-	 * Currently only wake_affine_llc() and find_busiest_group()
-	 * uses these numbers, only the last is affected by this problem.
-	 *
-	 * XXX fix that.
-	 */
-	WRITE_ONCE(shared->nr_running, sds->total_running);
-	WRITE_ONCE(shared->load, sds->total_load);
-	WRITE_ONCE(shared->capacity, sds->total_capacity);
 }
 
 /**
@@ -8098,6 +8043,13 @@ static int should_we_balance(struct lb_env *env)
 	int cpu, balance_cpu = -1;
 
 	/*
+	 * Ensure the balancing environment is consistent; can happen
+	 * when the softirq triggers 'during' hotplug.
+	 */
+	if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
+		return 0;
+
+	/*
 	 * In the newly idle case, we will allow all the cpu's
 	 * to do the newly idle load balance.
 	 */
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index d3fb15555291..319ed0e8a347 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -81,3 +81,6 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
+SCHED_FEAT(WA_IDLE, true)
+SCHED_FEAT(WA_WEIGHT, true)
+SCHED_FEAT(WA_BIAS, true)
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index a92fddc22747..dd7908743dab 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -18,6 +18,7 @@
 #include <linux/membarrier.h>
 #include <linux/tick.h>
 #include <linux/cpumask.h>
+#include <linux/atomic.h>
 
 #include "sched.h"	/* for cpu_rq(). */
 
@@ -26,21 +27,26 @@
  * except MEMBARRIER_CMD_QUERY.
  */
 #define MEMBARRIER_CMD_BITMASK	\
-	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED)
+	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED	\
+	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)
 
 static void ipi_mb(void *info)
 {
 	smp_mb();	/* IPIs should be serializing but paranoid. */
 }
 
-static void membarrier_private_expedited(void)
+static int membarrier_private_expedited(void)
 {
 	int cpu;
 	bool fallback = false;
 	cpumask_var_t tmpmask;
 
+	if (!(atomic_read(&current->mm->membarrier_state)
+			& MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
+		return -EPERM;
+
 	if (num_online_cpus() == 1)
-		return;
+		return 0;
 
 	/*
 	 * Matches memory barriers around rq->curr modification in
@@ -94,6 +100,24 @@ static void membarrier_private_expedited(void)
 	 * rq->curr modification in scheduler.
 	 */
 	smp_mb();	/* exit from system call is not a mb */
+	return 0;
+}
+
+static void membarrier_register_private_expedited(void)
+{
+	struct task_struct *p = current;
+	struct mm_struct *mm = p->mm;
+
+	/*
+	 * We need to consider threads belonging to different thread
+	 * groups, which use the same mm. (CLONE_VM but not
+	 * CLONE_THREAD).
+	 */
+	if (atomic_read(&mm->membarrier_state)
+			& MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)
+		return;
+	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+		  &mm->membarrier_state);
 }
 
 /**
@@ -144,7 +168,9 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
 		synchronize_sched();
 		return 0;
 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
-		membarrier_private_expedited();
+		return membarrier_private_expedited();
+	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
+		membarrier_register_private_expedited();
 		return 0;
 	default:
 		return -EINVAL;
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index bb3a38005b9c..0ae832e13b97 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -473,7 +473,7 @@ static long seccomp_attach_filter(unsigned int flags,
 	return 0;
 }
 
-void __get_seccomp_filter(struct seccomp_filter *filter)
+static void __get_seccomp_filter(struct seccomp_filter *filter)
 {
 	/* Reference count is bounded by the number of total processes. */
 	refcount_inc(&filter->usage);