Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/arraymap.c      |   2
-rw-r--r--  kernel/bpf/devmap.c        |  10
-rw-r--r--  kernel/bpf/hashtab.c       |   4
-rw-r--r--  kernel/bpf/inode.c         |   1
-rw-r--r--  kernel/bpf/sockmap.c       |  43
-rw-r--r--  kernel/bpf/verifier.c      |  70
-rw-r--r--  kernel/cpu.c               |   5
-rw-r--r--  kernel/events/core.c       |  10
-rw-r--r--  kernel/exit.c              |   6
-rw-r--r--  kernel/fork.c              |   4
-rw-r--r--  kernel/irq/chip.c          |   2
-rw-r--r--  kernel/irq/cpuhotplug.c    |  28
-rw-r--r--  kernel/irq/generic-chip.c  |  15
-rw-r--r--  kernel/irq/manage.c        |  17
-rw-r--r--  kernel/livepatch/core.c    |  60
-rw-r--r--  kernel/locking/lockdep.c   |  48
-rw-r--r--  kernel/rcu/srcutree.c      |   2
-rw-r--r--  kernel/rcu/sync.c          |   9
-rw-r--r--  kernel/rcu/tree.c          |  18
-rw-r--r--  kernel/sched/fair.c        | 140
-rw-r--r--  kernel/sched/features.h    |   3
-rw-r--r--  kernel/sched/membarrier.c  |  34
-rw-r--r--  kernel/seccomp.c           |   2
-rw-r--r--  kernel/workqueue.c         |  37
24 files changed, 345 insertions, 225 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 98c0f00c3f5e..e2636737b69b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -98,7 +98,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();
 
 	if (array_size >= U32_MAX - PAGE_SIZE ||
-	    elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
+	    bpf_array_alloc_percpu(array)) {
 		bpf_map_area_free(array);
 		return ERR_PTR(-ENOMEM);
 	}
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index e093d9a2c4dd..e745d6a88224 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -69,7 +69,7 @@ static LIST_HEAD(dev_map_list);
 
 static u64 dev_map_bitmap_size(const union bpf_attr *attr)
 {
-	return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
+	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
 }
 
 static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
@@ -78,6 +78,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	int err = -EINVAL;
 	u64 cost;
 
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
@@ -111,8 +114,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	err = -ENOMEM;
 
 	/* A per cpu bitfield with a bit per possible net device */
-	dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr),
-					    __alignof__(unsigned long));
+	dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
+						__alignof__(unsigned long),
+						GFP_KERNEL | __GFP_NOWARN);
 	if (!dtab->flush_needed)
 		goto free_dtab;
 
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 431126f31ea3..6533f08d1238 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -317,10 +317,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		 */
 		goto free_htab;
 
-	if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
-		/* make sure the size for pcpu_alloc() is reasonable */
-		goto free_htab;
-
 	htab->elem_size = sizeof(struct htab_elem) +
 			  round_up(htab->map.key_size, 8);
 	if (percpu)
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index e833ed914358..be1dde967208 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -363,6 +363,7 @@ out:
 	putname(pname);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(bpf_obj_get_user);
 
 static void bpf_evict_inode(struct inode *inode)
 {
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 6424ce0e4969..66f00a2b27f4 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -39,6 +39,7 @@
 #include <linux/workqueue.h>
 #include <linux/list.h>
 #include <net/strparser.h>
+#include <net/tcp.h>
 
 struct bpf_stab {
 	struct bpf_map map;
@@ -92,6 +93,14 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
 	return rcu_dereference_sk_user_data(sk);
 }
 
+/* compute the linear packet data range [data, data_end) for skb when
+ * sk_skb type programs are in use.
+ */
+static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
+{
+	TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
+}
+
 static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 {
 	struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
@@ -101,12 +110,20 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 		return SK_DROP;
 
 	skb_orphan(skb);
+	/* We need to ensure that BPF metadata for maps is also cleared
+	 * when we orphan the skb so that we don't have the possibility
+	 * to reference a stale map.
+	 */
+	TCP_SKB_CB(skb)->bpf.map = NULL;
 	skb->sk = psock->sock;
-	bpf_compute_data_end(skb);
+	bpf_compute_data_end_sk_skb(skb);
+	preempt_disable();
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
+	preempt_enable();
 	skb->sk = NULL;
 
-	return rc;
+	return rc == SK_PASS ?
+		(TCP_SKB_CB(skb)->bpf.map ? SK_REDIRECT : SK_PASS) : SK_DROP;
 }
 
 static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
@@ -114,17 +131,10 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
 	struct sock *sk;
 	int rc;
 
-	/* Because we use per cpu values to feed input from sock redirect
-	 * in BPF program to do_sk_redirect_map() call we need to ensure we
-	 * are not preempted. RCU read lock is not sufficient in this case
-	 * with CONFIG_PREEMPT_RCU enabled so we must be explicit here.
-	 */
-	preempt_disable();
 	rc = smap_verdict_func(psock, skb);
 	switch (rc) {
 	case SK_REDIRECT:
-		sk = do_sk_redirect_map();
-		preempt_enable();
+		sk = do_sk_redirect_map(skb);
 		if (likely(sk)) {
 			struct smap_psock *peer = smap_psock_sk(sk);
 
@@ -141,8 +151,6 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
 	/* Fall through and free skb otherwise */
 	case SK_DROP:
 	default:
-		if (rc != SK_REDIRECT)
-			preempt_enable();
 		kfree_skb(skb);
 	}
 }
@@ -369,7 +377,7 @@ static int smap_parse_func_strparser(struct strparser *strp,
 	 * any socket yet.
 	 */
 	skb->sk = psock->sock;
-	bpf_compute_data_end(skb);
+	bpf_compute_data_end_sk_skb(skb);
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
 	skb->sk = NULL;
 	rcu_read_unlock();
@@ -487,6 +495,9 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 	int err = -EINVAL;
 	u64 cost;
 
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
@@ -840,6 +851,12 @@ static int sock_map_update_elem(struct bpf_map *map,
 		return -EINVAL;
 	}
 
+	if (skops.sk->sk_type != SOCK_STREAM ||
+	    skops.sk->sk_protocol != IPPROTO_TCP) {
+		fput(socket->file);
+		return -EOPNOTSUPP;
+	}
+
 	err = sock_map_ctx_update_elem(&skops, map, key, flags);
 	fput(socket->file);
 	return err;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b914fbe1383e..c48ca2a34b5e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -653,6 +653,10 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)
 {
 	struct bpf_verifier_state *parent = state->parent;
 
+	if (regno == BPF_REG_FP)
+		/* We don't need to worry about FP liveness because it's read-only */
+		return;
+
 	while (parent) {
 		/* if read wasn't screened by an earlier write ... */
 		if (state->regs[regno].live & REG_LIVE_WRITTEN)
@@ -1112,7 +1116,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		/* ctx accesses must be at a fixed offset, so that we can
 		 * determine what type of data were returned.
 		 */
-		if (!tnum_is_const(reg->var_off)) {
+		if (reg->off) {
+			verbose("dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n",
+				regno, reg->off, off - reg->off);
+			return -EACCES;
+		}
+		if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
 			char tn_buf[48];
 
 			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
@@ -1120,7 +1129,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 				tn_buf, off, size);
 			return -EACCES;
 		}
-		off += reg->var_off.value;
 		err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
 		if (!err && t == BPF_READ && value_regno >= 0) {
 			/* ctx access returns either a scalar, or a
@@ -2345,6 +2353,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			 * copy register state to dest reg
 			 */
 			regs[insn->dst_reg] = regs[insn->src_reg];
+			regs[insn->dst_reg].live |= REG_LIVE_WRITTEN;
 		} else {
 			/* R1 = (u32) R2 */
 			if (is_pointer_value(env, insn->src_reg)) {
@@ -2421,12 +2430,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 }
 
 static void find_good_pkt_pointers(struct bpf_verifier_state *state,
-				   struct bpf_reg_state *dst_reg)
+				   struct bpf_reg_state *dst_reg,
+				   bool range_right_open)
 {
 	struct bpf_reg_state *regs = state->regs, *reg;
+	u16 new_range;
 	int i;
 
-	if (dst_reg->off < 0)
+	if (dst_reg->off < 0 ||
+	    (dst_reg->off == 0 && range_right_open))
 		/* This doesn't give us any range */
 		return;
 
@@ -2437,9 +2449,13 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 		 */
 		return;
 
-	/* LLVM can generate four kind of checks:
+	new_range = dst_reg->off;
+	if (range_right_open)
+		new_range--;
+
+	/* Examples for register markings:
 	 *
-	 * Type 1/2:
+	 * pkt_data in dst register:
 	 *
 	 *   r2 = r3;
 	 *   r2 += 8;
@@ -2456,7 +2472,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 	 *   r2=pkt(id=n,off=8,r=0)
 	 *   r3=pkt(id=n,off=0,r=0)
 	 *
-	 * Type 3/4:
+	 * pkt_data in src register:
 	 *
 	 *   r2 = r3;
 	 *   r2 += 8;
@@ -2474,7 +2490,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 	 *   r3=pkt(id=n,off=0,r=0)
 	 *
 	 * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8)
-	 * so that range of bytes [r3, r3 + 8) is safe to access.
+	 * or r3=pkt(id=n,off=0,r=8-1), so that range of bytes [r3, r3 + 8)
+	 * and [r3, r3 + 8-1) respectively is safe to access depending on
+	 * the check.
 	 */
 
 	/* If our ids match, then we must have the same max_value. And we
@@ -2485,14 +2503,14 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 	for (i = 0; i < MAX_BPF_REG; i++)
 		if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
 			/* keep the maximum range already checked */
-			regs[i].range = max_t(u16, regs[i].range, dst_reg->off);
+			regs[i].range = max(regs[i].range, new_range);
 
 	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
 		if (state->stack_slot_type[i] != STACK_SPILL)
 			continue;
 		reg = &state->spilled_regs[i / BPF_REG_SIZE];
 		if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
-			reg->range = max_t(u16, reg->range, dst_reg->off);
+			reg->range = max(reg->range, new_range);
 	}
 }
 
@@ -2856,19 +2874,43 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
 		   dst_reg->type == PTR_TO_PACKET &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		find_good_pkt_pointers(this_branch, dst_reg);
+		/* pkt_data' > pkt_end */
+		find_good_pkt_pointers(this_branch, dst_reg, false);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
+		   dst_reg->type == PTR_TO_PACKET_END &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET) {
+		/* pkt_end > pkt_data' */
+		find_good_pkt_pointers(other_branch, &regs[insn->src_reg], true);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
 		   dst_reg->type == PTR_TO_PACKET &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		find_good_pkt_pointers(other_branch, dst_reg);
+		/* pkt_data' < pkt_end */
+		find_good_pkt_pointers(other_branch, dst_reg, true);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
+		   dst_reg->type == PTR_TO_PACKET_END &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET) {
+		/* pkt_end < pkt_data' */
+		find_good_pkt_pointers(this_branch, &regs[insn->src_reg], false);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
+		   dst_reg->type == PTR_TO_PACKET &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
+		/* pkt_data' >= pkt_end */
+		find_good_pkt_pointers(this_branch, dst_reg, true);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
 		   dst_reg->type == PTR_TO_PACKET_END &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		find_good_pkt_pointers(other_branch, &regs[insn->src_reg]);
+		/* pkt_end >= pkt_data' */
+		find_good_pkt_pointers(other_branch, &regs[insn->src_reg], false);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
+		   dst_reg->type == PTR_TO_PACKET &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
+		/* pkt_data' <= pkt_end */
+		find_good_pkt_pointers(other_branch, dst_reg, false);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
 		   dst_reg->type == PTR_TO_PACKET_END &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		find_good_pkt_pointers(this_branch, &regs[insn->src_reg]);
+		/* pkt_end <= pkt_data' */
+		find_good_pkt_pointers(this_branch, &regs[insn->src_reg], true);
 	} else if (is_pointer_value(env, insn->dst_reg)) {
 		verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
 		return -EACCES;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d851df22f5c5..04892a82f6ac 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -632,6 +632,11 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
 		__cpuhp_kick_ap(st);
 	}
 
+	/*
+	 * Clean up the leftovers so the next hotplug operation wont use stale
+	 * data.
+	 */
+	st->node = st->last = NULL;
 	return ret;
 }
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6bc21e202ae4..9d93db81fa36 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -662,7 +662,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
 	/*
 	 * Do not update time when cgroup is not active
 	 */
-	if (cgrp == event->cgrp)
+	if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
 		__update_cgrp_time(event->cgrp);
 }
 
@@ -8955,6 +8955,14 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
 
 static void free_pmu_context(struct pmu *pmu)
 {
+	/*
+	 * Static contexts such as perf_sw_context have a global lifetime
+	 * and may be shared between different PMUs. Avoid freeing them
+	 * when a single PMU is going away.
+	 */
+	if (pmu->task_ctx_nr > perf_invalid_context)
+		return;
+
 	mutex_lock(&pmus_lock);
 	free_percpu(pmu->pmu_cpu_context);
 	mutex_unlock(&pmus_lock);
diff --git a/kernel/exit.c b/kernel/exit.c
index f2cd53e92147..f6cad39f35df 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1610,6 +1610,9 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 	if (!infop)
 		return err;
 
+	if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
+		return -EFAULT;
+
 	user_access_begin();
 	unsafe_put_user(signo, &infop->si_signo, Efault);
 	unsafe_put_user(0, &infop->si_errno, Efault);
@@ -1735,6 +1738,9 @@ COMPAT_SYSCALL_DEFINE5(waitid,
 	if (!infop)
 		return err;
 
+	if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
+		return -EFAULT;
+
 	user_access_begin();
 	unsafe_put_user(signo, &infop->si_signo, Efault);
 	unsafe_put_user(0, &infop->si_errno, Efault);
diff --git a/kernel/fork.c b/kernel/fork.c
index e702cb9ffbd8..07cc743698d3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -215,6 +215,10 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 		if (!s)
 			continue;
 
+#ifdef CONFIG_DEBUG_KMEMLEAK
+		/* Clear stale pointers from reused stack. */
+		memset(s->addr, 0, THREAD_SIZE);
+#endif
 		tsk->stack_vm_area = s;
 		return s->addr;
 	}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6fc89fd93824..5a2ef92c2782 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -265,8 +265,8 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
 		irq_setup_affinity(desc);
 		break;
 	case IRQ_STARTUP_MANAGED:
+		irq_do_set_affinity(d, aff, false);
 		ret = __irq_startup(desc);
-		irq_set_affinity_locked(d, aff, false);
 		break;
 	case IRQ_STARTUP_ABORT:
 		return 0;
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 638eb9c83d9f..9eb09aef0313 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -18,8 +18,34 @@
 static inline bool irq_needs_fixup(struct irq_data *d)
 {
 	const struct cpumask *m = irq_data_get_effective_affinity_mask(d);
+	unsigned int cpu = smp_processor_id();
 
-	return cpumask_test_cpu(smp_processor_id(), m);
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+	/*
+	 * The cpumask_empty() check is a workaround for interrupt chips,
+	 * which do not implement effective affinity, but the architecture has
+	 * enabled the config switch. Use the general affinity mask instead.
+	 */
+	if (cpumask_empty(m))
+		m = irq_data_get_affinity_mask(d);
+
+	/*
+	 * Sanity check. If the mask is not empty when excluding the outgoing
+	 * CPU then it must contain at least one online CPU. The outgoing CPU
+	 * has been removed from the online mask already.
+	 */
+	if (cpumask_any_but(m, cpu) < nr_cpu_ids &&
+	    cpumask_any_and(m, cpu_online_mask) >= nr_cpu_ids) {
+		/*
+		 * If this happens then there was a missed IRQ fixup at some
+		 * point. Warn about it and enforce fixup.
+		 */
+		pr_warn("Eff. affinity %*pbl of IRQ %u contains only offline CPUs after offlining CPU %u\n",
+			cpumask_pr_args(m), d->irq, cpu);
+		return true;
+	}
+#endif
+	return cpumask_test_cpu(cpu, m);
 }
 
 static bool migrate_one_irq(struct irq_desc *desc)
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 5270a54b9fa4..c26c5bb6b491 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -135,17 +135,26 @@ void irq_gc_ack_clr_bit(struct irq_data *d)
 }
 
 /**
- * irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt
+ * irq_gc_mask_disable_and_ack_set - Mask and ack pending interrupt
  * @d: irq_data
+ *
+ * This generic implementation of the irq_mask_ack method is for chips
+ * with separate enable/disable registers instead of a single mask
+ * register and where a pending interrupt is acknowledged by setting a
+ * bit.
+ *
+ * Note: This is the only permutation currently used. Similar generic
+ * functions should be added here if other permutations are required.
  */
-void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
+void irq_gc_mask_disable_and_ack_set(struct irq_data *d)
 {
 	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
 	struct irq_chip_type *ct = irq_data_get_chip_type(d);
 	u32 mask = d->mask;
 
 	irq_gc_lock(gc);
-	irq_reg_writel(gc, mask, ct->regs.mask);
+	irq_reg_writel(gc, mask, ct->regs.disable);
+	*ct->mask_cache &= ~mask;
 	irq_reg_writel(gc, mask, ct->regs.ack);
 	irq_gc_unlock(gc);
 }
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d00132b5c325..4bff6a10ae8e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -168,6 +168,19 @@ void irq_set_thread_affinity(struct irq_desc *desc)
 		set_bit(IRQTF_AFFINITY, &action->thread_flags);
 }
 
+static void irq_validate_effective_affinity(struct irq_data *data)
+{
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+	const struct cpumask *m = irq_data_get_effective_affinity_mask(data);
+	struct irq_chip *chip = irq_data_get_irq_chip(data);
+
+	if (!cpumask_empty(m))
+		return;
+	pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n",
+		     chip->name, data->irq);
+#endif
+}
+
 int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
 			bool force)
 {
@@ -175,12 +188,16 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
 	struct irq_chip *chip = irq_data_get_irq_chip(data);
 	int ret;
 
+	if (!chip || !chip->irq_set_affinity)
+		return -EINVAL;
+
 	ret = chip->irq_set_affinity(data, mask, force);
 	switch (ret) {
 	case IRQ_SET_MASK_OK:
 	case IRQ_SET_MASK_OK_DONE:
 		cpumask_copy(desc->irq_common_data.affinity, mask);
 	case IRQ_SET_MASK_OK_NOCOPY:
+		irq_validate_effective_affinity(data);
 		irq_set_thread_affinity(desc);
 		ret = 0;
 	}
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index b9628e43c78f..bf8c8fd72589 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -830,6 +830,41 @@ int klp_register_patch(struct klp_patch *patch)
 }
 EXPORT_SYMBOL_GPL(klp_register_patch);
 
+/*
+ * Remove parts of patches that touch a given kernel module. The list of
+ * patches processed might be limited. When limit is NULL, all patches
+ * will be handled.
+ */
+static void klp_cleanup_module_patches_limited(struct module *mod,
+					       struct klp_patch *limit)
+{
+	struct klp_patch *patch;
+	struct klp_object *obj;
+
+	list_for_each_entry(patch, &klp_patches, list) {
+		if (patch == limit)
+			break;
+
+		klp_for_each_object(patch, obj) {
+			if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
+				continue;
+
+			/*
+			 * Only unpatch the module if the patch is enabled or
+			 * is in transition.
+			 */
+			if (patch->enabled || patch == klp_transition_patch) {
+				pr_notice("reverting patch '%s' on unloading module '%s'\n",
+					  patch->mod->name, obj->mod->name);
+				klp_unpatch_object(obj);
+			}
+
+			klp_free_object_loaded(obj);
+			break;
+		}
+	}
+}
+
 int klp_module_coming(struct module *mod)
 {
 	int ret;
@@ -894,7 +929,7 @@ err:
 	pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n",
 		patch->mod->name, obj->mod->name, obj->mod->name);
 	mod->klp_alive = false;
-	klp_free_object_loaded(obj);
+	klp_cleanup_module_patches_limited(mod, patch);
 	mutex_unlock(&klp_mutex);
 
 	return ret;
@@ -902,9 +937,6 @@ err:
 
 void klp_module_going(struct module *mod)
 {
-	struct klp_patch *patch;
-	struct klp_object *obj;
-
 	if (WARN_ON(mod->state != MODULE_STATE_GOING &&
 		    mod->state != MODULE_STATE_COMING))
 		return;
@@ -917,25 +949,7 @@ void klp_module_going(struct module *mod)
 	 */
 	mod->klp_alive = false;
 
-	list_for_each_entry(patch, &klp_patches, list) {
-		klp_for_each_object(patch, obj) {
-			if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
-				continue;
-
-			/*
-			 * Only unpatch the module if the patch is enabled or
-			 * is in transition.
-			 */
-			if (patch->enabled || patch == klp_transition_patch) {
-				pr_notice("reverting patch '%s' on unloading module '%s'\n",
-					  patch->mod->name, obj->mod->name);
-				klp_unpatch_object(obj);
-			}
-
-			klp_free_object_loaded(obj);
-			break;
-		}
-	}
+	klp_cleanup_module_patches_limited(mod, NULL);
 
 	mutex_unlock(&klp_mutex);
 }
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 44c8d0d17170..e36e652d996f 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1873,10 +1873,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	       struct held_lock *next, int distance, struct stack_trace *trace,
 	       int (*save)(struct stack_trace *trace))
 {
+	struct lock_list *uninitialized_var(target_entry);
 	struct lock_list *entry;
-	int ret;
 	struct lock_list this;
-	struct lock_list *uninitialized_var(target_entry);
+	int ret;
 
 	/*
 	 * Prove that the new <prev> -> <next> dependency would not
@@ -1890,8 +1890,17 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	this.class = hlock_class(next);
 	this.parent = NULL;
 	ret = check_noncircular(&this, hlock_class(prev), &target_entry);
-	if (unlikely(!ret))
+	if (unlikely(!ret)) {
+		if (!trace->entries) {
+			/*
+			 * If @save fails here, the printing might trigger
+			 * a WARN but because of the !nr_entries it should
+			 * not do bad things.
+			 */
+			save(trace);
+		}
 		return print_circular_bug(&this, target_entry, next, prev, trace);
+	}
 	else if (unlikely(ret < 0))
 		return print_bfs_bug(ret);
 
@@ -1938,7 +1947,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 		return print_bfs_bug(ret);
 
 
-	if (save && !save(trace))
+	if (!trace->entries && !save(trace))
 		return 0;
 
 	/*
@@ -1958,20 +1967,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	if (!ret)
 		return 0;
 
-	/*
-	 * Debugging printouts:
-	 */
-	if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
-		graph_unlock();
-		printk("\n new dependency: ");
-		print_lock_name(hlock_class(prev));
-		printk(KERN_CONT " => ");
-		print_lock_name(hlock_class(next));
-		printk(KERN_CONT "\n");
-		dump_stack();
-		if (!graph_lock())
-			return 0;
-	}
 	return 2;
 }
 
@@ -1986,8 +1981,12 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
 {
 	int depth = curr->lockdep_depth;
 	struct held_lock *hlock;
-	struct stack_trace trace;
-	int (*save)(struct stack_trace *trace) = save_trace;
+	struct stack_trace trace = {
+		.nr_entries = 0,
+		.max_entries = 0,
+		.entries = NULL,
+		.skip = 0,
+	};
 
 	/*
 	 * Debugging checks.
@@ -2018,18 +2017,11 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
 		 */
 		if (hlock->read != 2 && hlock->check) {
 			int ret = check_prev_add(curr, hlock, next,
-						 distance, &trace, save);
+						 distance, &trace, save_trace);
 			if (!ret)
 				return 0;
 
 			/*
-			 * Stop saving stack_trace if save_trace() was
-			 * called at least once:
-			 */
-			if (save && ret == 2)
-				save = NULL;
-
-			/*
 			 * Stop after the first non-trylock entry,
 			 * as non-trylock entries have added their
 			 * own direct dependencies already, so this
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 729a8706751d..6d5880089ff6 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -854,7 +854,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
 /**
  * call_srcu() - Queue a callback for invocation after an SRCU grace period
  * @sp: srcu_struct in queue the callback
- * @head: structure to be used for queueing the SRCU callback.
+ * @rhp: structure to be used for queueing the SRCU callback.
  * @func: function to be invoked after the SRCU grace period
  *
  * The callback function will be invoked some time after a full SRCU
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 50d1861f7759..3f943efcf61c 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -85,6 +85,9 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
 }
 
 /**
+ * rcu_sync_enter_start - Force readers onto slow path for multiple updates
+ * @rsp: Pointer to rcu_sync structure to use for synchronization
+ *
  * Must be called after rcu_sync_init() and before first use.
  *
  * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}()
@@ -142,7 +145,7 @@ void rcu_sync_enter(struct rcu_sync *rsp)
 
 /**
  * rcu_sync_func() - Callback function managing reader access to fastpath
- * @rsp: Pointer to rcu_sync structure to use for synchronization
+ * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization
  *
  * This function is passed to one of the call_rcu() functions by
  * rcu_sync_exit(), so that it is invoked after a grace period following the
@@ -158,9 +161,9 @@ void rcu_sync_enter(struct rcu_sync *rsp)
  * rcu_sync_exit(). Otherwise, set all state back to idle so that readers
  * can again use their fastpaths.
  */
-static void rcu_sync_func(struct rcu_head *rcu)
+static void rcu_sync_func(struct rcu_head *rhp)
 {
-	struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head);
+	struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head);
 	unsigned long flags;
 
 	BUG_ON(rsp->gp_state != GP_PASSED);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b0ad62b0e7b8..3e3650e94ae6 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3097,9 +3097,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
  * read-side critical sections have completed. call_rcu_sched() assumes
  * that the read-side critical sections end on enabling of preemption
  * or on voluntary preemption.
- * RCU read-side critical sections are delimited by :
- * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR
- * - anything that disables preemption.
+ * RCU read-side critical sections are delimited by:
+ *
+ * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR
+ * - anything that disables preemption.
  *
  * These may be nested.
  *
@@ -3124,11 +3125,12 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
  * handler. This means that read-side critical sections in process
  * context must not be interrupted by softirqs. This interface is to be
  * used when most of the read-side critical sections are in softirq context.
- * RCU read-side critical sections are delimited by :
- * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context.
- * OR
+ * RCU read-side critical sections are delimited by:
+ *
+ * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context, OR
  * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
- * These may be nested.
+ *
+ * These may be nested.
  *
  * See the description of call_rcu() for more detailed information on
  * memory ordering guarantees.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 70ba32e08a23..d3f3094856fe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5356,91 +5356,62 @@ static int wake_wide(struct task_struct *p)
 	return 1;
 }
 
-struct llc_stats {
-	unsigned long nr_running;
-	unsigned long load;
-	unsigned long capacity;
-	int has_capacity;
-};
+/*
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
+ *
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
+ *			  will be) idle.
+ *
+ * wake_affine_weight() - considers the weight to reflect the average
+ *			  scheduling latency of the CPUs. This seems to work
+ *			  for the overloaded case.
+ */
 
-static bool get_llc_stats(struct llc_stats *stats, int cpu)
-{
-	struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-
-	if (!sds)
-		return false;
+static bool
+wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
+		 int this_cpu, int prev_cpu, int sync)
+{
+	if (idle_cpu(this_cpu))
+		return true;
 
-	stats->nr_running = READ_ONCE(sds->nr_running);
-	stats->load = READ_ONCE(sds->load);
-	stats->capacity = READ_ONCE(sds->capacity);
-	stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu);
+	if (sync && cpu_rq(this_cpu)->nr_running == 1)
+		return true;
 
-	return true;
+	return false;
 }
 
-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- *
- * Since we're running on 'stale' values, we might in fact create an imbalance
- * but recomputing these values is expensive, as that'd mean iteration 2 cache
- * domains worth of CPUs.
- */
 static bool
-wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
-		int this_cpu, int prev_cpu, int sync)
+wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
+		   int this_cpu, int prev_cpu, int sync)
 {
-	struct llc_stats prev_stats, this_stats;
 	s64 this_eff_load, prev_eff_load;
 	unsigned long task_load;
 
-	if (!get_llc_stats(&prev_stats, prev_cpu) ||
-	    !get_llc_stats(&this_stats, this_cpu))
-		return false;
+	this_eff_load = target_load(this_cpu, sd->wake_idx);
+	prev_eff_load = source_load(prev_cpu, sd->wake_idx);
 
-	/*
-	 * If sync wakeup then subtract the (maximum possible)
-	 * effect of the currently running task from the load
-	 * of the current LLC.
-	 */
 	if (sync) {
 		unsigned long current_load = task_h_load(current);
 
-		/* in this case load hits 0 and this LLC is considered 'idle' */
-		if (current_load > this_stats.load)
+		if (current_load > this_eff_load)
 			return true;
 
-		this_stats.load -= current_load;
+		this_eff_load -= current_load;
 	}
 
-	/*
-	 * The has_capacity stuff is not SMT aware, but by trying to balance
-	 * the nr_running on both ends we try and fill the domain at equal
-	 * rates, thereby first consuming cores before siblings.
-	 */
-
-	/* if the old cache has capacity, stay there */
-	if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
-		return false;
-
-	/* if this cache has capacity, come here */
-	if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
-		return true;
-
-	/*
-	 * Check to see if we can move the load without causing too much
-	 * imbalance.
-	 */
 	task_load = task_h_load(p);
 
-	this_eff_load = 100;
-	this_eff_load *= prev_stats.capacity;
-
-	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-	prev_eff_load *= this_stats.capacity;
+	this_eff_load += task_load;
+	if (sched_feat(WA_BIAS))
+		this_eff_load *= 100;
+	this_eff_load *= capacity_of(prev_cpu);
 
-	this_eff_load *= this_stats.load + task_load;
-	prev_eff_load *= prev_stats.load - task_load;
+	prev_eff_load -= task_load;
+	if (sched_feat(WA_BIAS))
+		prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
+	prev_eff_load *= capacity_of(this_cpu);
 
 	return this_eff_load <= prev_eff_load;
 }
@@ -5449,22 +5420,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 		       int prev_cpu, int sync)
 {
 	int this_cpu = smp_processor_id();
-	bool affine;
+	bool affine = false;
 
-	/*
-	 * Default to no affine wakeups; wake_affine() should not effect a task
-	 * placement the load-balancer feels inclined to undo. The conservative
-	 * option is therefore to not move tasks when they wake up.
-	 */
-	affine = false;
+	if (sched_feat(WA_IDLE) && !affine)
+		affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
 
-	/*
-	 * If the wakeup is across cache domains, try to evaluate if movement
-	 * makes sense, otherwise rely on select_idle_siblings() to do
-	 * placement inside the cache domain.
-	 */
-	if (!cpus_share_cache(prev_cpu, this_cpu))
-		affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
+	if (sched_feat(WA_WEIGHT) && !affine)
+		affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
 
 	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
 	if (affine) {
@@ -7600,7 +7562,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
-	struct sched_domain_shared *shared = env->sd->shared;
 	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats *local = &sds->local_stat;
@@ -7672,22 +7633,6 @@ next_group:
 		if (env->dst_rq->rd->overload != overload)
 			env->dst_rq->rd->overload = overload;
 	}
-
-	if (!shared)
-		return;
-
-	/*
-	 * Since these are sums over groups they can contain some CPUs
-	 * multiple times for the NUMA domains.
-	 *
-	 * Currently only wake_affine_llc() and find_busiest_group()
-	 * uses these numbers, only the last is affected by this problem.
-	 *
-	 * XXX fix that.
-	 */
-	WRITE_ONCE(shared->nr_running, sds->total_running);
-	WRITE_ONCE(shared->load, sds->total_load);
-	WRITE_ONCE(shared->capacity, sds->total_capacity);
 }
 
 /**
@@ -8098,6 +8043,13 @@ static int should_we_balance(struct lb_env *env)
 	int cpu, balance_cpu = -1;
 
 	/*
+	 * Ensure the balancing environment is consistent; can happen
+	 * when the softirq triggers 'during' hotplug.
+	 */
+	if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
+		return 0;
+
+	/*
 	 * In the newly idle case, we will allow all the cpu's
 	 * to do the newly idle load balance.
 	 */
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index d3fb15555291..319ed0e8a347 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -81,3 +81,6 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
+SCHED_FEAT(WA_IDLE, true)
+SCHED_FEAT(WA_WEIGHT, true)
+SCHED_FEAT(WA_BIAS, true)
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index a92fddc22747..dd7908743dab 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -18,6 +18,7 @@
 #include <linux/membarrier.h>
 #include <linux/tick.h>
 #include <linux/cpumask.h>
+#include <linux/atomic.h>
 
 #include "sched.h"	/* for cpu_rq(). */
 
@@ -26,21 +27,26 @@
  * except MEMBARRIER_CMD_QUERY.
  */
 #define MEMBARRIER_CMD_BITMASK	\
-	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED)
+	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED	\
+	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)
 
 static void ipi_mb(void *info)
 {
 	smp_mb();	/* IPIs should be serializing but paranoid. */
 }
 
-static void membarrier_private_expedited(void)
+static int membarrier_private_expedited(void)
 {
 	int cpu;
 	bool fallback = false;
 	cpumask_var_t tmpmask;
 
+	if (!(atomic_read(&current->mm->membarrier_state)
+			& MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
+		return -EPERM;
+
 	if (num_online_cpus() == 1)
-		return;
+		return 0;
 
 	/*
 	 * Matches memory barriers around rq->curr modification in
@@ -94,6 +100,24 @@ static void membarrier_private_expedited(void)
 	 * rq->curr modification in scheduler.
 	 */
 	smp_mb();	/* exit from system call is not a mb */
+	return 0;
+}
+
+static void membarrier_register_private_expedited(void)
+{
+	struct task_struct *p = current;
+	struct mm_struct *mm = p->mm;
+
+	/*
+	 * We need to consider threads belonging to different thread
+	 * groups, which use the same mm. (CLONE_VM but not
+	 * CLONE_THREAD).
+	 */
+	if (atomic_read(&mm->membarrier_state)
+			& MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)
+		return;
+	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+		  &mm->membarrier_state);
 }
 
 /**
@@ -144,7 +168,9 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
 		synchronize_sched();
 		return 0;
 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
-		membarrier_private_expedited();
+		return membarrier_private_expedited();
+	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
+		membarrier_register_private_expedited();
 		return 0;
 	default:
 		return -EINVAL;
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index bb3a38005b9c..0ae832e13b97 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -473,7 +473,7 @@ static long seccomp_attach_filter(unsigned int flags,
 	return 0;
 }
 
-void __get_seccomp_filter(struct seccomp_filter *filter)
+static void __get_seccomp_filter(struct seccomp_filter *filter)
 {
 	/* Reference count is bounded by the number of total processes. */
 	refcount_inc(&filter->usage);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 64d0edf428f8..a2dccfe1acec 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -68,6 +68,7 @@ enum {
 	 * attach_mutex to avoid changing binding state while
 	 * worker_attach_to_pool() is in progress.
 	 */
+	POOL_MANAGER_ACTIVE	= 1 << 0,	/* being managed */
 	POOL_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */
 
 	/* worker flags */
@@ -165,7 +166,6 @@ struct worker_pool {
 						/* L: hash of busy workers */
 
 	/* see manage_workers() for details on the two manager mutexes */
-	struct mutex		manager_arb;	/* manager arbitration */
 	struct worker		*manager;	/* L: purely informational */
 	struct mutex		attach_mutex;	/* attach/detach exclusion */
 	struct list_head	workers;	/* A: attached workers */
@@ -299,6 +299,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
 
 static DEFINE_MUTEX(wq_pool_mutex);	/* protects pools and workqueues list */
 static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
+static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */
 
 static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
 static bool workqueue_freezing;		/* PL: have wqs started freezing? */
@@ -801,7 +802,7 @@ static bool need_to_create_worker(struct worker_pool *pool)
 /* Do we have too many workers and should some go away? */
 static bool too_many_workers(struct worker_pool *pool)
 {
-	bool managing = mutex_is_locked(&pool->manager_arb);
+	bool managing = pool->flags & POOL_MANAGER_ACTIVE;
 	int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
 	int nr_busy = pool->nr_workers - nr_idle;
 
@@ -1980,24 +1981,17 @@ static bool manage_workers(struct worker *worker)
 {
 	struct worker_pool *pool = worker->pool;
 
-	/*
-	 * Anyone who successfully grabs manager_arb wins the arbitration
-	 * and becomes the manager. mutex_trylock() on pool->manager_arb
-	 * failure while holding pool->lock reliably indicates that someone
-	 * else is managing the pool and the worker which failed trylock
-	 * can proceed to executing work items. This means that anyone
-	 * grabbing manager_arb is responsible for actually performing
-	 * manager duties. If manager_arb is grabbed and released without
-	 * actual management, the pool may stall indefinitely.
-	 */
-	if (!mutex_trylock(&pool->manager_arb))
+	if (pool->flags & POOL_MANAGER_ACTIVE)
 		return false;
+
+	pool->flags |= POOL_MANAGER_ACTIVE;
 	pool->manager = worker;
 
 	maybe_create_worker(pool);
 
 	pool->manager = NULL;
-	mutex_unlock(&pool->manager_arb);
+	pool->flags &= ~POOL_MANAGER_ACTIVE;
+	wake_up(&wq_manager_wait);
 	return true;
 }
 
@@ -3248,7 +3242,6 @@ static int init_worker_pool(struct worker_pool *pool)
 	setup_timer(&pool->mayday_timer, pool_mayday_timeout,
 		    (unsigned long)pool);
 
-	mutex_init(&pool->manager_arb);
 	mutex_init(&pool->attach_mutex);
 	INIT_LIST_HEAD(&pool->workers);
 
@@ -3318,13 +3311,15 @@ static void put_unbound_pool(struct worker_pool *pool)
 	hash_del(&pool->hash_node);
 
 	/*
-	 * Become the manager and destroy all workers. Grabbing
-	 * manager_arb prevents @pool's workers from blocking on
-	 * attach_mutex.
+	 * Become the manager and destroy all workers. This prevents
+	 * @pool's workers from blocking on attach_mutex. We're the last
+	 * manager and @pool gets freed with the flag set.
 	 */
-	mutex_lock(&pool->manager_arb);
-
 	spin_lock_irq(&pool->lock);
+	wait_event_lock_irq(wq_manager_wait,
+			    !(pool->flags & POOL_MANAGER_ACTIVE), pool->lock);
+	pool->flags |= POOL_MANAGER_ACTIVE;
+
 	while ((worker = first_idle_worker(pool)))
 		destroy_worker(worker);
 	WARN_ON(pool->nr_workers || pool->nr_idle);
@@ -3338,8 +3333,6 @@ static void put_unbound_pool(struct worker_pool *pool)
 	if (pool->detach_completion)
 		wait_for_completion(pool->detach_completion);
 
-	mutex_unlock(&pool->manager_arb);
-
 	/* shut down the timers */
 	del_timer_sync(&pool->idle_timer);
 	del_timer_sync(&pool->mayday_timer);