Diffstat (limited to 'kernel')

 kernel/bpf/arraymap.c     |   2
 kernel/bpf/devmap.c       |  10
 kernel/bpf/hashtab.c      |   4
 kernel/bpf/inode.c        |   1
 kernel/bpf/sockmap.c      |  43
 kernel/bpf/verifier.c     |  70
 kernel/cpu.c              |   5
 kernel/events/core.c      |  10
 kernel/exit.c             |   6
 kernel/fork.c             |   4
 kernel/irq/chip.c         |   2
 kernel/irq/cpuhotplug.c   |  28
 kernel/irq/generic-chip.c |  15
 kernel/irq/manage.c       |  17
 kernel/livepatch/core.c   |  60
 kernel/locking/lockdep.c  |  48
 kernel/rcu/srcutree.c     |   2
 kernel/rcu/sync.c         |   9
 kernel/rcu/tree.c         |  18
 kernel/sched/fair.c       | 140
 kernel/sched/features.h   |   3
 kernel/sched/membarrier.c |  34
 kernel/seccomp.c          |   2
 kernel/workqueue.c        |  37
 24 files changed, 345 insertions(+), 225 deletions(-)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 98c0f00c3f5e..e2636737b69b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -98,7 +98,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 		array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();

 	if (array_size >= U32_MAX - PAGE_SIZE ||
-	    elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
+	    bpf_array_alloc_percpu(array)) {
 		bpf_map_area_free(array);
 		return ERR_PTR(-ENOMEM);
 	}
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index e093d9a2c4dd..e745d6a88224 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -69,7 +69,7 @@ static LIST_HEAD(dev_map_list);

 static u64 dev_map_bitmap_size(const union bpf_attr *attr)
 {
-	return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
+	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
 }

 static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
@@ -78,6 +78,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	int err = -EINVAL;
 	u64 cost;

+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
@@ -111,8 +114,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	err = -ENOMEM;

 	/* A per cpu bitfield with a bit per possible net device */
-	dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr),
-					    __alignof__(unsigned long));
+	dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
+						__alignof__(unsigned long),
+						GFP_KERNEL | __GFP_NOWARN);
 	if (!dtab->flush_needed)
 		goto free_dtab;

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 431126f31ea3..6533f08d1238 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -317,10 +317,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		 */
 		goto free_htab;

-	if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
-		/* make sure the size for pcpu_alloc() is reasonable */
-		goto free_htab;
-
 	htab->elem_size = sizeof(struct htab_elem) +
 			  round_up(htab->map.key_size, 8);
 	if (percpu)
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index e833ed914358..be1dde967208 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -363,6 +363,7 @@ out:
 	putname(pname);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(bpf_obj_get_user);

 static void bpf_evict_inode(struct inode *inode)
 {
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 6424ce0e4969..66f00a2b27f4 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -39,6 +39,7 @@
 #include <linux/workqueue.h>
 #include <linux/list.h>
 #include <net/strparser.h>
+#include <net/tcp.h>

 struct bpf_stab {
 	struct bpf_map map;
@@ -92,6 +93,14 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
 	return rcu_dereference_sk_user_data(sk);
 }

+/* compute the linear packet data range [data, data_end) for skb when
+ * sk_skb type programs are in use.
+ */
+static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
+{
+	TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
+}
+
 static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 {
 	struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
@@ -101,12 +110,20 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 		return SK_DROP;

 	skb_orphan(skb);
+	/* We need to ensure that BPF metadata for maps is also cleared
+	 * when we orphan the skb so that we don't have the possibility
+	 * to reference a stale map.
+	 */
+	TCP_SKB_CB(skb)->bpf.map = NULL;
 	skb->sk = psock->sock;
-	bpf_compute_data_end(skb);
+	bpf_compute_data_end_sk_skb(skb);
+	preempt_disable();
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
+	preempt_enable();
 	skb->sk = NULL;

-	return rc;
+	return rc == SK_PASS ?
+		(TCP_SKB_CB(skb)->bpf.map ? SK_REDIRECT : SK_PASS) : SK_DROP;
 }

 static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
@@ -114,17 +131,10 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
 	struct sock *sk;
 	int rc;

-	/* Because we use per cpu values to feed input from sock redirect
-	 * in BPF program to do_sk_redirect_map() call we need to ensure we
-	 * are not preempted. RCU read lock is not sufficient in this case
-	 * with CONFIG_PREEMPT_RCU enabled so we must be explicit here.
-	 */
-	preempt_disable();
 	rc = smap_verdict_func(psock, skb);
 	switch (rc) {
 	case SK_REDIRECT:
-		sk = do_sk_redirect_map();
-		preempt_enable();
+		sk = do_sk_redirect_map(skb);
 		if (likely(sk)) {
 			struct smap_psock *peer = smap_psock_sk(sk);

@@ -141,8 +151,6 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
 		/* Fall through and free skb otherwise */
 	case SK_DROP:
 	default:
-		if (rc != SK_REDIRECT)
-			preempt_enable();
 		kfree_skb(skb);
 	}
 }
@@ -369,7 +377,7 @@ static int smap_parse_func_strparser(struct strparser *strp,
 	 * any socket yet.
 	 */
 	skb->sk = psock->sock;
-	bpf_compute_data_end(skb);
+	bpf_compute_data_end_sk_skb(skb);
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
 	skb->sk = NULL;
 	rcu_read_unlock();
@@ -487,6 +495,9 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 	int err = -EINVAL;
 	u64 cost;

+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
@@ -840,6 +851,12 @@ static int sock_map_update_elem(struct bpf_map *map,
 		return -EINVAL;
 	}

+	if (skops.sk->sk_type != SOCK_STREAM ||
+	    skops.sk->sk_protocol != IPPROTO_TCP) {
+		fput(socket->file);
+		return -EOPNOTSUPP;
+	}
+
 	err = sock_map_ctx_update_elem(&skops, map, key, flags);
 	fput(socket->file);
 	return err;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b914fbe1383e..c48ca2a34b5e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -653,6 +653,10 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)
 {
 	struct bpf_verifier_state *parent = state->parent;

+	if (regno == BPF_REG_FP)
+		/* We don't need to worry about FP liveness because it's read-only */
+		return;
+
 	while (parent) {
 		/* if read wasn't screened by an earlier write ... */
 		if (state->regs[regno].live & REG_LIVE_WRITTEN)
@@ -1112,7 +1116,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		/* ctx accesses must be at a fixed offset, so that we can
 		 * determine what type of data were returned.
 		 */
-		if (!tnum_is_const(reg->var_off)) {
+		if (reg->off) {
+			verbose("dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n",
+				regno, reg->off, off - reg->off);
+			return -EACCES;
+		}
+		if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
 			char tn_buf[48];

 			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
@@ -1120,7 +1129,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 				tn_buf, off, size);
 			return -EACCES;
 		}
-		off += reg->var_off.value;
 		err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
 		if (!err && t == BPF_READ && value_regno >= 0) {
 			/* ctx access returns either a scalar, or a
@@ -2345,6 +2353,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			 * copy register state to dest reg
 			 */
 			regs[insn->dst_reg] = regs[insn->src_reg];
+			regs[insn->dst_reg].live |= REG_LIVE_WRITTEN;
 		} else {
 			/* R1 = (u32) R2 */
 			if (is_pointer_value(env, insn->src_reg)) {
@@ -2421,12 +2430,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 }

 static void find_good_pkt_pointers(struct bpf_verifier_state *state,
-				   struct bpf_reg_state *dst_reg)
+				   struct bpf_reg_state *dst_reg,
+				   bool range_right_open)
 {
 	struct bpf_reg_state *regs = state->regs, *reg;
+	u16 new_range;
 	int i;

-	if (dst_reg->off < 0)
+	if (dst_reg->off < 0 ||
+	    (dst_reg->off == 0 && range_right_open))
 		/* This doesn't give us any range */
 		return;

@@ -2437,9 +2449,13 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 		 */
 		return;

-	/* LLVM can generate four kind of checks:
+	new_range = dst_reg->off;
+	if (range_right_open)
+		new_range--;
+
+	/* Examples for register markings:
 	 *
-	 * Type 1/2:
+	 * pkt_data in dst register:
 	 *
 	 * r2 = r3;
 	 * r2 += 8;
@@ -2456,7 +2472,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 	 * r2=pkt(id=n,off=8,r=0)
 	 * r3=pkt(id=n,off=0,r=0)
 	 *
-	 * Type 3/4:
+	 * pkt_data in src register:
 	 *
 	 * r2 = r3;
 	 * r2 += 8;
@@ -2474,7 +2490,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 	 * r3=pkt(id=n,off=0,r=0)
 	 *
 	 * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8)
-	 * so that range of bytes [r3, r3 + 8) is safe to access.
+	 * or r3=pkt(id=n,off=0,r=8-1), so that range of bytes [r3, r3 + 8)
+	 * and [r3, r3 + 8-1) respectively is safe to access depending on
+	 * the check.
 	 */

 	/* If our ids match, then we must have the same max_value. And we
@@ -2485,14 +2503,14 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 	for (i = 0; i < MAX_BPF_REG; i++)
 		if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
 			/* keep the maximum range already checked */
-			regs[i].range = max_t(u16, regs[i].range, dst_reg->off);
+			regs[i].range = max(regs[i].range, new_range);

 	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
 		if (state->stack_slot_type[i] != STACK_SPILL)
 			continue;
 		reg = &state->spilled_regs[i / BPF_REG_SIZE];
 		if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
-			reg->range = max_t(u16, reg->range, dst_reg->off);
+			reg->range = max(reg->range, new_range);
 	}
 }

@@ -2856,19 +2874,43 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
 		   dst_reg->type == PTR_TO_PACKET &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		find_good_pkt_pointers(this_branch, dst_reg);
+		/* pkt_data' > pkt_end */
+		find_good_pkt_pointers(this_branch, dst_reg, false);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
+		   dst_reg->type == PTR_TO_PACKET_END &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET) {
+		/* pkt_end > pkt_data' */
+		find_good_pkt_pointers(other_branch, &regs[insn->src_reg], true);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
 		   dst_reg->type == PTR_TO_PACKET &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		find_good_pkt_pointers(other_branch, dst_reg);
+		/* pkt_data' < pkt_end */
+		find_good_pkt_pointers(other_branch, dst_reg, true);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
+		   dst_reg->type == PTR_TO_PACKET_END &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET) {
+		/* pkt_end < pkt_data' */
+		find_good_pkt_pointers(this_branch, &regs[insn->src_reg], false);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
+		   dst_reg->type == PTR_TO_PACKET &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
+		/* pkt_data' >= pkt_end */
+		find_good_pkt_pointers(this_branch, dst_reg, true);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
 		   dst_reg->type == PTR_TO_PACKET_END &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		find_good_pkt_pointers(other_branch, &regs[insn->src_reg]);
+		/* pkt_end >= pkt_data' */
+		find_good_pkt_pointers(other_branch, &regs[insn->src_reg], false);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
+		   dst_reg->type == PTR_TO_PACKET &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
+		/* pkt_data' <= pkt_end */
+		find_good_pkt_pointers(other_branch, dst_reg, false);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
 		   dst_reg->type == PTR_TO_PACKET_END &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		find_good_pkt_pointers(this_branch, &regs[insn->src_reg]);
+		/* pkt_end <= pkt_data' */
+		find_good_pkt_pointers(this_branch, &regs[insn->src_reg], true);
 	} else if (is_pointer_value(env, insn->dst_reg)) {
 		verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
 		return -EACCES;
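For orientation, the verifier hunks above are about which comparison shapes mark a safe packet range, and about keeping ctx accesses at a constant offset. A minimal tc-classifier style sketch of the two comparison directions follows; this is illustrative only (assumptions: clang -target bpf, the uapi <linux/bpf.h> and <linux/if_ether.h> headers, and a made-up program/section name), not code from the patch:

```c
#include <linux/bpf.h>
#include <linux/if_ether.h>

/* Direct packet access: bytes in [data, data_end) may only be dereferenced
 * after a bounds check. The hunks above let find_good_pkt_pointers() accept
 * the check with the packet pointer on either side of the comparison and in
 * the >=/<= forms, adjusting for the open/closed end of the range.
 */
__attribute__((section("classifier"), used))
int pkt_range_example(struct __sk_buff *skb)
{
	void *data     = (void *)(long)skb->data;
	void *data_end = (void *)(long)skb->data_end;
	struct ethhdr *eth = data;
	__u16 proto = 0;

	/* Classic shape, "pkt_data' > pkt_end": bail out on the true branch. */
	if (data + sizeof(*eth) > data_end)
		return 0;
	proto = eth->h_proto;

	/* Mirrored shape, "pkt_end >= pkt_data'": access on the true branch. */
	if (data_end >= data + sizeof(*eth))
		proto |= eth->h_proto;

	/* ctx fields such as skb->len are read at a constant offset from the
	 * ctx pointer; adding a variable offset to ctx before the load is what
	 * the new "dereference of modified ctx ptr" check rejects.
	 */
	return (proto || skb->len) ? 0 : 1;
}
```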
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d851df22f5c5..04892a82f6ac 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -632,6 +632,11 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
 		__cpuhp_kick_ap(st);
 	}

+	/*
+	 * Clean up the leftovers so the next hotplug operation wont use stale
+	 * data.
+	 */
+	st->node = st->last = NULL;
 	return ret;
 }

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6bc21e202ae4..9d93db81fa36 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -662,7 +662,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
 	/*
 	 * Do not update time when cgroup is not active
 	 */
-	if (cgrp == event->cgrp)
+	if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
 		__update_cgrp_time(event->cgrp);
 }

@@ -8955,6 +8955,14 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)

 static void free_pmu_context(struct pmu *pmu)
 {
+	/*
+	 * Static contexts such as perf_sw_context have a global lifetime
+	 * and may be shared between different PMUs. Avoid freeing them
+	 * when a single PMU is going away.
+	 */
+	if (pmu->task_ctx_nr > perf_invalid_context)
+		return;
+
 	mutex_lock(&pmus_lock);
 	free_percpu(pmu->pmu_cpu_context);
 	mutex_unlock(&pmus_lock);
diff --git a/kernel/exit.c b/kernel/exit.c
index f2cd53e92147..f6cad39f35df 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1610,6 +1610,9 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 	if (!infop)
 		return err;

+	if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
+		return -EFAULT;
+
 	user_access_begin();
 	unsafe_put_user(signo, &infop->si_signo, Efault);
 	unsafe_put_user(0, &infop->si_errno, Efault);
@@ -1735,6 +1738,9 @@ COMPAT_SYSCALL_DEFINE5(waitid,
 	if (!infop)
 		return err;

+	if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
+		return -EFAULT;
+
 	user_access_begin();
 	unsafe_put_user(signo, &infop->si_signo, Efault);
 	unsafe_put_user(0, &infop->si_errno, Efault);
diff --git a/kernel/fork.c b/kernel/fork.c
index e702cb9ffbd8..07cc743698d3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -215,6 +215,10 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 		if (!s)
 			continue;

+#ifdef CONFIG_DEBUG_KMEMLEAK
+		/* Clear stale pointers from reused stack. */
+		memset(s->addr, 0, THREAD_SIZE);
+#endif
 		tsk->stack_vm_area = s;
 		return s->addr;
 	}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6fc89fd93824..5a2ef92c2782 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -265,8 +265,8 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
 			irq_setup_affinity(desc);
 			break;
 		case IRQ_STARTUP_MANAGED:
+			irq_do_set_affinity(d, aff, false);
 			ret = __irq_startup(desc);
-			irq_set_affinity_locked(d, aff, false);
 			break;
 		case IRQ_STARTUP_ABORT:
 			return 0;
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 638eb9c83d9f..9eb09aef0313 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -18,8 +18,34 @@
 static inline bool irq_needs_fixup(struct irq_data *d)
 {
 	const struct cpumask *m = irq_data_get_effective_affinity_mask(d);
+	unsigned int cpu = smp_processor_id();

-	return cpumask_test_cpu(smp_processor_id(), m);
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+	/*
+	 * The cpumask_empty() check is a workaround for interrupt chips,
+	 * which do not implement effective affinity, but the architecture has
+	 * enabled the config switch. Use the general affinity mask instead.
+	 */
+	if (cpumask_empty(m))
+		m = irq_data_get_affinity_mask(d);
+
+	/*
+	 * Sanity check. If the mask is not empty when excluding the outgoing
+	 * CPU then it must contain at least one online CPU. The outgoing CPU
+	 * has been removed from the online mask already.
+	 */
+	if (cpumask_any_but(m, cpu) < nr_cpu_ids &&
+	    cpumask_any_and(m, cpu_online_mask) >= nr_cpu_ids) {
+		/*
+		 * If this happens then there was a missed IRQ fixup at some
+		 * point. Warn about it and enforce fixup.
+		 */
+		pr_warn("Eff. affinity %*pbl of IRQ %u contains only offline CPUs after offlining CPU %u\n",
+			cpumask_pr_args(m), d->irq, cpu);
+		return true;
+	}
+#endif
+	return cpumask_test_cpu(cpu, m);
 }

 static bool migrate_one_irq(struct irq_desc *desc)
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 5270a54b9fa4..c26c5bb6b491 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -135,17 +135,26 @@ void irq_gc_ack_clr_bit(struct irq_data *d)
 }

 /**
- * irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt
+ * irq_gc_mask_disable_and_ack_set - Mask and ack pending interrupt
  * @d: irq_data
+ *
+ * This generic implementation of the irq_mask_ack method is for chips
+ * with separate enable/disable registers instead of a single mask
+ * register and where a pending interrupt is acknowledged by setting a
+ * bit.
+ *
+ * Note: This is the only permutation currently used. Similar generic
+ * functions should be added here if other permutations are required.
  */
-void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
+void irq_gc_mask_disable_and_ack_set(struct irq_data *d)
 {
 	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
 	struct irq_chip_type *ct = irq_data_get_chip_type(d);
 	u32 mask = d->mask;

 	irq_gc_lock(gc);
-	irq_reg_writel(gc, mask, ct->regs.mask);
+	irq_reg_writel(gc, mask, ct->regs.disable);
+	*ct->mask_cache &= ~mask;
 	irq_reg_writel(gc, mask, ct->regs.ack);
 	irq_gc_unlock(gc);
 }
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d00132b5c325..4bff6a10ae8e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -168,6 +168,19 @@ void irq_set_thread_affinity(struct irq_desc *desc)
 			set_bit(IRQTF_AFFINITY, &action->thread_flags);
 }

+static void irq_validate_effective_affinity(struct irq_data *data)
+{
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+	const struct cpumask *m = irq_data_get_effective_affinity_mask(data);
+	struct irq_chip *chip = irq_data_get_irq_chip(data);
+
+	if (!cpumask_empty(m))
+		return;
+	pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n",
+		     chip->name, data->irq);
+#endif
+}
+
 int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
 			bool force)
 {
@@ -175,12 +188,16 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
 	struct irq_chip *chip = irq_data_get_irq_chip(data);
 	int ret;

+	if (!chip || !chip->irq_set_affinity)
+		return -EINVAL;
+
 	ret = chip->irq_set_affinity(data, mask, force);
 	switch (ret) {
 	case IRQ_SET_MASK_OK:
 	case IRQ_SET_MASK_OK_DONE:
 		cpumask_copy(desc->irq_common_data.affinity, mask);
 	case IRQ_SET_MASK_OK_NOCOPY:
+		irq_validate_effective_affinity(data);
 		irq_set_thread_affinity(desc);
 		ret = 0;
 	}
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index b9628e43c78f..bf8c8fd72589 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -830,6 +830,41 @@ int klp_register_patch(struct klp_patch *patch)
 }
 EXPORT_SYMBOL_GPL(klp_register_patch);

+/*
+ * Remove parts of patches that touch a given kernel module. The list of
+ * patches processed might be limited. When limit is NULL, all patches
+ * will be handled.
+ */
+static void klp_cleanup_module_patches_limited(struct module *mod,
+					       struct klp_patch *limit)
+{
+	struct klp_patch *patch;
+	struct klp_object *obj;
+
+	list_for_each_entry(patch, &klp_patches, list) {
+		if (patch == limit)
+			break;
+
+		klp_for_each_object(patch, obj) {
+			if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
+				continue;
+
+			/*
+			 * Only unpatch the module if the patch is enabled or
+			 * is in transition.
+			 */
+			if (patch->enabled || patch == klp_transition_patch) {
+				pr_notice("reverting patch '%s' on unloading module '%s'\n",
+					  patch->mod->name, obj->mod->name);
+				klp_unpatch_object(obj);
+			}
+
+			klp_free_object_loaded(obj);
+			break;
+		}
+	}
+}
+
 int klp_module_coming(struct module *mod)
 {
 	int ret;
@@ -894,7 +929,7 @@ err:
 	pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n",
 		patch->mod->name, obj->mod->name, obj->mod->name);
 	mod->klp_alive = false;
-	klp_free_object_loaded(obj);
+	klp_cleanup_module_patches_limited(mod, patch);
 	mutex_unlock(&klp_mutex);

 	return ret;
@@ -902,9 +937,6 @@

 void klp_module_going(struct module *mod)
 {
-	struct klp_patch *patch;
-	struct klp_object *obj;
-
 	if (WARN_ON(mod->state != MODULE_STATE_GOING &&
 		    mod->state != MODULE_STATE_COMING))
 		return;
@@ -917,25 +949,7 @@ void klp_module_going(struct module *mod)
 	 */
 	mod->klp_alive = false;

-	list_for_each_entry(patch, &klp_patches, list) {
-		klp_for_each_object(patch, obj) {
-			if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
-				continue;
-
-			/*
-			 * Only unpatch the module if the patch is enabled or
-			 * is in transition.
-			 */
-			if (patch->enabled || patch == klp_transition_patch) {
-				pr_notice("reverting patch '%s' on unloading module '%s'\n",
-					  patch->mod->name, obj->mod->name);
-				klp_unpatch_object(obj);
-			}
-
-			klp_free_object_loaded(obj);
-			break;
-		}
-	}
+	klp_cleanup_module_patches_limited(mod, NULL);

 	mutex_unlock(&klp_mutex);
 }
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 44c8d0d17170..e36e652d996f 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1873,10 +1873,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	       struct held_lock *next, int distance, struct stack_trace *trace,
 	       int (*save)(struct stack_trace *trace))
 {
+	struct lock_list *uninitialized_var(target_entry);
 	struct lock_list *entry;
-	int ret;
 	struct lock_list this;
-	struct lock_list *uninitialized_var(target_entry);
+	int ret;

 	/*
 	 * Prove that the new <prev> -> <next> dependency would not
@@ -1890,8 +1890,17 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	this.class = hlock_class(next);
 	this.parent = NULL;
 	ret = check_noncircular(&this, hlock_class(prev), &target_entry);
-	if (unlikely(!ret))
+	if (unlikely(!ret)) {
+		if (!trace->entries) {
+			/*
+			 * If @save fails here, the printing might trigger
+			 * a WARN but because of the !nr_entries it should
+			 * not do bad things.
+			 */
+			save(trace);
+		}
 		return print_circular_bug(&this, target_entry, next, prev, trace);
+	}
 	else if (unlikely(ret < 0))
 		return print_bfs_bug(ret);

@@ -1938,7 +1947,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 		return print_bfs_bug(ret);


-	if (save && !save(trace))
+	if (!trace->entries && !save(trace))
 		return 0;

 	/*
@@ -1958,20 +1967,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	if (!ret)
 		return 0;

-	/*
-	 * Debugging printouts:
-	 */
-	if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
-		graph_unlock();
-		printk("\n new dependency: ");
-		print_lock_name(hlock_class(prev));
-		printk(KERN_CONT " => ");
-		print_lock_name(hlock_class(next));
-		printk(KERN_CONT "\n");
-		dump_stack();
-		if (!graph_lock())
-			return 0;
-	}
 	return 2;
 }

@@ -1986,8 +1981,12 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
 {
 	int depth = curr->lockdep_depth;
 	struct held_lock *hlock;
-	struct stack_trace trace;
-	int (*save)(struct stack_trace *trace) = save_trace;
+	struct stack_trace trace = {
+		.nr_entries = 0,
+		.max_entries = 0,
+		.entries = NULL,
+		.skip = 0,
+	};

 	/*
 	 * Debugging checks.
@@ -2018,18 +2017,11 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
 		 */
 		if (hlock->read != 2 && hlock->check) {
 			int ret = check_prev_add(curr, hlock, next,
-						 distance, &trace, save);
+						 distance, &trace, save_trace);
 			if (!ret)
 				return 0;

 			/*
-			 * Stop saving stack_trace if save_trace() was
-			 * called at least once:
-			 */
-			if (save && ret == 2)
-				save = NULL;
-
-			/*
 			 * Stop after the first non-trylock entry,
 			 * as non-trylock entries have added their
 			 * own direct dependencies already, so this
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 729a8706751d..6d5880089ff6 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -854,7 +854,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
 /**
  * call_srcu() - Queue a callback for invocation after an SRCU grace period
  * @sp: srcu_struct in queue the callback
- * @head: structure to be used for queueing the SRCU callback.
+ * @rhp: structure to be used for queueing the SRCU callback.
  * @func: function to be invoked after the SRCU grace period
  *
  * The callback function will be invoked some time after a full SRCU
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 50d1861f7759..3f943efcf61c 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -85,6 +85,9 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
 }

 /**
+ * rcu_sync_enter_start - Force readers onto slow path for multiple updates
+ * @rsp: Pointer to rcu_sync structure to use for synchronization
+ *
  * Must be called after rcu_sync_init() and before first use.
  *
  * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}()
@@ -142,7 +145,7 @@ void rcu_sync_enter(struct rcu_sync *rsp)

 /**
  * rcu_sync_func() - Callback function managing reader access to fastpath
- * @rsp: Pointer to rcu_sync structure to use for synchronization
+ * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization
  *
  * This function is passed to one of the call_rcu() functions by
  * rcu_sync_exit(), so that it is invoked after a grace period following the
@@ -158,9 +161,9 @@ void rcu_sync_enter(struct rcu_sync *rsp)
  * rcu_sync_exit(). Otherwise, set all state back to idle so that readers
  * can again use their fastpaths.
  */
-static void rcu_sync_func(struct rcu_head *rcu)
+static void rcu_sync_func(struct rcu_head *rhp)
 {
-	struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head);
+	struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head);
 	unsigned long flags;

 	BUG_ON(rsp->gp_state != GP_PASSED);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b0ad62b0e7b8..3e3650e94ae6 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3097,9 +3097,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
  * read-side critical sections have completed. call_rcu_sched() assumes
  * that the read-side critical sections end on enabling of preemption
  * or on voluntary preemption.
- * RCU read-side critical sections are delimited by :
- * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR
- * - anything that disables preemption.
+ * RCU read-side critical sections are delimited by:
+ *
+ * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR
+ * - anything that disables preemption.
  *
  * These may be nested.
  *
@@ -3124,11 +3125,12 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
 * handler. This means that read-side critical sections in process
 * context must not be interrupted by softirqs. This interface is to be
 * used when most of the read-side critical sections are in softirq context.
- * RCU read-side critical sections are delimited by :
- * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context.
- * OR
+ * RCU read-side critical sections are delimited by:
+ *
+ * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context, OR
 * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
- * These may be nested.
+ *
+ * These may be nested.
 *
 * See the description of call_rcu() for more detailed information on
 * memory ordering guarantees.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 70ba32e08a23..d3f3094856fe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5356,91 +5356,62 @@ static int wake_wide(struct task_struct *p)
 	return 1;
 }

-struct llc_stats {
-	unsigned long	nr_running;
-	unsigned long	load;
-	unsigned long	capacity;
-	int		has_capacity;
-};
+/*
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
+ *
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
+ *			will be) idle.
+ *
+ * wake_affine_weight() - considers the weight to reflect the average
+ *			  scheduling latency of the CPUs. This seems to work
+ *			  for the overloaded case.
+ */

-static bool get_llc_stats(struct llc_stats *stats, int cpu)
+static bool
+wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
+		 int this_cpu, int prev_cpu, int sync)
 {
-	struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-
-	if (!sds)
-		return false;
+	if (idle_cpu(this_cpu))
+		return true;

-	stats->nr_running	= READ_ONCE(sds->nr_running);
-	stats->load		= READ_ONCE(sds->load);
-	stats->capacity		= READ_ONCE(sds->capacity);
-	stats->has_capacity	= stats->nr_running < per_cpu(sd_llc_size, cpu);
+	if (sync && cpu_rq(this_cpu)->nr_running == 1)
+		return true;

-	return true;
+	return false;
 }

-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- *
- * Since we're running on 'stale' values, we might in fact create an imbalance
- * but recomputing these values is expensive, as that'd mean iteration 2 cache
- * domains worth of CPUs.
- */
 static bool
-wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
+wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
 		   int this_cpu, int prev_cpu, int sync)
 {
-	struct llc_stats prev_stats, this_stats;
 	s64 this_eff_load, prev_eff_load;
 	unsigned long task_load;

-	if (!get_llc_stats(&prev_stats, prev_cpu) ||
-	    !get_llc_stats(&this_stats, this_cpu))
-		return false;
+	this_eff_load = target_load(this_cpu, sd->wake_idx);
+	prev_eff_load = source_load(prev_cpu, sd->wake_idx);

-	/*
-	 * If sync wakeup then subtract the (maximum possible)
-	 * effect of the currently running task from the load
-	 * of the current LLC.
-	 */
 	if (sync) {
 		unsigned long current_load = task_h_load(current);

-		/* in this case load hits 0 and this LLC is considered 'idle' */
-		if (current_load > this_stats.load)
+		if (current_load > this_eff_load)
 			return true;

-		this_stats.load -= current_load;
+		this_eff_load -= current_load;
 	}

-	/*
-	 * The has_capacity stuff is not SMT aware, but by trying to balance
-	 * the nr_running on both ends we try and fill the domain at equal
-	 * rates, thereby first consuming cores before siblings.
-	 */
-
-	/* if the old cache has capacity, stay there */
-	if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
-		return false;
-
-	/* if this cache has capacity, come here */
-	if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
-		return true;
-
-	/*
-	 * Check to see if we can move the load without causing too much
-	 * imbalance.
-	 */
 	task_load = task_h_load(p);

-	this_eff_load = 100;
-	this_eff_load *= prev_stats.capacity;
-
-	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-	prev_eff_load *= this_stats.capacity;
+	this_eff_load += task_load;
+	if (sched_feat(WA_BIAS))
+		this_eff_load *= 100;
+	this_eff_load *= capacity_of(prev_cpu);

-	this_eff_load *= this_stats.load + task_load;
-	prev_eff_load *= prev_stats.load - task_load;
+	prev_eff_load -= task_load;
+	if (sched_feat(WA_BIAS))
+		prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
+	prev_eff_load *= capacity_of(this_cpu);

 	return this_eff_load <= prev_eff_load;
 }
@@ -5449,22 +5420,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 		       int prev_cpu, int sync)
 {
 	int this_cpu = smp_processor_id();
-	bool affine;
+	bool affine = false;

-	/*
-	 * Default to no affine wakeups; wake_affine() should not effect a task
-	 * placement the load-balancer feels inclined to undo. The conservative
-	 * option is therefore to not move tasks when they wake up.
-	 */
-	affine = false;
+	if (sched_feat(WA_IDLE) && !affine)
+		affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);

-	/*
-	 * If the wakeup is across cache domains, try to evaluate if movement
-	 * makes sense, otherwise rely on select_idle_siblings() to do
-	 * placement inside the cache domain.
-	 */
-	if (!cpus_share_cache(prev_cpu, this_cpu))
-		affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
+	if (sched_feat(WA_WEIGHT) && !affine)
+		affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);

 	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
 	if (affine) {
@@ -7600,7 +7562,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
-	struct sched_domain_shared *shared = env->sd->shared;
 	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats *local = &sds->local_stat;
@@ -7672,22 +7633,6 @@ next_group:
 		if (env->dst_rq->rd->overload != overload)
 			env->dst_rq->rd->overload = overload;
 	}
-
-	if (!shared)
-		return;
-
-	/*
-	 * Since these are sums over groups they can contain some CPUs
-	 * multiple times for the NUMA domains.
-	 *
-	 * Currently only wake_affine_llc() and find_busiest_group()
-	 * uses these numbers, only the last is affected by this problem.
-	 *
-	 * XXX fix that.
-	 */
-	WRITE_ONCE(shared->nr_running,	sds->total_running);
-	WRITE_ONCE(shared->load,	sds->total_load);
-	WRITE_ONCE(shared->capacity,	sds->total_capacity);
 }

 /**
@@ -8098,6 +8043,13 @@ static int should_we_balance(struct lb_env *env)
 	int cpu, balance_cpu = -1;

 	/*
+	 * Ensure the balancing environment is consistent; can happen
+	 * when the softirq triggers 'during' hotplug.
+	 */
+	if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
+		return 0;
+
+	/*
 	 * In the newly idle case, we will allow all the cpu's
 	 * to do the newly idle load balance.
 	 */
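Read as a formula, the new wake_affine_weight() above prefers the waking CPU when the capacity-scaled, bias-weighted load it would carry there (including the task) does not exceed what the previous CPU keeps; a rough restatement of the code in the hunk, not an exact quote from the patch:

```latex
\[
\bigl(L_{\text{this}} - L_{\text{sync}} + L_{\text{task}}\bigr)\cdot 100 \cdot C_{\text{prev}}
\;\le\;
\bigl(L_{\text{prev}} - L_{\text{task}}\bigr)\cdot
\Bigl(100 + \tfrac{\mathrm{imbalance\_pct}-100}{2}\Bigr)\cdot C_{\text{this}}
\]
```

Here L_this = target_load(this_cpu, wake_idx), L_prev = source_load(prev_cpu, wake_idx), L_task = task_h_load(p), L_sync = task_h_load(current) on a sync wakeup (zero otherwise, and a sync wakeup whose waker load exceeds L_this is accepted immediately), and C_x = capacity_of(x). The two 100-ish bias factors are only applied when the new WA_BIAS feature bit is set.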
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index d3fb15555291..319ed0e8a347 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -81,3 +81,6 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)

+SCHED_FEAT(WA_IDLE, true)
+SCHED_FEAT(WA_WEIGHT, true)
+SCHED_FEAT(WA_BIAS, true)
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index a92fddc22747..dd7908743dab 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/membarrier.h> | 18 | #include <linux/membarrier.h> |
19 | #include <linux/tick.h> | 19 | #include <linux/tick.h> |
20 | #include <linux/cpumask.h> | 20 | #include <linux/cpumask.h> |
21 | #include <linux/atomic.h> | ||
21 | 22 | ||
22 | #include "sched.h" /* for cpu_rq(). */ | 23 | #include "sched.h" /* for cpu_rq(). */ |
23 | 24 | ||
@@ -26,21 +27,26 @@ | |||
26 | * except MEMBARRIER_CMD_QUERY. | 27 | * except MEMBARRIER_CMD_QUERY. |
27 | */ | 28 | */ |
28 | #define MEMBARRIER_CMD_BITMASK \ | 29 | #define MEMBARRIER_CMD_BITMASK \ |
29 | (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED) | 30 | (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ |
31 | | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED) | ||
30 | 32 | ||
31 | static void ipi_mb(void *info) | 33 | static void ipi_mb(void *info) |
32 | { | 34 | { |
33 | smp_mb(); /* IPIs should be serializing but paranoid. */ | 35 | smp_mb(); /* IPIs should be serializing but paranoid. */ |
34 | } | 36 | } |
35 | 37 | ||
36 | static void membarrier_private_expedited(void) | 38 | static int membarrier_private_expedited(void) |
37 | { | 39 | { |
38 | int cpu; | 40 | int cpu; |
39 | bool fallback = false; | 41 | bool fallback = false; |
40 | cpumask_var_t tmpmask; | 42 | cpumask_var_t tmpmask; |
41 | 43 | ||
44 | if (!(atomic_read(&current->mm->membarrier_state) | ||
45 | & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) | ||
46 | return -EPERM; | ||
47 | |||
42 | if (num_online_cpus() == 1) | 48 | if (num_online_cpus() == 1) |
43 | return; | 49 | return 0; |
44 | 50 | ||
45 | /* | 51 | /* |
46 | * Matches memory barriers around rq->curr modification in | 52 | * Matches memory barriers around rq->curr modification in |
@@ -94,6 +100,24 @@ static void membarrier_private_expedited(void) | |||
94 | * rq->curr modification in scheduler. | 100 | * rq->curr modification in scheduler. |
95 | */ | 101 | */ |
96 | smp_mb(); /* exit from system call is not a mb */ | 102 | smp_mb(); /* exit from system call is not a mb */ |
103 | return 0; | ||
104 | } | ||
105 | |||
106 | static void membarrier_register_private_expedited(void) | ||
107 | { | ||
108 | struct task_struct *p = current; | ||
109 | struct mm_struct *mm = p->mm; | ||
110 | |||
111 | /* | ||
112 | * We need to consider threads belonging to different thread | ||
113 | * groups, which use the same mm. (CLONE_VM but not | ||
114 | * CLONE_THREAD). | ||
115 | */ | ||
116 | if (atomic_read(&mm->membarrier_state) | ||
117 | & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY) | ||
118 | return; | ||
119 | atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, | ||
120 | &mm->membarrier_state); | ||
97 | } | 121 | } |
98 | 122 | ||
99 | /** | 123 | /** |
@@ -144,7 +168,9 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) | |||
144 | synchronize_sched(); | 168 | synchronize_sched(); |
145 | return 0; | 169 | return 0; |
146 | case MEMBARRIER_CMD_PRIVATE_EXPEDITED: | 170 | case MEMBARRIER_CMD_PRIVATE_EXPEDITED: |
147 | membarrier_private_expedited(); | 171 | return membarrier_private_expedited(); |
172 | case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: | ||
173 | membarrier_register_private_expedited(); | ||
148 | return 0; | 174 | return 0; |
149 | default: | 175 | default: |
150 | return -EINVAL; | 176 | return -EINVAL; |
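With this change, MEMBARRIER_CMD_PRIVATE_EXPEDITED returns -EPERM unless the calling process has first registered through the new MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED command. A minimal userspace sketch of the resulting calling convention, using the raw syscall since a libc wrapper may not be available (error handling trimmed):

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>

static int membarrier(int cmd, int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
	/* Register once per process (mm) before using the expedited command. */
	if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0))
		return 1;

	/* Later calls IPI only the CPUs currently running this mm. */
	if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0))
		return 1;

	return 0;
}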
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index bb3a38005b9c..0ae832e13b97 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
@@ -473,7 +473,7 @@ static long seccomp_attach_filter(unsigned int flags, | |||
473 | return 0; | 473 | return 0; |
474 | } | 474 | } |
475 | 475 | ||
476 | void __get_seccomp_filter(struct seccomp_filter *filter) | 476 | static void __get_seccomp_filter(struct seccomp_filter *filter) |
477 | { | 477 | { |
478 | /* Reference count is bounded by the number of total processes. */ | 478 | /* Reference count is bounded by the number of total processes. */ |
479 | refcount_inc(&filter->usage); | 479 | refcount_inc(&filter->usage); |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 64d0edf428f8..a2dccfe1acec 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -68,6 +68,7 @@ enum { | |||
68 | * attach_mutex to avoid changing binding state while | 68 | * attach_mutex to avoid changing binding state while |
69 | * worker_attach_to_pool() is in progress. | 69 | * worker_attach_to_pool() is in progress. |
70 | */ | 70 | */ |
71 | POOL_MANAGER_ACTIVE = 1 << 0, /* being managed */ | ||
71 | POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ | 72 | POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ |
72 | 73 | ||
73 | /* worker flags */ | 74 | /* worker flags */ |
@@ -165,7 +166,6 @@ struct worker_pool { | |||
165 | /* L: hash of busy workers */ | 166 | /* L: hash of busy workers */ |
166 | 167 | ||
167 | /* see manage_workers() for details on the two manager mutexes */ | 168 | /* see manage_workers() for details on the two manager mutexes */ |
168 | struct mutex manager_arb; /* manager arbitration */ | ||
169 | struct worker *manager; /* L: purely informational */ | 169 | struct worker *manager; /* L: purely informational */ |
170 | struct mutex attach_mutex; /* attach/detach exclusion */ | 170 | struct mutex attach_mutex; /* attach/detach exclusion */ |
171 | struct list_head workers; /* A: attached workers */ | 171 | struct list_head workers; /* A: attached workers */ |
@@ -299,6 +299,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; | |||
299 | 299 | ||
300 | static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ | 300 | static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ |
301 | static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ | 301 | static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ |
302 | static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */ | ||
302 | 303 | ||
303 | static LIST_HEAD(workqueues); /* PR: list of all workqueues */ | 304 | static LIST_HEAD(workqueues); /* PR: list of all workqueues */ |
304 | static bool workqueue_freezing; /* PL: have wqs started freezing? */ | 305 | static bool workqueue_freezing; /* PL: have wqs started freezing? */ |
@@ -801,7 +802,7 @@ static bool need_to_create_worker(struct worker_pool *pool) | |||
801 | /* Do we have too many workers and should some go away? */ | 802 | /* Do we have too many workers and should some go away? */ |
802 | static bool too_many_workers(struct worker_pool *pool) | 803 | static bool too_many_workers(struct worker_pool *pool) |
803 | { | 804 | { |
804 | bool managing = mutex_is_locked(&pool->manager_arb); | 805 | bool managing = pool->flags & POOL_MANAGER_ACTIVE; |
805 | int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ | 806 | int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ |
806 | int nr_busy = pool->nr_workers - nr_idle; | 807 | int nr_busy = pool->nr_workers - nr_idle; |
807 | 808 | ||
@@ -1980,24 +1981,17 @@ static bool manage_workers(struct worker *worker) | |||
1980 | { | 1981 | { |
1981 | struct worker_pool *pool = worker->pool; | 1982 | struct worker_pool *pool = worker->pool; |
1982 | 1983 | ||
1983 | /* | 1984 | if (pool->flags & POOL_MANAGER_ACTIVE) |
1984 | * Anyone who successfully grabs manager_arb wins the arbitration | ||
1985 | * and becomes the manager. mutex_trylock() on pool->manager_arb | ||
1986 | * failure while holding pool->lock reliably indicates that someone | ||
1987 | * else is managing the pool and the worker which failed trylock | ||
1988 | * can proceed to executing work items. This means that anyone | ||
1989 | * grabbing manager_arb is responsible for actually performing | ||
1990 | * manager duties. If manager_arb is grabbed and released without | ||
1991 | * actual management, the pool may stall indefinitely. | ||
1992 | */ | ||
1993 | if (!mutex_trylock(&pool->manager_arb)) | ||
1994 | return false; | 1985 | return false; |
1986 | |||
1987 | pool->flags |= POOL_MANAGER_ACTIVE; | ||
1995 | pool->manager = worker; | 1988 | pool->manager = worker; |
1996 | 1989 | ||
1997 | maybe_create_worker(pool); | 1990 | maybe_create_worker(pool); |
1998 | 1991 | ||
1999 | pool->manager = NULL; | 1992 | pool->manager = NULL; |
2000 | mutex_unlock(&pool->manager_arb); | 1993 | pool->flags &= ~POOL_MANAGER_ACTIVE; |
1994 | wake_up(&wq_manager_wait); | ||
2001 | return true; | 1995 | return true; |
2002 | } | 1996 | } |
2003 | 1997 | ||
@@ -3248,7 +3242,6 @@ static int init_worker_pool(struct worker_pool *pool) | |||
3248 | setup_timer(&pool->mayday_timer, pool_mayday_timeout, | 3242 | setup_timer(&pool->mayday_timer, pool_mayday_timeout, |
3249 | (unsigned long)pool); | 3243 | (unsigned long)pool); |
3250 | 3244 | ||
3251 | mutex_init(&pool->manager_arb); | ||
3252 | mutex_init(&pool->attach_mutex); | 3245 | mutex_init(&pool->attach_mutex); |
3253 | INIT_LIST_HEAD(&pool->workers); | 3246 | INIT_LIST_HEAD(&pool->workers); |
3254 | 3247 | ||
@@ -3318,13 +3311,15 @@ static void put_unbound_pool(struct worker_pool *pool) | |||
3318 | hash_del(&pool->hash_node); | 3311 | hash_del(&pool->hash_node); |
3319 | 3312 | ||
3320 | /* | 3313 | /* |
3321 | * Become the manager and destroy all workers. Grabbing | 3314 | * Become the manager and destroy all workers. This prevents |
3322 | * manager_arb prevents @pool's workers from blocking on | 3315 | * @pool's workers from blocking on attach_mutex. We're the last |
3323 | * attach_mutex. | 3316 | * manager and @pool gets freed with the flag set. |
3324 | */ | 3317 | */ |
3325 | mutex_lock(&pool->manager_arb); | ||
3326 | |||
3327 | spin_lock_irq(&pool->lock); | 3318 | spin_lock_irq(&pool->lock); |
3319 | wait_event_lock_irq(wq_manager_wait, | ||
3320 | !(pool->flags & POOL_MANAGER_ACTIVE), pool->lock); | ||
3321 | pool->flags |= POOL_MANAGER_ACTIVE; | ||
3322 | |||
3328 | while ((worker = first_idle_worker(pool))) | 3323 | while ((worker = first_idle_worker(pool))) |
3329 | destroy_worker(worker); | 3324 | destroy_worker(worker); |
3330 | WARN_ON(pool->nr_workers || pool->nr_idle); | 3325 | WARN_ON(pool->nr_workers || pool->nr_idle); |
@@ -3338,8 +3333,6 @@ static void put_unbound_pool(struct worker_pool *pool) | |||
3338 | if (pool->detach_completion) | 3333 | if (pool->detach_completion) |
3339 | wait_for_completion(pool->detach_completion); | 3334 | wait_for_completion(pool->detach_completion); |
3340 | 3335 | ||
3341 | mutex_unlock(&pool->manager_arb); | ||
3342 | |||
3343 | /* shut down the timers */ | 3336 | /* shut down the timers */ |
3344 | del_timer_sync(&pool->idle_timer); | 3337 | del_timer_sync(&pool->idle_timer); |
3345 | del_timer_sync(&pool->mayday_timer); | 3338 | del_timer_sync(&pool->mayday_timer); |
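Taken together, the workqueue hunks replace the manager_arb mutex with a flag-plus-waitqueue handshake: a worker becomes manager only if POOL_MANAGER_ACTIVE is clear, with both the test and the set happening under pool->lock, and put_unbound_pool() sleeps on wq_manager_wait until the flag drops before destroying the idle workers. A condensed sketch of the protocol as it reads after the patch, with unrelated code elided:

	/* manage_workers(), entered with pool->lock held */
	if (pool->flags & POOL_MANAGER_ACTIVE)
		return false;				/* someone else is managing */
	pool->flags |= POOL_MANAGER_ACTIVE;
	pool->manager = worker;
	maybe_create_worker(pool);			/* may drop and retake pool->lock */
	pool->manager = NULL;
	pool->flags &= ~POOL_MANAGER_ACTIVE;
	wake_up(&wq_manager_wait);			/* unblock put_unbound_pool() */

	/* put_unbound_pool(): become the last manager before tear-down */
	spin_lock_irq(&pool->lock);
	wait_event_lock_irq(wq_manager_wait,
			    !(pool->flags & POOL_MANAGER_ACTIVE), pool->lock);
	pool->flags |= POOL_MANAGER_ACTIVE;		/* never cleared; pool is being freed */

Because the flag is only ever tested and modified under pool->lock, it provides the same mutual exclusion the trylock on manager_arb used to, without a mutex that would have to be released after the pool's workers are already gone.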