Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/btf.c                     |  30
-rw-r--r--  kernel/bpf/cgroup.c                  |  54
-rw-r--r--  kernel/bpf/core.c                    |  30
-rw-r--r--  kernel/bpf/devmap.c                  |   7
-rw-r--r--  kernel/bpf/hashtab.c                 |  16
-rw-r--r--  kernel/bpf/sockmap.c                 | 297
-rw-r--r--  kernel/bpf/syscall.c                 | 103
-rw-r--r--  kernel/bpf/verifier.c                |  11
-rw-r--r--  kernel/fork.c                        |  35
-rw-r--r--  kernel/kthread.c                     |  30
-rw-r--r--  kernel/rseq.c                        |  41
-rw-r--r--  kernel/sched/core.c                  |  67
-rw-r--r--  kernel/sched/cpufreq_schedutil.c     |   2
-rw-r--r--  kernel/sched/fair.c                  |  45
-rw-r--r--  kernel/sched/rt.c                    |  16
-rw-r--r--  kernel/sched/sched.h                 |  11
-rw-r--r--  kernel/softirq.c                     |  12
-rw-r--r--  kernel/time/tick-common.c            |   3
-rw-r--r--  kernel/trace/ftrace.c                |  13
-rw-r--r--  kernel/trace/trace.c                 |  13
-rw-r--r--  kernel/trace/trace.h                 |   4
-rw-r--r--  kernel/trace/trace_events_filter.c   |   5
-rw-r--r--  kernel/trace/trace_events_hist.c     |   2
-rw-r--r--  kernel/trace/trace_functions_graph.c |   5
-rw-r--r--  kernel/trace/trace_kprobe.c          |   6
-rw-r--r--  kernel/trace/trace_output.c          |   5
26 files changed, 520 insertions(+), 343 deletions(-)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2d49d18b793a..e016ac3afa24 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -991,16 +991,13 @@ static void btf_int_bits_seq_show(const struct btf *btf,
991 void *data, u8 bits_offset, 991 void *data, u8 bits_offset,
992 struct seq_file *m) 992 struct seq_file *m)
993{ 993{
994 u16 left_shift_bits, right_shift_bits;
994 u32 int_data = btf_type_int(t); 995 u32 int_data = btf_type_int(t);
995 u16 nr_bits = BTF_INT_BITS(int_data); 996 u16 nr_bits = BTF_INT_BITS(int_data);
996 u16 total_bits_offset; 997 u16 total_bits_offset;
997 u16 nr_copy_bytes; 998 u16 nr_copy_bytes;
998 u16 nr_copy_bits; 999 u16 nr_copy_bits;
999 u8 nr_upper_bits; 1000 u64 print_num;
1000 union {
1001 u64 u64_num;
1002 u8 u8_nums[8];
1003 } print_num;
1004 1001
1005 total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); 1002 total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
1006 data += BITS_ROUNDDOWN_BYTES(total_bits_offset); 1003 data += BITS_ROUNDDOWN_BYTES(total_bits_offset);
@@ -1008,21 +1005,20 @@ static void btf_int_bits_seq_show(const struct btf *btf,
1008 nr_copy_bits = nr_bits + bits_offset; 1005 nr_copy_bits = nr_bits + bits_offset;
1009 nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); 1006 nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits);
1010 1007
1011 print_num.u64_num = 0; 1008 print_num = 0;
1012 memcpy(&print_num.u64_num, data, nr_copy_bytes); 1009 memcpy(&print_num, data, nr_copy_bytes);
1013 1010
1014 /* Ditch the higher order bits */ 1011#ifdef __BIG_ENDIAN_BITFIELD
1015 nr_upper_bits = BITS_PER_BYTE_MASKED(nr_copy_bits); 1012 left_shift_bits = bits_offset;
1016 if (nr_upper_bits) { 1013#else
1017 /* We need to mask out some bits of the upper byte. */ 1014 left_shift_bits = BITS_PER_U64 - nr_copy_bits;
1018 u8 mask = (1 << nr_upper_bits) - 1; 1015#endif
1016 right_shift_bits = BITS_PER_U64 - nr_bits;
1019 1017
1020 print_num.u8_nums[nr_copy_bytes - 1] &= mask; 1018 print_num <<= left_shift_bits;
1021 } 1019 print_num >>= right_shift_bits;
1022
1023 print_num.u64_num >>= bits_offset;
1024 1020
1025 seq_printf(m, "0x%llx", print_num.u64_num); 1021 seq_printf(m, "0x%llx", print_num);
1026} 1022}
1027 1023
1028static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, 1024static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t,
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index f7c00bd6f8e4..3d83ee7df381 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -428,6 +428,60 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
428 return ret; 428 return ret;
429} 429}
430 430
431int cgroup_bpf_prog_attach(const union bpf_attr *attr,
432 enum bpf_prog_type ptype, struct bpf_prog *prog)
433{
434 struct cgroup *cgrp;
435 int ret;
436
437 cgrp = cgroup_get_from_fd(attr->target_fd);
438 if (IS_ERR(cgrp))
439 return PTR_ERR(cgrp);
440
441 ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
442 attr->attach_flags);
443 cgroup_put(cgrp);
444 return ret;
445}
446
447int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
448{
449 struct bpf_prog *prog;
450 struct cgroup *cgrp;
451 int ret;
452
453 cgrp = cgroup_get_from_fd(attr->target_fd);
454 if (IS_ERR(cgrp))
455 return PTR_ERR(cgrp);
456
457 prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
458 if (IS_ERR(prog))
459 prog = NULL;
460
461 ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
462 if (prog)
463 bpf_prog_put(prog);
464
465 cgroup_put(cgrp);
466 return ret;
467}
468
469int cgroup_bpf_prog_query(const union bpf_attr *attr,
470 union bpf_attr __user *uattr)
471{
472 struct cgroup *cgrp;
473 int ret;
474
475 cgrp = cgroup_get_from_fd(attr->query.target_fd);
476 if (IS_ERR(cgrp))
477 return PTR_ERR(cgrp);
478
479 ret = cgroup_bpf_query(cgrp, attr, uattr);
480
481 cgroup_put(cgrp);
482 return ret;
483}
484
431/** 485/**
432 * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering 486 * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
433 * @sk: The socket sending or receiving traffic 487 * @sk: The socket sending or receiving traffic
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index a9e6c04d0f4a..1e5625d46414 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -598,8 +598,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
598 bpf_fill_ill_insns(hdr, size); 598 bpf_fill_ill_insns(hdr, size);
599 599
600 hdr->pages = size / PAGE_SIZE; 600 hdr->pages = size / PAGE_SIZE;
601 hdr->locked = 0;
602
603 hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), 601 hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
604 PAGE_SIZE - sizeof(*hdr)); 602 PAGE_SIZE - sizeof(*hdr));
605 start = (get_random_int() % hole) & ~(alignment - 1); 603 start = (get_random_int() % hole) & ~(alignment - 1);
@@ -1450,22 +1448,6 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
1450 return 0; 1448 return 0;
1451} 1449}
1452 1450
1453static int bpf_prog_check_pages_ro_locked(const struct bpf_prog *fp)
1454{
1455#ifdef CONFIG_ARCH_HAS_SET_MEMORY
1456 int i, err;
1457
1458 for (i = 0; i < fp->aux->func_cnt; i++) {
1459 err = bpf_prog_check_pages_ro_single(fp->aux->func[i]);
1460 if (err)
1461 return err;
1462 }
1463
1464 return bpf_prog_check_pages_ro_single(fp);
1465#endif
1466 return 0;
1467}
1468
1469static void bpf_prog_select_func(struct bpf_prog *fp) 1451static void bpf_prog_select_func(struct bpf_prog *fp)
1470{ 1452{
1471#ifndef CONFIG_BPF_JIT_ALWAYS_ON 1453#ifndef CONFIG_BPF_JIT_ALWAYS_ON
@@ -1524,17 +1506,7 @@ finalize:
1524 * all eBPF JITs might immediately support all features. 1506 * all eBPF JITs might immediately support all features.
1525 */ 1507 */
1526 *err = bpf_check_tail_call(fp); 1508 *err = bpf_check_tail_call(fp);
1527 if (*err) 1509
1528 return fp;
1529
1530 /* Checkpoint: at this point onwards any cBPF -> eBPF or
1531 * native eBPF program is read-only. If we failed to change
1532 * the page attributes (e.g. allocation failure from
1533 * splitting large pages), then reject the whole program
1534 * in order to guarantee not ending up with any W+X pages
1535 * from BPF side in kernel.
1536 */
1537 *err = bpf_prog_check_pages_ro_locked(fp);
1538 return fp; 1510 return fp;
1539} 1511}
1540EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); 1512EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 642c97f6d1b8..d361fc1e3bf3 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -334,10 +334,15 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
334{ 334{
335 struct net_device *dev = dst->dev; 335 struct net_device *dev = dst->dev;
336 struct xdp_frame *xdpf; 336 struct xdp_frame *xdpf;
337 int err;
337 338
338 if (!dev->netdev_ops->ndo_xdp_xmit) 339 if (!dev->netdev_ops->ndo_xdp_xmit)
339 return -EOPNOTSUPP; 340 return -EOPNOTSUPP;
340 341
342 err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
343 if (unlikely(err))
344 return err;
345
341 xdpf = convert_to_xdp_frame(xdp); 346 xdpf = convert_to_xdp_frame(xdp);
342 if (unlikely(!xdpf)) 347 if (unlikely(!xdpf))
343 return -EOVERFLOW; 348 return -EOVERFLOW;
@@ -350,7 +355,7 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
350{ 355{
351 int err; 356 int err;
352 357
353 err = __xdp_generic_ok_fwd_dev(skb, dst->dev); 358 err = xdp_ok_fwd_dev(dst->dev, skb->len);
354 if (unlikely(err)) 359 if (unlikely(err))
355 return err; 360 return err;
356 skb->dev = dst->dev; 361 skb->dev = dst->dev;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3ca2198a6d22..513d9dfcf4ee 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -747,13 +747,15 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
747 * old element will be freed immediately. 747 * old element will be freed immediately.
748 * Otherwise return an error 748 * Otherwise return an error
749 */ 749 */
750 atomic_dec(&htab->count); 750 l_new = ERR_PTR(-E2BIG);
751 return ERR_PTR(-E2BIG); 751 goto dec_count;
752 } 752 }
753 l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, 753 l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
754 htab->map.numa_node); 754 htab->map.numa_node);
755 if (!l_new) 755 if (!l_new) {
756 return ERR_PTR(-ENOMEM); 756 l_new = ERR_PTR(-ENOMEM);
757 goto dec_count;
758 }
757 } 759 }
758 760
759 memcpy(l_new->key, key, key_size); 761 memcpy(l_new->key, key, key_size);
@@ -766,7 +768,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
766 GFP_ATOMIC | __GFP_NOWARN); 768 GFP_ATOMIC | __GFP_NOWARN);
767 if (!pptr) { 769 if (!pptr) {
768 kfree(l_new); 770 kfree(l_new);
769 return ERR_PTR(-ENOMEM); 771 l_new = ERR_PTR(-ENOMEM);
772 goto dec_count;
770 } 773 }
771 } 774 }
772 775
@@ -780,6 +783,9 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
780 783
781 l_new->hash = hash; 784 l_new->hash = hash;
782 return l_new; 785 return l_new;
786dec_count:
787 atomic_dec(&htab->count);
788 return l_new;
783} 789}
784 790
785static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, 791static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 52a91d816c0e..98fb7938beea 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -72,6 +72,7 @@ struct bpf_htab {
72 u32 n_buckets; 72 u32 n_buckets;
73 u32 elem_size; 73 u32 elem_size;
74 struct bpf_sock_progs progs; 74 struct bpf_sock_progs progs;
75 struct rcu_head rcu;
75}; 76};
76 77
77struct htab_elem { 78struct htab_elem {
@@ -89,8 +90,8 @@ enum smap_psock_state {
89struct smap_psock_map_entry { 90struct smap_psock_map_entry {
90 struct list_head list; 91 struct list_head list;
91 struct sock **entry; 92 struct sock **entry;
92 struct htab_elem *hash_link; 93 struct htab_elem __rcu *hash_link;
93 struct bpf_htab *htab; 94 struct bpf_htab __rcu *htab;
94}; 95};
95 96
96struct smap_psock { 97struct smap_psock {
@@ -120,6 +121,7 @@ struct smap_psock {
120 struct bpf_prog *bpf_parse; 121 struct bpf_prog *bpf_parse;
121 struct bpf_prog *bpf_verdict; 122 struct bpf_prog *bpf_verdict;
122 struct list_head maps; 123 struct list_head maps;
124 spinlock_t maps_lock;
123 125
124 /* Back reference used when sock callback trigger sockmap operations */ 126 /* Back reference used when sock callback trigger sockmap operations */
125 struct sock *sock; 127 struct sock *sock;
@@ -140,6 +142,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
140static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); 142static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
141static int bpf_tcp_sendpage(struct sock *sk, struct page *page, 143static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
142 int offset, size_t size, int flags); 144 int offset, size_t size, int flags);
145static void bpf_tcp_close(struct sock *sk, long timeout);
143 146
144static inline struct smap_psock *smap_psock_sk(const struct sock *sk) 147static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
145{ 148{
@@ -161,7 +164,42 @@ out:
161 return !empty; 164 return !empty;
162} 165}
163 166
164static struct proto tcp_bpf_proto; 167enum {
168 SOCKMAP_IPV4,
169 SOCKMAP_IPV6,
170 SOCKMAP_NUM_PROTS,
171};
172
173enum {
174 SOCKMAP_BASE,
175 SOCKMAP_TX,
176 SOCKMAP_NUM_CONFIGS,
177};
178
179static struct proto *saved_tcpv6_prot __read_mostly;
180static DEFINE_SPINLOCK(tcpv6_prot_lock);
181static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS];
182static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS],
183 struct proto *base)
184{
185 prot[SOCKMAP_BASE] = *base;
186 prot[SOCKMAP_BASE].close = bpf_tcp_close;
187 prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg;
188 prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read;
189
190 prot[SOCKMAP_TX] = prot[SOCKMAP_BASE];
191 prot[SOCKMAP_TX].sendmsg = bpf_tcp_sendmsg;
192 prot[SOCKMAP_TX].sendpage = bpf_tcp_sendpage;
193}
194
195static void update_sk_prot(struct sock *sk, struct smap_psock *psock)
196{
197 int family = sk->sk_family == AF_INET6 ? SOCKMAP_IPV6 : SOCKMAP_IPV4;
198 int conf = psock->bpf_tx_msg ? SOCKMAP_TX : SOCKMAP_BASE;
199
200 sk->sk_prot = &bpf_tcp_prots[family][conf];
201}
202
165static int bpf_tcp_init(struct sock *sk) 203static int bpf_tcp_init(struct sock *sk)
166{ 204{
167 struct smap_psock *psock; 205 struct smap_psock *psock;
@@ -181,14 +219,17 @@ static int bpf_tcp_init(struct sock *sk)
181 psock->save_close = sk->sk_prot->close; 219 psock->save_close = sk->sk_prot->close;
182 psock->sk_proto = sk->sk_prot; 220 psock->sk_proto = sk->sk_prot;
183 221
184 if (psock->bpf_tx_msg) { 222 /* Build IPv6 sockmap whenever the address of tcpv6_prot changes */
185 tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg; 223 if (sk->sk_family == AF_INET6 &&
186 tcp_bpf_proto.sendpage = bpf_tcp_sendpage; 224 unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) {
187 tcp_bpf_proto.recvmsg = bpf_tcp_recvmsg; 225 spin_lock_bh(&tcpv6_prot_lock);
188 tcp_bpf_proto.stream_memory_read = bpf_tcp_stream_read; 226 if (likely(sk->sk_prot != saved_tcpv6_prot)) {
227 build_protos(bpf_tcp_prots[SOCKMAP_IPV6], sk->sk_prot);
228 smp_store_release(&saved_tcpv6_prot, sk->sk_prot);
229 }
230 spin_unlock_bh(&tcpv6_prot_lock);
189 } 231 }
190 232 update_sk_prot(sk, psock);
191 sk->sk_prot = &tcp_bpf_proto;
192 rcu_read_unlock(); 233 rcu_read_unlock();
193 return 0; 234 return 0;
194} 235}
@@ -219,24 +260,64 @@ out:
219 rcu_read_unlock(); 260 rcu_read_unlock();
220} 261}
221 262
263static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
264 u32 hash, void *key, u32 key_size)
265{
266 struct htab_elem *l;
267
268 hlist_for_each_entry_rcu(l, head, hash_node) {
269 if (l->hash == hash && !memcmp(&l->key, key, key_size))
270 return l;
271 }
272
273 return NULL;
274}
275
276static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
277{
278 return &htab->buckets[hash & (htab->n_buckets - 1)];
279}
280
281static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
282{
283 return &__select_bucket(htab, hash)->head;
284}
285
222static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) 286static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
223{ 287{
224 atomic_dec(&htab->count); 288 atomic_dec(&htab->count);
225 kfree_rcu(l, rcu); 289 kfree_rcu(l, rcu);
226} 290}
227 291
292static struct smap_psock_map_entry *psock_map_pop(struct sock *sk,
293 struct smap_psock *psock)
294{
295 struct smap_psock_map_entry *e;
296
297 spin_lock_bh(&psock->maps_lock);
298 e = list_first_entry_or_null(&psock->maps,
299 struct smap_psock_map_entry,
300 list);
301 if (e)
302 list_del(&e->list);
303 spin_unlock_bh(&psock->maps_lock);
304 return e;
305}
306
228static void bpf_tcp_close(struct sock *sk, long timeout) 307static void bpf_tcp_close(struct sock *sk, long timeout)
229{ 308{
230 void (*close_fun)(struct sock *sk, long timeout); 309 void (*close_fun)(struct sock *sk, long timeout);
231 struct smap_psock_map_entry *e, *tmp; 310 struct smap_psock_map_entry *e;
232 struct sk_msg_buff *md, *mtmp; 311 struct sk_msg_buff *md, *mtmp;
233 struct smap_psock *psock; 312 struct smap_psock *psock;
234 struct sock *osk; 313 struct sock *osk;
235 314
315 lock_sock(sk);
236 rcu_read_lock(); 316 rcu_read_lock();
237 psock = smap_psock_sk(sk); 317 psock = smap_psock_sk(sk);
238 if (unlikely(!psock)) { 318 if (unlikely(!psock)) {
239 rcu_read_unlock(); 319 rcu_read_unlock();
320 release_sock(sk);
240 return sk->sk_prot->close(sk, timeout); 321 return sk->sk_prot->close(sk, timeout);
241 } 322 }
242 323
@@ -247,7 +328,6 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
247 */ 328 */
248 close_fun = psock->save_close; 329 close_fun = psock->save_close;
249 330
250 write_lock_bh(&sk->sk_callback_lock);
251 if (psock->cork) { 331 if (psock->cork) {
252 free_start_sg(psock->sock, psock->cork); 332 free_start_sg(psock->sock, psock->cork);
253 kfree(psock->cork); 333 kfree(psock->cork);
@@ -260,21 +340,40 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
260 kfree(md); 340 kfree(md);
261 } 341 }
262 342
263 list_for_each_entry_safe(e, tmp, &psock->maps, list) { 343 e = psock_map_pop(sk, psock);
344 while (e) {
264 if (e->entry) { 345 if (e->entry) {
265 osk = cmpxchg(e->entry, sk, NULL); 346 osk = cmpxchg(e->entry, sk, NULL);
266 if (osk == sk) { 347 if (osk == sk) {
267 list_del(&e->list);
268 smap_release_sock(psock, sk); 348 smap_release_sock(psock, sk);
269 } 349 }
270 } else { 350 } else {
271 hlist_del_rcu(&e->hash_link->hash_node); 351 struct htab_elem *link = rcu_dereference(e->hash_link);
272 smap_release_sock(psock, e->hash_link->sk); 352 struct bpf_htab *htab = rcu_dereference(e->htab);
273 free_htab_elem(e->htab, e->hash_link); 353 struct hlist_head *head;
354 struct htab_elem *l;
355 struct bucket *b;
356
357 b = __select_bucket(htab, link->hash);
358 head = &b->head;
359 raw_spin_lock_bh(&b->lock);
360 l = lookup_elem_raw(head,
361 link->hash, link->key,
362 htab->map.key_size);
363 /* If another thread deleted this object skip deletion.
364 * The refcnt on psock may or may not be zero.
365 */
366 if (l) {
367 hlist_del_rcu(&link->hash_node);
368 smap_release_sock(psock, link->sk);
369 free_htab_elem(htab, link);
370 }
371 raw_spin_unlock_bh(&b->lock);
274 } 372 }
373 e = psock_map_pop(sk, psock);
275 } 374 }
276 write_unlock_bh(&sk->sk_callback_lock);
277 rcu_read_unlock(); 375 rcu_read_unlock();
376 release_sock(sk);
278 close_fun(sk, timeout); 377 close_fun(sk, timeout);
279} 378}
280 379
@@ -472,7 +571,8 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
472 while (sg[i].length) { 571 while (sg[i].length) {
473 free += sg[i].length; 572 free += sg[i].length;
474 sk_mem_uncharge(sk, sg[i].length); 573 sk_mem_uncharge(sk, sg[i].length);
475 put_page(sg_page(&sg[i])); 574 if (!md->skb)
575 put_page(sg_page(&sg[i]));
476 sg[i].length = 0; 576 sg[i].length = 0;
477 sg[i].page_link = 0; 577 sg[i].page_link = 0;
478 sg[i].offset = 0; 578 sg[i].offset = 0;
@@ -481,6 +581,8 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
481 if (i == MAX_SKB_FRAGS) 581 if (i == MAX_SKB_FRAGS)
482 i = 0; 582 i = 0;
483 } 583 }
584 if (md->skb)
585 consume_skb(md->skb);
484 586
485 return free; 587 return free;
486} 588}
@@ -1111,8 +1213,7 @@ static void bpf_tcp_msg_add(struct smap_psock *psock,
1111 1213
1112static int bpf_tcp_ulp_register(void) 1214static int bpf_tcp_ulp_register(void)
1113{ 1215{
1114 tcp_bpf_proto = tcp_prot; 1216 build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot);
1115 tcp_bpf_proto.close = bpf_tcp_close;
1116 /* Once BPF TX ULP is registered it is never unregistered. It 1217 /* Once BPF TX ULP is registered it is never unregistered. It
1117 * will be in the ULP list for the lifetime of the system. Doing 1218 * will be in the ULP list for the lifetime of the system. Doing
1118 * duplicate registers is not a problem. 1219 * duplicate registers is not a problem.
@@ -1135,7 +1236,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
1135 */ 1236 */
1136 TCP_SKB_CB(skb)->bpf.sk_redir = NULL; 1237 TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
1137 skb->sk = psock->sock; 1238 skb->sk = psock->sock;
1138 bpf_compute_data_pointers(skb); 1239 bpf_compute_data_end_sk_skb(skb);
1139 preempt_disable(); 1240 preempt_disable();
1140 rc = (*prog->bpf_func)(skb, prog->insnsi); 1241 rc = (*prog->bpf_func)(skb, prog->insnsi);
1141 preempt_enable(); 1242 preempt_enable();
@@ -1357,7 +1458,9 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
1357{ 1458{
1358 if (refcount_dec_and_test(&psock->refcnt)) { 1459 if (refcount_dec_and_test(&psock->refcnt)) {
1359 tcp_cleanup_ulp(sock); 1460 tcp_cleanup_ulp(sock);
1461 write_lock_bh(&sock->sk_callback_lock);
1360 smap_stop_sock(psock, sock); 1462 smap_stop_sock(psock, sock);
1463 write_unlock_bh(&sock->sk_callback_lock);
1361 clear_bit(SMAP_TX_RUNNING, &psock->state); 1464 clear_bit(SMAP_TX_RUNNING, &psock->state);
1362 rcu_assign_sk_user_data(sock, NULL); 1465 rcu_assign_sk_user_data(sock, NULL);
1363 call_rcu_sched(&psock->rcu, smap_destroy_psock); 1466 call_rcu_sched(&psock->rcu, smap_destroy_psock);
@@ -1388,7 +1491,7 @@ static int smap_parse_func_strparser(struct strparser *strp,
1388 * any socket yet. 1491 * any socket yet.
1389 */ 1492 */
1390 skb->sk = psock->sock; 1493 skb->sk = psock->sock;
1391 bpf_compute_data_pointers(skb); 1494 bpf_compute_data_end_sk_skb(skb);
1392 rc = (*prog->bpf_func)(skb, prog->insnsi); 1495 rc = (*prog->bpf_func)(skb, prog->insnsi);
1393 skb->sk = NULL; 1496 skb->sk = NULL;
1394 rcu_read_unlock(); 1497 rcu_read_unlock();
@@ -1508,6 +1611,7 @@ static struct smap_psock *smap_init_psock(struct sock *sock, int node)
1508 INIT_LIST_HEAD(&psock->maps); 1611 INIT_LIST_HEAD(&psock->maps);
1509 INIT_LIST_HEAD(&psock->ingress); 1612 INIT_LIST_HEAD(&psock->ingress);
1510 refcount_set(&psock->refcnt, 1); 1613 refcount_set(&psock->refcnt, 1);
1614 spin_lock_init(&psock->maps_lock);
1511 1615
1512 rcu_assign_sk_user_data(sock, psock); 1616 rcu_assign_sk_user_data(sock, psock);
1513 sock_hold(sock); 1617 sock_hold(sock);
@@ -1564,18 +1668,32 @@ free_stab:
1564 return ERR_PTR(err); 1668 return ERR_PTR(err);
1565} 1669}
1566 1670
1567static void smap_list_remove(struct smap_psock *psock, 1671static void smap_list_map_remove(struct smap_psock *psock,
1568 struct sock **entry, 1672 struct sock **entry)
1569 struct htab_elem *hash_link)
1570{ 1673{
1571 struct smap_psock_map_entry *e, *tmp; 1674 struct smap_psock_map_entry *e, *tmp;
1572 1675
1676 spin_lock_bh(&psock->maps_lock);
1573 list_for_each_entry_safe(e, tmp, &psock->maps, list) { 1677 list_for_each_entry_safe(e, tmp, &psock->maps, list) {
1574 if (e->entry == entry || e->hash_link == hash_link) { 1678 if (e->entry == entry)
1679 list_del(&e->list);
1680 }
1681 spin_unlock_bh(&psock->maps_lock);
1682}
1683
1684static void smap_list_hash_remove(struct smap_psock *psock,
1685 struct htab_elem *hash_link)
1686{
1687 struct smap_psock_map_entry *e, *tmp;
1688
1689 spin_lock_bh(&psock->maps_lock);
1690 list_for_each_entry_safe(e, tmp, &psock->maps, list) {
1691 struct htab_elem *c = rcu_dereference(e->hash_link);
1692
1693 if (c == hash_link)
1575 list_del(&e->list); 1694 list_del(&e->list);
1576 break;
1577 }
1578 } 1695 }
1696 spin_unlock_bh(&psock->maps_lock);
1579} 1697}
1580 1698
1581static void sock_map_free(struct bpf_map *map) 1699static void sock_map_free(struct bpf_map *map)
@@ -1601,7 +1719,6 @@ static void sock_map_free(struct bpf_map *map)
1601 if (!sock) 1719 if (!sock)
1602 continue; 1720 continue;
1603 1721
1604 write_lock_bh(&sock->sk_callback_lock);
1605 psock = smap_psock_sk(sock); 1722 psock = smap_psock_sk(sock);
1606 /* This check handles a racing sock event that can get the 1723 /* This check handles a racing sock event that can get the
1607 * sk_callback_lock before this case but after xchg happens 1724 * sk_callback_lock before this case but after xchg happens
@@ -1609,10 +1726,9 @@ static void sock_map_free(struct bpf_map *map)
1609 * to be null and queued for garbage collection. 1726 * to be null and queued for garbage collection.
1610 */ 1727 */
1611 if (likely(psock)) { 1728 if (likely(psock)) {
1612 smap_list_remove(psock, &stab->sock_map[i], NULL); 1729 smap_list_map_remove(psock, &stab->sock_map[i]);
1613 smap_release_sock(psock, sock); 1730 smap_release_sock(psock, sock);
1614 } 1731 }
1615 write_unlock_bh(&sock->sk_callback_lock);
1616 } 1732 }
1617 rcu_read_unlock(); 1733 rcu_read_unlock();
1618 1734
@@ -1661,17 +1777,15 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
1661 if (!sock) 1777 if (!sock)
1662 return -EINVAL; 1778 return -EINVAL;
1663 1779
1664 write_lock_bh(&sock->sk_callback_lock);
1665 psock = smap_psock_sk(sock); 1780 psock = smap_psock_sk(sock);
1666 if (!psock) 1781 if (!psock)
1667 goto out; 1782 goto out;
1668 1783
1669 if (psock->bpf_parse) 1784 if (psock->bpf_parse)
1670 smap_stop_sock(psock, sock); 1785 smap_stop_sock(psock, sock);
1671 smap_list_remove(psock, &stab->sock_map[k], NULL); 1786 smap_list_map_remove(psock, &stab->sock_map[k]);
1672 smap_release_sock(psock, sock); 1787 smap_release_sock(psock, sock);
1673out: 1788out:
1674 write_unlock_bh(&sock->sk_callback_lock);
1675 return 0; 1789 return 0;
1676} 1790}
1677 1791
@@ -1752,7 +1866,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
1752 } 1866 }
1753 } 1867 }
1754 1868
1755 write_lock_bh(&sock->sk_callback_lock);
1756 psock = smap_psock_sk(sock); 1869 psock = smap_psock_sk(sock);
1757 1870
1758 /* 2. Do not allow inheriting programs if psock exists and has 1871 /* 2. Do not allow inheriting programs if psock exists and has
@@ -1789,7 +1902,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
1789 e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); 1902 e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
1790 if (!e) { 1903 if (!e) {
1791 err = -ENOMEM; 1904 err = -ENOMEM;
1792 goto out_progs; 1905 goto out_free;
1793 } 1906 }
1794 } 1907 }
1795 1908
@@ -1809,7 +1922,9 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
1809 if (err) 1922 if (err)
1810 goto out_free; 1923 goto out_free;
1811 smap_init_progs(psock, verdict, parse); 1924 smap_init_progs(psock, verdict, parse);
1925 write_lock_bh(&sock->sk_callback_lock);
1812 smap_start_sock(psock, sock); 1926 smap_start_sock(psock, sock);
1927 write_unlock_bh(&sock->sk_callback_lock);
1813 } 1928 }
1814 1929
1815 /* 4. Place psock in sockmap for use and stop any programs on 1930 /* 4. Place psock in sockmap for use and stop any programs on
@@ -1819,9 +1934,10 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
1819 */ 1934 */
1820 if (map_link) { 1935 if (map_link) {
1821 e->entry = map_link; 1936 e->entry = map_link;
1937 spin_lock_bh(&psock->maps_lock);
1822 list_add_tail(&e->list, &psock->maps); 1938 list_add_tail(&e->list, &psock->maps);
1939 spin_unlock_bh(&psock->maps_lock);
1823 } 1940 }
1824 write_unlock_bh(&sock->sk_callback_lock);
1825 return err; 1941 return err;
1826out_free: 1942out_free:
1827 smap_release_sock(psock, sock); 1943 smap_release_sock(psock, sock);
@@ -1832,7 +1948,6 @@ out_progs:
1832 } 1948 }
1833 if (tx_msg) 1949 if (tx_msg)
1834 bpf_prog_put(tx_msg); 1950 bpf_prog_put(tx_msg);
1835 write_unlock_bh(&sock->sk_callback_lock);
1836 kfree(e); 1951 kfree(e);
1837 return err; 1952 return err;
1838} 1953}
@@ -1869,10 +1984,8 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
1869 if (osock) { 1984 if (osock) {
1870 struct smap_psock *opsock = smap_psock_sk(osock); 1985 struct smap_psock *opsock = smap_psock_sk(osock);
1871 1986
1872 write_lock_bh(&osock->sk_callback_lock); 1987 smap_list_map_remove(opsock, &stab->sock_map[i]);
1873 smap_list_remove(opsock, &stab->sock_map[i], NULL);
1874 smap_release_sock(opsock, osock); 1988 smap_release_sock(opsock, osock);
1875 write_unlock_bh(&osock->sk_callback_lock);
1876 } 1989 }
1877out: 1990out:
1878 return err; 1991 return err;
@@ -1915,6 +2028,24 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
1915 return 0; 2028 return 0;
1916} 2029}
1917 2030
2031int sockmap_get_from_fd(const union bpf_attr *attr, int type,
2032 struct bpf_prog *prog)
2033{
2034 int ufd = attr->target_fd;
2035 struct bpf_map *map;
2036 struct fd f;
2037 int err;
2038
2039 f = fdget(ufd);
2040 map = __bpf_map_get(f);
2041 if (IS_ERR(map))
2042 return PTR_ERR(map);
2043
2044 err = sock_map_prog(map, prog, attr->attach_type);
2045 fdput(f);
2046 return err;
2047}
2048
1918static void *sock_map_lookup(struct bpf_map *map, void *key) 2049static void *sock_map_lookup(struct bpf_map *map, void *key)
1919{ 2050{
1920 return NULL; 2051 return NULL;
@@ -1944,7 +2075,13 @@ static int sock_map_update_elem(struct bpf_map *map,
1944 return -EOPNOTSUPP; 2075 return -EOPNOTSUPP;
1945 } 2076 }
1946 2077
2078 lock_sock(skops.sk);
2079 preempt_disable();
2080 rcu_read_lock();
1947 err = sock_map_ctx_update_elem(&skops, map, key, flags); 2081 err = sock_map_ctx_update_elem(&skops, map, key, flags);
2082 rcu_read_unlock();
2083 preempt_enable();
2084 release_sock(skops.sk);
1948 fput(socket->file); 2085 fput(socket->file);
1949 return err; 2086 return err;
1950} 2087}
@@ -2043,14 +2180,13 @@ free_htab:
2043 return ERR_PTR(err); 2180 return ERR_PTR(err);
2044} 2181}
2045 2182
2046static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) 2183static void __bpf_htab_free(struct rcu_head *rcu)
2047{ 2184{
2048 return &htab->buckets[hash & (htab->n_buckets - 1)]; 2185 struct bpf_htab *htab;
2049}
2050 2186
2051static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) 2187 htab = container_of(rcu, struct bpf_htab, rcu);
2052{ 2188 bpf_map_area_free(htab->buckets);
2053 return &__select_bucket(htab, hash)->head; 2189 kfree(htab);
2054} 2190}
2055 2191
2056static void sock_hash_free(struct bpf_map *map) 2192static void sock_hash_free(struct bpf_map *map)
@@ -2069,16 +2205,18 @@ static void sock_hash_free(struct bpf_map *map)
2069 */ 2205 */
2070 rcu_read_lock(); 2206 rcu_read_lock();
2071 for (i = 0; i < htab->n_buckets; i++) { 2207 for (i = 0; i < htab->n_buckets; i++) {
2072 struct hlist_head *head = select_bucket(htab, i); 2208 struct bucket *b = __select_bucket(htab, i);
2209 struct hlist_head *head;
2073 struct hlist_node *n; 2210 struct hlist_node *n;
2074 struct htab_elem *l; 2211 struct htab_elem *l;
2075 2212
2213 raw_spin_lock_bh(&b->lock);
2214 head = &b->head;
2076 hlist_for_each_entry_safe(l, n, head, hash_node) { 2215 hlist_for_each_entry_safe(l, n, head, hash_node) {
2077 struct sock *sock = l->sk; 2216 struct sock *sock = l->sk;
2078 struct smap_psock *psock; 2217 struct smap_psock *psock;
2079 2218
2080 hlist_del_rcu(&l->hash_node); 2219 hlist_del_rcu(&l->hash_node);
2081 write_lock_bh(&sock->sk_callback_lock);
2082 psock = smap_psock_sk(sock); 2220 psock = smap_psock_sk(sock);
2083 /* This check handles a racing sock event that can get 2221 /* This check handles a racing sock event that can get
2084 * the sk_callback_lock before this case but after xchg 2222 * the sk_callback_lock before this case but after xchg
@@ -2086,16 +2224,15 @@ static void sock_hash_free(struct bpf_map *map)
2086 * (psock) to be null and queued for garbage collection. 2224 * (psock) to be null and queued for garbage collection.
2087 */ 2225 */
2088 if (likely(psock)) { 2226 if (likely(psock)) {
2089 smap_list_remove(psock, NULL, l); 2227 smap_list_hash_remove(psock, l);
2090 smap_release_sock(psock, sock); 2228 smap_release_sock(psock, sock);
2091 } 2229 }
2092 write_unlock_bh(&sock->sk_callback_lock); 2230 free_htab_elem(htab, l);
2093 kfree(l);
2094 } 2231 }
2232 raw_spin_unlock_bh(&b->lock);
2095 } 2233 }
2096 rcu_read_unlock(); 2234 rcu_read_unlock();
2097 bpf_map_area_free(htab->buckets); 2235 call_rcu(&htab->rcu, __bpf_htab_free);
2098 kfree(htab);
2099} 2236}
2100 2237
2101static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, 2238static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
@@ -2122,19 +2259,6 @@ static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
2122 return l_new; 2259 return l_new;
2123} 2260}
2124 2261
2125static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
2126 u32 hash, void *key, u32 key_size)
2127{
2128 struct htab_elem *l;
2129
2130 hlist_for_each_entry_rcu(l, head, hash_node) {
2131 if (l->hash == hash && !memcmp(&l->key, key, key_size))
2132 return l;
2133 }
2134
2135 return NULL;
2136}
2137
2138static inline u32 htab_map_hash(const void *key, u32 key_len) 2262static inline u32 htab_map_hash(const void *key, u32 key_len)
2139{ 2263{
2140 return jhash(key, key_len, 0); 2264 return jhash(key, key_len, 0);
@@ -2230,7 +2354,10 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
2230 if (err) 2354 if (err)
2231 goto err; 2355 goto err;
2232 2356
2233 /* bpf_map_update_elem() can be called in_irq() */ 2357 /* psock is valid here because otherwise above *ctx_update_elem would
2358 * have thrown an error. It is safe to skip error check.
2359 */
2360 psock = smap_psock_sk(sock);
2234 raw_spin_lock_bh(&b->lock); 2361 raw_spin_lock_bh(&b->lock);
2235 l_old = lookup_elem_raw(head, hash, key, key_size); 2362 l_old = lookup_elem_raw(head, hash, key, key_size);
2236 if (l_old && map_flags == BPF_NOEXIST) { 2363 if (l_old && map_flags == BPF_NOEXIST) {
@@ -2248,15 +2375,12 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
2248 goto bucket_err; 2375 goto bucket_err;
2249 } 2376 }
2250 2377
2251 psock = smap_psock_sk(sock); 2378 rcu_assign_pointer(e->hash_link, l_new);
2252 if (unlikely(!psock)) { 2379 rcu_assign_pointer(e->htab,
2253 err = -EINVAL; 2380 container_of(map, struct bpf_htab, map));
2254 goto bucket_err; 2381 spin_lock_bh(&psock->maps_lock);
2255 }
2256
2257 e->hash_link = l_new;
2258 e->htab = container_of(map, struct bpf_htab, map);
2259 list_add_tail(&e->list, &psock->maps); 2382 list_add_tail(&e->list, &psock->maps);
2383 spin_unlock_bh(&psock->maps_lock);
2260 2384
2261 /* add new element to the head of the list, so that 2385 /* add new element to the head of the list, so that
2262 * concurrent search will find it before old elem 2386 * concurrent search will find it before old elem
@@ -2266,19 +2390,17 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
2266 psock = smap_psock_sk(l_old->sk); 2390 psock = smap_psock_sk(l_old->sk);
2267 2391
2268 hlist_del_rcu(&l_old->hash_node); 2392 hlist_del_rcu(&l_old->hash_node);
2269 smap_list_remove(psock, NULL, l_old); 2393 smap_list_hash_remove(psock, l_old);
2270 smap_release_sock(psock, l_old->sk); 2394 smap_release_sock(psock, l_old->sk);
2271 free_htab_elem(htab, l_old); 2395 free_htab_elem(htab, l_old);
2272 } 2396 }
2273 raw_spin_unlock_bh(&b->lock); 2397 raw_spin_unlock_bh(&b->lock);
2274 return 0; 2398 return 0;
2275bucket_err: 2399bucket_err:
2400 smap_release_sock(psock, sock);
2276 raw_spin_unlock_bh(&b->lock); 2401 raw_spin_unlock_bh(&b->lock);
2277err: 2402err:
2278 kfree(e); 2403 kfree(e);
2279 psock = smap_psock_sk(sock);
2280 if (psock)
2281 smap_release_sock(psock, sock);
2282 return err; 2404 return err;
2283} 2405}
2284 2406
@@ -2300,7 +2422,13 @@ static int sock_hash_update_elem(struct bpf_map *map,
2300 return -EINVAL; 2422 return -EINVAL;
2301 } 2423 }
2302 2424
2425 lock_sock(skops.sk);
2426 preempt_disable();
2427 rcu_read_lock();
2303 err = sock_hash_ctx_update_elem(&skops, map, key, flags); 2428 err = sock_hash_ctx_update_elem(&skops, map, key, flags);
2429 rcu_read_unlock();
2430 preempt_enable();
2431 release_sock(skops.sk);
2304 fput(socket->file); 2432 fput(socket->file);
2305 return err; 2433 return err;
2306} 2434}
@@ -2326,7 +2454,6 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key)
2326 struct smap_psock *psock; 2454 struct smap_psock *psock;
2327 2455
2328 hlist_del_rcu(&l->hash_node); 2456 hlist_del_rcu(&l->hash_node);
2329 write_lock_bh(&sock->sk_callback_lock);
2330 psock = smap_psock_sk(sock); 2457 psock = smap_psock_sk(sock);
2331 /* This check handles a racing sock event that can get the 2458 /* This check handles a racing sock event that can get the
2332 * sk_callback_lock before this case but after xchg happens 2459 * sk_callback_lock before this case but after xchg happens
@@ -2334,10 +2461,9 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key)
2334 * to be null and queued for garbage collection. 2461 * to be null and queued for garbage collection.
2335 */ 2462 */
2336 if (likely(psock)) { 2463 if (likely(psock)) {
2337 smap_list_remove(psock, NULL, l); 2464 smap_list_hash_remove(psock, l);
2338 smap_release_sock(psock, sock); 2465 smap_release_sock(psock, sock);
2339 } 2466 }
2340 write_unlock_bh(&sock->sk_callback_lock);
2341 free_htab_elem(htab, l); 2467 free_htab_elem(htab, l);
2342 ret = 0; 2468 ret = 0;
2343 } 2469 }
@@ -2359,10 +2485,8 @@ struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
2359 b = __select_bucket(htab, hash); 2485 b = __select_bucket(htab, hash);
2360 head = &b->head; 2486 head = &b->head;
2361 2487
2362 raw_spin_lock_bh(&b->lock);
2363 l = lookup_elem_raw(head, hash, key, key_size); 2488 l = lookup_elem_raw(head, hash, key, key_size);
2364 sk = l ? l->sk : NULL; 2489 sk = l ? l->sk : NULL;
2365 raw_spin_unlock_bh(&b->lock);
2366 return sk; 2490 return sk;
2367} 2491}
2368 2492
@@ -2383,6 +2507,7 @@ const struct bpf_map_ops sock_hash_ops = {
2383 .map_get_next_key = sock_hash_get_next_key, 2507 .map_get_next_key = sock_hash_get_next_key,
2384 .map_update_elem = sock_hash_update_elem, 2508 .map_update_elem = sock_hash_update_elem,
2385 .map_delete_elem = sock_hash_delete_elem, 2509 .map_delete_elem = sock_hash_delete_elem,
2510 .map_release_uref = sock_map_release,
2386}; 2511};
2387 2512
2388BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, 2513BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 35dc466641f2..a31a1ba0f8ea 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -735,7 +735,9 @@ static int map_update_elem(union bpf_attr *attr)
735 if (bpf_map_is_dev_bound(map)) { 735 if (bpf_map_is_dev_bound(map)) {
736 err = bpf_map_offload_update_elem(map, key, value, attr->flags); 736 err = bpf_map_offload_update_elem(map, key, value, attr->flags);
737 goto out; 737 goto out;
738 } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { 738 } else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
739 map->map_type == BPF_MAP_TYPE_SOCKHASH ||
740 map->map_type == BPF_MAP_TYPE_SOCKMAP) {
739 err = map->ops->map_update_elem(map, key, value, attr->flags); 741 err = map->ops->map_update_elem(map, key, value, attr->flags);
740 goto out; 742 goto out;
741 } 743 }
@@ -1483,8 +1485,6 @@ out_free_tp:
1483 return err; 1485 return err;
1484} 1486}
1485 1487
1486#ifdef CONFIG_CGROUP_BPF
1487
1488static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, 1488static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
1489 enum bpf_attach_type attach_type) 1489 enum bpf_attach_type attach_type)
1490{ 1490{
@@ -1499,40 +1499,6 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
1499 1499
1500#define BPF_PROG_ATTACH_LAST_FIELD attach_flags 1500#define BPF_PROG_ATTACH_LAST_FIELD attach_flags
1501 1501
1502static int sockmap_get_from_fd(const union bpf_attr *attr,
1503 int type, bool attach)
1504{
1505 struct bpf_prog *prog = NULL;
1506 int ufd = attr->target_fd;
1507 struct bpf_map *map;
1508 struct fd f;
1509 int err;
1510
1511 f = fdget(ufd);
1512 map = __bpf_map_get(f);
1513 if (IS_ERR(map))
1514 return PTR_ERR(map);
1515
1516 if (attach) {
1517 prog = bpf_prog_get_type(attr->attach_bpf_fd, type);
1518 if (IS_ERR(prog)) {
1519 fdput(f);
1520 return PTR_ERR(prog);
1521 }
1522 }
1523
1524 err = sock_map_prog(map, prog, attr->attach_type);
1525 if (err) {
1526 fdput(f);
1527 if (prog)
1528 bpf_prog_put(prog);
1529 return err;
1530 }
1531
1532 fdput(f);
1533 return 0;
1534}
1535
1536#define BPF_F_ATTACH_MASK \ 1502#define BPF_F_ATTACH_MASK \
1537 (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI) 1503 (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)
1538 1504
@@ -1540,7 +1506,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
1540{ 1506{
1541 enum bpf_prog_type ptype; 1507 enum bpf_prog_type ptype;
1542 struct bpf_prog *prog; 1508 struct bpf_prog *prog;
1543 struct cgroup *cgrp;
1544 int ret; 1509 int ret;
1545 1510
1546 if (!capable(CAP_NET_ADMIN)) 1511 if (!capable(CAP_NET_ADMIN))
@@ -1577,12 +1542,15 @@ static int bpf_prog_attach(const union bpf_attr *attr)
1577 ptype = BPF_PROG_TYPE_CGROUP_DEVICE; 1542 ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
1578 break; 1543 break;
1579 case BPF_SK_MSG_VERDICT: 1544 case BPF_SK_MSG_VERDICT:
1580 return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, true); 1545 ptype = BPF_PROG_TYPE_SK_MSG;
1546 break;
1581 case BPF_SK_SKB_STREAM_PARSER: 1547 case BPF_SK_SKB_STREAM_PARSER:
1582 case BPF_SK_SKB_STREAM_VERDICT: 1548 case BPF_SK_SKB_STREAM_VERDICT:
1583 return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true); 1549 ptype = BPF_PROG_TYPE_SK_SKB;
1550 break;
1584 case BPF_LIRC_MODE2: 1551 case BPF_LIRC_MODE2:
1585 return lirc_prog_attach(attr); 1552 ptype = BPF_PROG_TYPE_LIRC_MODE2;
1553 break;
1586 default: 1554 default:
1587 return -EINVAL; 1555 return -EINVAL;
1588 } 1556 }
@@ -1596,18 +1564,20 @@ static int bpf_prog_attach(const union bpf_attr *attr)
1596 return -EINVAL; 1564 return -EINVAL;
1597 } 1565 }
1598 1566
1599 cgrp = cgroup_get_from_fd(attr->target_fd); 1567 switch (ptype) {
1600 if (IS_ERR(cgrp)) { 1568 case BPF_PROG_TYPE_SK_SKB:
1601 bpf_prog_put(prog); 1569 case BPF_PROG_TYPE_SK_MSG:
1602 return PTR_ERR(cgrp); 1570 ret = sockmap_get_from_fd(attr, ptype, prog);
1571 break;
1572 case BPF_PROG_TYPE_LIRC_MODE2:
1573 ret = lirc_prog_attach(attr, prog);
1574 break;
1575 default:
1576 ret = cgroup_bpf_prog_attach(attr, ptype, prog);
1603 } 1577 }
1604 1578
1605 ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
1606 attr->attach_flags);
1607 if (ret) 1579 if (ret)
1608 bpf_prog_put(prog); 1580 bpf_prog_put(prog);
1609 cgroup_put(cgrp);
1610
1611 return ret; 1581 return ret;
1612} 1582}
1613 1583
@@ -1616,9 +1586,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
1616static int bpf_prog_detach(const union bpf_attr *attr) 1586static int bpf_prog_detach(const union bpf_attr *attr)
1617{ 1587{
1618 enum bpf_prog_type ptype; 1588 enum bpf_prog_type ptype;
1619 struct bpf_prog *prog;
1620 struct cgroup *cgrp;
1621 int ret;
1622 1589
1623 if (!capable(CAP_NET_ADMIN)) 1590 if (!capable(CAP_NET_ADMIN))
1624 return -EPERM; 1591 return -EPERM;
@@ -1651,29 +1618,17 @@ static int bpf_prog_detach(const union bpf_attr *attr)
1651 ptype = BPF_PROG_TYPE_CGROUP_DEVICE; 1618 ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
1652 break; 1619 break;
1653 case BPF_SK_MSG_VERDICT: 1620 case BPF_SK_MSG_VERDICT:
1654 return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, false); 1621 return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL);
1655 case BPF_SK_SKB_STREAM_PARSER: 1622 case BPF_SK_SKB_STREAM_PARSER:
1656 case BPF_SK_SKB_STREAM_VERDICT: 1623 case BPF_SK_SKB_STREAM_VERDICT:
1657 return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false); 1624 return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL);
1658 case BPF_LIRC_MODE2: 1625 case BPF_LIRC_MODE2:
1659 return lirc_prog_detach(attr); 1626 return lirc_prog_detach(attr);
1660 default: 1627 default:
1661 return -EINVAL; 1628 return -EINVAL;
1662 } 1629 }
1663 1630
1664 cgrp = cgroup_get_from_fd(attr->target_fd); 1631 return cgroup_bpf_prog_detach(attr, ptype);
1665 if (IS_ERR(cgrp))
1666 return PTR_ERR(cgrp);
1667
1668 prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
1669 if (IS_ERR(prog))
1670 prog = NULL;
1671
1672 ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
1673 if (prog)
1674 bpf_prog_put(prog);
1675 cgroup_put(cgrp);
1676 return ret;
1677} 1632}
1678 1633
1679#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt 1634#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
@@ -1681,9 +1636,6 @@ static int bpf_prog_detach(const union bpf_attr *attr)
1681static int bpf_prog_query(const union bpf_attr *attr, 1636static int bpf_prog_query(const union bpf_attr *attr,
1682 union bpf_attr __user *uattr) 1637 union bpf_attr __user *uattr)
1683{ 1638{
1684 struct cgroup *cgrp;
1685 int ret;
1686
1687 if (!capable(CAP_NET_ADMIN)) 1639 if (!capable(CAP_NET_ADMIN))
1688 return -EPERM; 1640 return -EPERM;
1689 if (CHECK_ATTR(BPF_PROG_QUERY)) 1641 if (CHECK_ATTR(BPF_PROG_QUERY))
@@ -1711,14 +1663,9 @@ static int bpf_prog_query(const union bpf_attr *attr,
1711 default: 1663 default:
1712 return -EINVAL; 1664 return -EINVAL;
1713 } 1665 }
1714 cgrp = cgroup_get_from_fd(attr->query.target_fd); 1666
1715 if (IS_ERR(cgrp)) 1667 return cgroup_bpf_prog_query(attr, uattr);
1716 return PTR_ERR(cgrp);
1717 ret = cgroup_bpf_query(cgrp, attr, uattr);
1718 cgroup_put(cgrp);
1719 return ret;
1720} 1668}
1721#endif /* CONFIG_CGROUP_BPF */
1722 1669
1723#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration 1670#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
1724 1671
@@ -2365,7 +2312,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
2365 case BPF_OBJ_GET: 2312 case BPF_OBJ_GET:
2366 err = bpf_obj_get(&attr); 2313 err = bpf_obj_get(&attr);
2367 break; 2314 break;
2368#ifdef CONFIG_CGROUP_BPF
2369 case BPF_PROG_ATTACH: 2315 case BPF_PROG_ATTACH:
2370 err = bpf_prog_attach(&attr); 2316 err = bpf_prog_attach(&attr);
2371 break; 2317 break;
@@ -2375,7 +2321,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
2375 case BPF_PROG_QUERY: 2321 case BPF_PROG_QUERY:
2376 err = bpf_prog_query(&attr, uattr); 2322 err = bpf_prog_query(&attr, uattr);
2377 break; 2323 break;
2378#endif
2379 case BPF_PROG_TEST_RUN: 2324 case BPF_PROG_TEST_RUN:
2380 err = bpf_prog_test_run(&attr, uattr); 2325 err = bpf_prog_test_run(&attr, uattr);
2381 break; 2326 break;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9e2bf834f13a..63aaac52a265 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5430,6 +5430,10 @@ static int jit_subprogs(struct bpf_verifier_env *env)
5430 if (insn->code != (BPF_JMP | BPF_CALL) || 5430 if (insn->code != (BPF_JMP | BPF_CALL) ||
5431 insn->src_reg != BPF_PSEUDO_CALL) 5431 insn->src_reg != BPF_PSEUDO_CALL)
5432 continue; 5432 continue;
5433 /* Upon error here we cannot fall back to interpreter but
5434 * need a hard reject of the program. Thus -EFAULT is
5435 * propagated in any case.
5436 */
5433 subprog = find_subprog(env, i + insn->imm + 1); 5437 subprog = find_subprog(env, i + insn->imm + 1);
5434 if (subprog < 0) { 5438 if (subprog < 0) {
5435 WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", 5439 WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
@@ -5450,7 +5454,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
5450 5454
5451 func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL); 5455 func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL);
5452 if (!func) 5456 if (!func)
5453 return -ENOMEM; 5457 goto out_undo_insn;
5454 5458
5455 for (i = 0; i < env->subprog_cnt; i++) { 5459 for (i = 0; i < env->subprog_cnt; i++) {
5456 subprog_start = subprog_end; 5460 subprog_start = subprog_end;
@@ -5515,7 +5519,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
5515 tmp = bpf_int_jit_compile(func[i]); 5519 tmp = bpf_int_jit_compile(func[i]);
5516 if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { 5520 if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) {
5517 verbose(env, "JIT doesn't support bpf-to-bpf calls\n"); 5521 verbose(env, "JIT doesn't support bpf-to-bpf calls\n");
5518 err = -EFAULT; 5522 err = -ENOTSUPP;
5519 goto out_free; 5523 goto out_free;
5520 } 5524 }
5521 cond_resched(); 5525 cond_resched();
@@ -5552,6 +5556,7 @@ out_free:
5552 if (func[i]) 5556 if (func[i])
5553 bpf_jit_free(func[i]); 5557 bpf_jit_free(func[i]);
5554 kfree(func); 5558 kfree(func);
5559out_undo_insn:
5555 /* cleanup main prog to be interpreted */ 5560 /* cleanup main prog to be interpreted */
5556 prog->jit_requested = 0; 5561 prog->jit_requested = 0;
5557 for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { 5562 for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
@@ -5578,6 +5583,8 @@ static int fixup_call_args(struct bpf_verifier_env *env)
5578 err = jit_subprogs(env); 5583 err = jit_subprogs(env);
5579 if (err == 0) 5584 if (err == 0)
5580 return 0; 5585 return 0;
5586 if (err == -EFAULT)
5587 return err;
5581 } 5588 }
5582#ifndef CONFIG_BPF_JIT_ALWAYS_ON 5589#ifndef CONFIG_BPF_JIT_ALWAYS_ON
5583 for (i = 0; i < prog->len; i++, insn++) { 5590 for (i = 0; i < prog->len; i++, insn++) {
diff --git a/kernel/fork.c b/kernel/fork.c
index 9440d61b925c..a191c05e757d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -303,11 +303,38 @@ struct kmem_cache *files_cachep;
303struct kmem_cache *fs_cachep; 303struct kmem_cache *fs_cachep;
304 304
305/* SLAB cache for vm_area_struct structures */ 305/* SLAB cache for vm_area_struct structures */
306struct kmem_cache *vm_area_cachep; 306static struct kmem_cache *vm_area_cachep;
307 307
308/* SLAB cache for mm_struct structures (tsk->mm) */ 308/* SLAB cache for mm_struct structures (tsk->mm) */
309static struct kmem_cache *mm_cachep; 309static struct kmem_cache *mm_cachep;
310 310
311struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
312{
313 struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
314
315 if (vma) {
316 vma->vm_mm = mm;
317 INIT_LIST_HEAD(&vma->anon_vma_chain);
318 }
319 return vma;
320}
321
322struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
323{
324 struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
325
326 if (new) {
327 *new = *orig;
328 INIT_LIST_HEAD(&new->anon_vma_chain);
329 }
330 return new;
331}
332
333void vm_area_free(struct vm_area_struct *vma)
334{
335 kmem_cache_free(vm_area_cachep, vma);
336}
337
311static void account_kernel_stack(struct task_struct *tsk, int account) 338static void account_kernel_stack(struct task_struct *tsk, int account)
312{ 339{
313 void *stack = task_stack_page(tsk); 340 void *stack = task_stack_page(tsk);
@@ -455,11 +482,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
455 goto fail_nomem; 482 goto fail_nomem;
456 charge = len; 483 charge = len;
457 } 484 }
458 tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 485 tmp = vm_area_dup(mpnt);
459 if (!tmp) 486 if (!tmp)
460 goto fail_nomem; 487 goto fail_nomem;
461 *tmp = *mpnt;
462 INIT_LIST_HEAD(&tmp->anon_vma_chain);
463 retval = vma_dup_policy(mpnt, tmp); 488 retval = vma_dup_policy(mpnt, tmp);
464 if (retval) 489 if (retval)
465 goto fail_nomem_policy; 490 goto fail_nomem_policy;
@@ -539,7 +564,7 @@ fail_uprobe_end:
539fail_nomem_anon_vma_fork: 564fail_nomem_anon_vma_fork:
540 mpol_put(vma_policy(tmp)); 565 mpol_put(vma_policy(tmp));
541fail_nomem_policy: 566fail_nomem_policy:
542 kmem_cache_free(vm_area_cachep, tmp); 567 vm_area_free(tmp);
543fail_nomem: 568fail_nomem:
544 retval = -ENOMEM; 569 retval = -ENOMEM;
545 vm_unacct_memory(charge); 570 vm_unacct_memory(charge);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 481951bf091d..750cb8082694 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -177,9 +177,20 @@ void *kthread_probe_data(struct task_struct *task)
177static void __kthread_parkme(struct kthread *self) 177static void __kthread_parkme(struct kthread *self)
178{ 178{
179 for (;;) { 179 for (;;) {
180 set_current_state(TASK_PARKED); 180 /*
181 * TASK_PARKED is a special state; we must serialize against
182 * possible pending wakeups to avoid store-store collisions on
183 * task->state.
184 *
185 * Such a collision might possibly result in the task state
186 * changin from TASK_PARKED and us failing the
187 * wait_task_inactive() in kthread_park().
188 */
189 set_special_state(TASK_PARKED);
181 if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) 190 if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
182 break; 191 break;
192
193 complete_all(&self->parked);
183 schedule(); 194 schedule();
184 } 195 }
185 __set_current_state(TASK_RUNNING); 196 __set_current_state(TASK_RUNNING);
@@ -191,11 +202,6 @@ void kthread_parkme(void)
191} 202}
192EXPORT_SYMBOL_GPL(kthread_parkme); 203EXPORT_SYMBOL_GPL(kthread_parkme);
193 204
194void kthread_park_complete(struct task_struct *k)
195{
196 complete_all(&to_kthread(k)->parked);
197}
198
199static int kthread(void *_create) 205static int kthread(void *_create)
200{ 206{
201 /* Copy data: it's on kthread's stack */ 207 /* Copy data: it's on kthread's stack */
@@ -461,6 +467,9 @@ void kthread_unpark(struct task_struct *k)
461 467
462 reinit_completion(&kthread->parked); 468 reinit_completion(&kthread->parked);
463 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); 469 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
470 /*
471 * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup.
472 */
464 wake_up_state(k, TASK_PARKED); 473 wake_up_state(k, TASK_PARKED);
465} 474}
466EXPORT_SYMBOL_GPL(kthread_unpark); 475EXPORT_SYMBOL_GPL(kthread_unpark);
@@ -487,7 +496,16 @@ int kthread_park(struct task_struct *k)
487 set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); 496 set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
488 if (k != current) { 497 if (k != current) {
489 wake_up_process(k); 498 wake_up_process(k);
499 /*
500 * Wait for __kthread_parkme() to complete(), this means we
501 * _will_ have TASK_PARKED and are about to call schedule().
502 */
490 wait_for_completion(&kthread->parked); 503 wait_for_completion(&kthread->parked);
504 /*
505 * Now wait for that schedule() to complete and the task to
506 * get scheduled out.
507 */
508 WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED));
491 } 509 }
492 510
493 return 0; 511 return 0;
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 22b6acf1ad63..c6242d8594dc 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -85,9 +85,9 @@ static int rseq_update_cpu_id(struct task_struct *t)
85{ 85{
86 u32 cpu_id = raw_smp_processor_id(); 86 u32 cpu_id = raw_smp_processor_id();
87 87
88 if (__put_user(cpu_id, &t->rseq->cpu_id_start)) 88 if (put_user(cpu_id, &t->rseq->cpu_id_start))
89 return -EFAULT; 89 return -EFAULT;
90 if (__put_user(cpu_id, &t->rseq->cpu_id)) 90 if (put_user(cpu_id, &t->rseq->cpu_id))
91 return -EFAULT; 91 return -EFAULT;
92 trace_rseq_update(t); 92 trace_rseq_update(t);
93 return 0; 93 return 0;
@@ -100,14 +100,14 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t)
100 /* 100 /*
101 * Reset cpu_id_start to its initial state (0). 101 * Reset cpu_id_start to its initial state (0).
102 */ 102 */
103 if (__put_user(cpu_id_start, &t->rseq->cpu_id_start)) 103 if (put_user(cpu_id_start, &t->rseq->cpu_id_start))
104 return -EFAULT; 104 return -EFAULT;
105 /* 105 /*
106 * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming 106 * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming
107 * in after unregistration can figure out that rseq needs to be 107 * in after unregistration can figure out that rseq needs to be
108 * registered again. 108 * registered again.
109 */ 109 */
110 if (__put_user(cpu_id, &t->rseq->cpu_id)) 110 if (put_user(cpu_id, &t->rseq->cpu_id))
111 return -EFAULT; 111 return -EFAULT;
112 return 0; 112 return 0;
113} 113}
@@ -115,29 +115,36 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t)
115static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) 115static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
116{ 116{
117 struct rseq_cs __user *urseq_cs; 117 struct rseq_cs __user *urseq_cs;
118 unsigned long ptr; 118 u64 ptr;
119 u32 __user *usig; 119 u32 __user *usig;
120 u32 sig; 120 u32 sig;
121 int ret; 121 int ret;
122 122
123 ret = __get_user(ptr, &t->rseq->rseq_cs); 123 if (copy_from_user(&ptr, &t->rseq->rseq_cs.ptr64, sizeof(ptr)))
124 if (ret) 124 return -EFAULT;
125 return ret;
126 if (!ptr) { 125 if (!ptr) {
127 memset(rseq_cs, 0, sizeof(*rseq_cs)); 126 memset(rseq_cs, 0, sizeof(*rseq_cs));
128 return 0; 127 return 0;
129 } 128 }
130 urseq_cs = (struct rseq_cs __user *)ptr; 129 if (ptr >= TASK_SIZE)
130 return -EINVAL;
131 urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr;
131 if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) 132 if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
132 return -EFAULT; 133 return -EFAULT;
133 if (rseq_cs->version > 0)
134 return -EINVAL;
135 134
135 if (rseq_cs->start_ip >= TASK_SIZE ||
136 rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE ||
137 rseq_cs->abort_ip >= TASK_SIZE ||
138 rseq_cs->version > 0)
139 return -EINVAL;
140 /* Check for overflow. */
141 if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip)
142 return -EINVAL;
136 /* Ensure that abort_ip is not in the critical section. */ 143 /* Ensure that abort_ip is not in the critical section. */
137 if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) 144 if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset)
138 return -EINVAL; 145 return -EINVAL;
139 146
140 usig = (u32 __user *)(rseq_cs->abort_ip - sizeof(u32)); 147 usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32));
141 ret = get_user(sig, usig); 148 ret = get_user(sig, usig);
142 if (ret) 149 if (ret)
143 return ret; 150 return ret;
@@ -146,7 +153,7 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
146 printk_ratelimited(KERN_WARNING 153 printk_ratelimited(KERN_WARNING
147 "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", 154 "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n",
148 sig, current->rseq_sig, current->pid, usig); 155 sig, current->rseq_sig, current->pid, usig);
149 return -EPERM; 156 return -EINVAL;
150 } 157 }
151 return 0; 158 return 0;
152} 159}
@@ -157,7 +164,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
157 int ret; 164 int ret;
158 165
159 /* Get thread flags. */ 166 /* Get thread flags. */
160 ret = __get_user(flags, &t->rseq->flags); 167 ret = get_user(flags, &t->rseq->flags);
161 if (ret) 168 if (ret)
162 return ret; 169 return ret;
163 170
@@ -195,9 +202,11 @@ static int clear_rseq_cs(struct task_struct *t)
195 * of code outside of the rseq assembly block. This performs 202 * of code outside of the rseq assembly block. This performs
196 * a lazy clear of the rseq_cs field. 203 * a lazy clear of the rseq_cs field.
197 * 204 *
198 * Set rseq_cs to NULL with single-copy atomicity. 205 * Set rseq_cs to NULL.
199 */ 206 */
200 return __put_user(0UL, &t->rseq->rseq_cs); 207 if (clear_user(&t->rseq->rseq_cs.ptr64, sizeof(t->rseq->rseq_cs.ptr64)))
208 return -EFAULT;
209 return 0;
201} 210}
202 211
203/* 212/*
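
The rseq_get_rseq_cs() hunk above reads the rseq_cs pointer as a u64 and rejects descriptors whose addresses fall outside the user address range, whose start/offset wraps around, or whose abort handler lies inside the critical section. A simplified userspace model of those sanity checks follows; TASK_SIZE_MODEL and the struct layout are stand-ins, not the kernel definitions.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TASK_SIZE_MODEL (1ULL << 47)   /* illustrative user address limit */

struct rseq_cs_model {
	uint32_t version;
	uint32_t flags;
	uint64_t start_ip;
	uint64_t post_commit_offset;
	uint64_t abort_ip;
};

static bool rseq_cs_is_valid(const struct rseq_cs_model *cs)
{
	if (cs->version > 0)
		return false;
	/* All code addresses must lie below the user address limit. */
	if (cs->start_ip >= TASK_SIZE_MODEL ||
	    cs->start_ip + cs->post_commit_offset >= TASK_SIZE_MODEL ||
	    cs->abort_ip >= TASK_SIZE_MODEL)
		return false;
	/* start_ip + post_commit_offset must not wrap around. */
	if (cs->start_ip + cs->post_commit_offset < cs->start_ip)
		return false;
	/* The abort handler must live outside the critical section. */
	if (cs->abort_ip - cs->start_ip < cs->post_commit_offset)
		return false;
	return true;
}

int main(void)
{
	struct rseq_cs_model ok = {
		.start_ip = 0x1000, .post_commit_offset = 0x40, .abort_ip = 0x2000,
	};
	struct rseq_cs_model bad = {
		.start_ip = 0x1000, .post_commit_offset = 0x40, .abort_ip = 0x1010,
	};

	printf("ok:  %d\n", rseq_cs_is_valid(&ok));   /* 1 */
	printf("bad: %d\n", rseq_cs_is_valid(&bad));  /* 0: abort_ip inside section */
	return 0;
}
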
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 78d8facba456..fe365c9a08e9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7,7 +7,6 @@
7 */ 7 */
8#include "sched.h" 8#include "sched.h"
9 9
10#include <linux/kthread.h>
11#include <linux/nospec.h> 10#include <linux/nospec.h>
12 11
13#include <linux/kcov.h> 12#include <linux/kcov.h>
@@ -2724,28 +2723,20 @@ static struct rq *finish_task_switch(struct task_struct *prev)
2724 membarrier_mm_sync_core_before_usermode(mm); 2723 membarrier_mm_sync_core_before_usermode(mm);
2725 mmdrop(mm); 2724 mmdrop(mm);
2726 } 2725 }
2727 if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) { 2726 if (unlikely(prev_state == TASK_DEAD)) {
2728 switch (prev_state) { 2727 if (prev->sched_class->task_dead)
2729 case TASK_DEAD: 2728 prev->sched_class->task_dead(prev);
2730 if (prev->sched_class->task_dead)
2731 prev->sched_class->task_dead(prev);
2732 2729
2733 /* 2730 /*
2734 * Remove function-return probe instances associated with this 2731 * Remove function-return probe instances associated with this
2735 * task and put them back on the free list. 2732 * task and put them back on the free list.
2736 */ 2733 */
2737 kprobe_flush_task(prev); 2734 kprobe_flush_task(prev);
2738
2739 /* Task is done with its stack. */
2740 put_task_stack(prev);
2741 2735
2742 put_task_struct(prev); 2736 /* Task is done with its stack. */
2743 break; 2737 put_task_stack(prev);
2744 2738
2745 case TASK_PARKED: 2739 put_task_struct(prev);
2746 kthread_park_complete(prev);
2747 break;
2748 }
2749 } 2740 }
2750 2741
2751 tick_nohz_task_switch(); 2742 tick_nohz_task_switch();
@@ -3113,7 +3104,9 @@ static void sched_tick_remote(struct work_struct *work)
3113 struct tick_work *twork = container_of(dwork, struct tick_work, work); 3104 struct tick_work *twork = container_of(dwork, struct tick_work, work);
3114 int cpu = twork->cpu; 3105 int cpu = twork->cpu;
3115 struct rq *rq = cpu_rq(cpu); 3106 struct rq *rq = cpu_rq(cpu);
3107 struct task_struct *curr;
3116 struct rq_flags rf; 3108 struct rq_flags rf;
3109 u64 delta;
3117 3110
3118 /* 3111 /*
3119 * Handle the tick only if it appears the remote CPU is running in full 3112 * Handle the tick only if it appears the remote CPU is running in full
@@ -3122,24 +3115,28 @@ static void sched_tick_remote(struct work_struct *work)
3122 * statistics and checks timeslices in a time-independent way, regardless 3115 * statistics and checks timeslices in a time-independent way, regardless
3123 * of when exactly it is running. 3116 * of when exactly it is running.
3124 */ 3117 */
3125 if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) { 3118 if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
3126 struct task_struct *curr; 3119 goto out_requeue;
3127 u64 delta;
3128 3120
3129 rq_lock_irq(rq, &rf); 3121 rq_lock_irq(rq, &rf);
3130 update_rq_clock(rq); 3122 curr = rq->curr;
3131 curr = rq->curr; 3123 if (is_idle_task(curr))
3132 delta = rq_clock_task(rq) - curr->se.exec_start; 3124 goto out_unlock;
3133 3125
3134 /* 3126 update_rq_clock(rq);
3135 * Make sure the next tick runs within a reasonable 3127 delta = rq_clock_task(rq) - curr->se.exec_start;
3136 * amount of time. 3128
3137 */ 3129 /*
3138 WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); 3130 * Make sure the next tick runs within a reasonable
3139 curr->sched_class->task_tick(rq, curr, 0); 3131 * amount of time.
3140 rq_unlock_irq(rq, &rf); 3132 */
3141 } 3133 WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
3134 curr->sched_class->task_tick(rq, curr, 0);
3135
3136out_unlock:
3137 rq_unlock_irq(rq, &rf);
3142 3138
3139out_requeue:
3143 /* 3140 /*
3144 * Run the remote tick once per second (1Hz). This arbitrary 3141 * Run the remote tick once per second (1Hz). This arbitrary
3145 * frequency is large enough to avoid overload but short enough 3142 * frequency is large enough to avoid overload but short enough
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 3cde46483f0a..c907fde01eaa 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -192,7 +192,7 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
192{ 192{
193 struct rq *rq = cpu_rq(sg_cpu->cpu); 193 struct rq *rq = cpu_rq(sg_cpu->cpu);
194 194
195 if (rq->rt.rt_nr_running) 195 if (rt_rq_is_runnable(&rq->rt))
196 return sg_cpu->max; 196 return sg_cpu->max;
197 197
198 /* 198 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1866e64792a7..2f0a0be4d344 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3982,18 +3982,10 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
3982 if (!sched_feat(UTIL_EST)) 3982 if (!sched_feat(UTIL_EST))
3983 return; 3983 return;
3984 3984
3985 /* 3985 /* Update root cfs_rq's estimated utilization */
3986 * Update root cfs_rq's estimated utilization 3986 ue.enqueued = cfs_rq->avg.util_est.enqueued;
3987 * 3987 ue.enqueued -= min_t(unsigned int, ue.enqueued,
3988 * If *p is the last task then the root cfs_rq's estimated utilization 3988 (_task_util_est(p) | UTIL_AVG_UNCHANGED));
3989 * of a CPU is 0 by definition.
3990 */
3991 ue.enqueued = 0;
3992 if (cfs_rq->nr_running) {
3993 ue.enqueued = cfs_rq->avg.util_est.enqueued;
3994 ue.enqueued -= min_t(unsigned int, ue.enqueued,
3995 (_task_util_est(p) | UTIL_AVG_UNCHANGED));
3996 }
3997 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); 3989 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
3998 3990
3999 /* 3991 /*
@@ -4590,6 +4582,7 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
4590 now = sched_clock_cpu(smp_processor_id()); 4582 now = sched_clock_cpu(smp_processor_id());
4591 cfs_b->runtime = cfs_b->quota; 4583 cfs_b->runtime = cfs_b->quota;
4592 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); 4584 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
4585 cfs_b->expires_seq++;
4593} 4586}
4594 4587
4595static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) 4588static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4612,6 +4605,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4612 struct task_group *tg = cfs_rq->tg; 4605 struct task_group *tg = cfs_rq->tg;
4613 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 4606 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
4614 u64 amount = 0, min_amount, expires; 4607 u64 amount = 0, min_amount, expires;
4608 int expires_seq;
4615 4609
4616 /* note: this is a positive sum as runtime_remaining <= 0 */ 4610 /* note: this is a positive sum as runtime_remaining <= 0 */
4617 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; 4611 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -4628,6 +4622,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4628 cfs_b->idle = 0; 4622 cfs_b->idle = 0;
4629 } 4623 }
4630 } 4624 }
4625 expires_seq = cfs_b->expires_seq;
4631 expires = cfs_b->runtime_expires; 4626 expires = cfs_b->runtime_expires;
4632 raw_spin_unlock(&cfs_b->lock); 4627 raw_spin_unlock(&cfs_b->lock);
4633 4628
@@ -4637,8 +4632,10 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4637 * spread between our sched_clock and the one on which runtime was 4632 * spread between our sched_clock and the one on which runtime was
4638 * issued. 4633 * issued.
4639 */ 4634 */
4640 if ((s64)(expires - cfs_rq->runtime_expires) > 0) 4635 if (cfs_rq->expires_seq != expires_seq) {
4636 cfs_rq->expires_seq = expires_seq;
4641 cfs_rq->runtime_expires = expires; 4637 cfs_rq->runtime_expires = expires;
4638 }
4642 4639
4643 return cfs_rq->runtime_remaining > 0; 4640 return cfs_rq->runtime_remaining > 0;
4644} 4641}
@@ -4664,12 +4661,9 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4664 * has not truly expired. 4661 * has not truly expired.
4665 * 4662 *
4666	 * Fortunately we can determine whether this is the case by checking	4663	 * Fortunately we can determine whether this is the case by checking
4667	 * whether the global deadline has advanced. It is valid to compare	4664	 * whether the global deadline (cfs_b->expires_seq) has advanced.
4668 * cfs_b->runtime_expires without any locks since we only care about
4669 * exact equality, so a partial write will still work.
4670 */ 4665 */
4671 4666 if (cfs_rq->expires_seq == cfs_b->expires_seq) {
4672 if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
4673 /* extend local deadline, drift is bounded above by 2 ticks */ 4667 /* extend local deadline, drift is bounded above by 2 ticks */
4674 cfs_rq->runtime_expires += TICK_NSEC; 4668 cfs_rq->runtime_expires += TICK_NSEC;
4675 } else { 4669 } else {
@@ -5202,13 +5196,18 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5202 5196
5203void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) 5197void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5204{ 5198{
5199 u64 overrun;
5200
5205 lockdep_assert_held(&cfs_b->lock); 5201 lockdep_assert_held(&cfs_b->lock);
5206 5202
5207 if (!cfs_b->period_active) { 5203 if (cfs_b->period_active)
5208 cfs_b->period_active = 1; 5204 return;
5209 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); 5205
5210 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); 5206 cfs_b->period_active = 1;
5211 } 5207 overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
5208 cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
5209 cfs_b->expires_seq++;
5210 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
5212} 5211}
5213 5212
5214static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) 5213static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
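
The fair.c hunks above replace the unlocked comparison of 64-bit runtime_expires timestamps with an expires_seq counter that is bumped on every global refill, so a local runqueue can tell a real deadline advance apart from clock drift. A rough userspace model of that idea follows; the field names mirror the patch, everything else (time values, slice size) is illustrative.

#include <stdint.h>
#include <stdio.h>

struct global_bw {
	int      expires_seq;
	uint64_t runtime_expires;
	uint64_t period_ns;
};

struct local_rq {
	int      expires_seq;
	uint64_t runtime_expires;
	int64_t  runtime_remaining;
};

/* Global refill: new deadline, new sequence number. */
static void refill(struct global_bw *g, uint64_t now)
{
	g->runtime_expires = now + g->period_ns;
	g->expires_seq++;
}

/* A local runqueue picks up the current deadline only when the seq changed. */
static void assign_runtime(struct local_rq *l, const struct global_bw *g)
{
	if (l->expires_seq != g->expires_seq) {
		l->expires_seq = g->expires_seq;
		l->runtime_expires = g->runtime_expires;
	}
	l->runtime_remaining = 5000000;        /* illustrative slice: 5 ms */
}

/*
 * Expiry check: equal sequence numbers mean the global deadline has not
 * advanced, so an apparent expiry is only drift between clocks.
 */
static void expire_runtime(struct local_rq *l, const struct global_bw *g,
			   uint64_t tick_ns)
{
	if (l->expires_seq == g->expires_seq)
		l->runtime_expires += tick_ns;     /* extend, drift is bounded */
	else
		l->runtime_remaining = 0;          /* truly expired, re-assign */
}

int main(void)
{
	struct global_bw g = { .period_ns = 100000000 };   /* 100 ms period */
	struct local_rq l = { 0 };

	refill(&g, 0);
	assign_runtime(&l, &g);
	expire_runtime(&l, &g, 1000000);       /* same seq: just drift */
	printf("after drift:  remaining=%lld\n", (long long)l.runtime_remaining);

	refill(&g, 100000000);                 /* new period, seq bumped */
	expire_runtime(&l, &g, 1000000);       /* seq differs: really expired */
	printf("after refill: remaining=%lld\n", (long long)l.runtime_remaining);
	return 0;
}
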
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 47556b0c9a95..572567078b60 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -508,8 +508,11 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
508 508
509 rt_se = rt_rq->tg->rt_se[cpu]; 509 rt_se = rt_rq->tg->rt_se[cpu];
510 510
511 if (!rt_se) 511 if (!rt_se) {
512 dequeue_top_rt_rq(rt_rq); 512 dequeue_top_rt_rq(rt_rq);
513 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
514 cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
515 }
513 else if (on_rt_rq(rt_se)) 516 else if (on_rt_rq(rt_se))
514 dequeue_rt_entity(rt_se, 0); 517 dequeue_rt_entity(rt_se, 0);
515} 518}
@@ -1001,8 +1004,6 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq)
1001 sub_nr_running(rq, rt_rq->rt_nr_running); 1004 sub_nr_running(rq, rt_rq->rt_nr_running);
1002 rt_rq->rt_queued = 0; 1005 rt_rq->rt_queued = 0;
1003 1006
1004 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
1005 cpufreq_update_util(rq, 0);
1006} 1007}
1007 1008
1008static void 1009static void
@@ -1014,11 +1015,14 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq)
1014 1015
1015 if (rt_rq->rt_queued) 1016 if (rt_rq->rt_queued)
1016 return; 1017 return;
1017 if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running) 1018
1019 if (rt_rq_throttled(rt_rq))
1018 return; 1020 return;
1019 1021
1020 add_nr_running(rq, rt_rq->rt_nr_running); 1022 if (rt_rq->rt_nr_running) {
1021 rt_rq->rt_queued = 1; 1023 add_nr_running(rq, rt_rq->rt_nr_running);
1024 rt_rq->rt_queued = 1;
1025 }
1022 1026
1023 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ 1027 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
1024 cpufreq_update_util(rq, 0); 1028 cpufreq_update_util(rq, 0);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6601baf2361c..c7742dcc136c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -334,9 +334,10 @@ struct cfs_bandwidth {
334 u64 runtime; 334 u64 runtime;
335 s64 hierarchical_quota; 335 s64 hierarchical_quota;
336 u64 runtime_expires; 336 u64 runtime_expires;
337 int expires_seq;
337 338
338 int idle; 339 short idle;
339 int period_active; 340 short period_active;
340 struct hrtimer period_timer; 341 struct hrtimer period_timer;
341 struct hrtimer slack_timer; 342 struct hrtimer slack_timer;
342 struct list_head throttled_cfs_rq; 343 struct list_head throttled_cfs_rq;
@@ -551,6 +552,7 @@ struct cfs_rq {
551 552
552#ifdef CONFIG_CFS_BANDWIDTH 553#ifdef CONFIG_CFS_BANDWIDTH
553 int runtime_enabled; 554 int runtime_enabled;
555 int expires_seq;
554 u64 runtime_expires; 556 u64 runtime_expires;
555 s64 runtime_remaining; 557 s64 runtime_remaining;
556 558
@@ -609,6 +611,11 @@ struct rt_rq {
609#endif 611#endif
610}; 612};
611 613
614static inline bool rt_rq_is_runnable(struct rt_rq *rt_rq)
615{
616 return rt_rq->rt_queued && rt_rq->rt_nr_running;
617}
618
612/* Deadline class' related fields in a runqueue */ 619/* Deadline class' related fields in a runqueue */
613struct dl_rq { 620struct dl_rq {
614 /* runqueue is an rbtree, ordered by deadline */ 621 /* runqueue is an rbtree, ordered by deadline */
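
The schedutil and rt.c/sched.h hunks above make the governor request the maximum frequency only while the root rt_rq is actually runnable (queued and carrying runnable tasks), rather than whenever rt_nr_running happens to be non-zero. A small userspace sketch of that condition, with made-up utilization numbers; the struct and helper here only model the patch, they are not the scheduler types.

#include <stdbool.h>
#include <stdio.h>

struct rt_rq_model {
	bool rt_queued;              /* root rt_rq is enqueued on the CPU */
	unsigned int rt_nr_running;  /* number of runnable RT tasks */
};

static bool rt_rq_is_runnable(const struct rt_rq_model *rt_rq)
{
	return rt_rq->rt_queued && rt_rq->rt_nr_running;
}

/* Governor side: only pin the frequency to max for real RT pressure. */
static unsigned long aggregate_util(const struct rt_rq_model *rt,
				    unsigned long cfs_util, unsigned long max)
{
	return rt_rq_is_runnable(rt) ? max : cfs_util;
}

int main(void)
{
	struct rt_rq_model throttled = { .rt_queued = false, .rt_nr_running = 2 };
	struct rt_rq_model active    = { .rt_queued = true,  .rt_nr_running = 1 };

	printf("%lu\n", aggregate_util(&throttled, 300, 1024)); /* 300: CFS util */
	printf("%lu\n", aggregate_util(&active,    300, 1024)); /* 1024: go to max */
	return 0;
}
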
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 900dcfee542c..75ffc1d1a2e0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -79,12 +79,16 @@ static void wakeup_softirqd(void)
79 79
80/* 80/*
81 * If ksoftirqd is scheduled, we do not want to process pending softirqs 81 * If ksoftirqd is scheduled, we do not want to process pending softirqs
82 * right now. Let ksoftirqd handle this at its own rate, to get fairness. 82 * right now. Let ksoftirqd handle this at its own rate, to get fairness,
83 * unless we're doing some of the synchronous softirqs.
83 */ 84 */
84static bool ksoftirqd_running(void) 85#define SOFTIRQ_NOW_MASK ((1 << HI_SOFTIRQ) | (1 << TASKLET_SOFTIRQ))
86static bool ksoftirqd_running(unsigned long pending)
85{ 87{
86 struct task_struct *tsk = __this_cpu_read(ksoftirqd); 88 struct task_struct *tsk = __this_cpu_read(ksoftirqd);
87 89
90 if (pending & SOFTIRQ_NOW_MASK)
91 return false;
88 return tsk && (tsk->state == TASK_RUNNING); 92 return tsk && (tsk->state == TASK_RUNNING);
89} 93}
90 94
@@ -328,7 +332,7 @@ asmlinkage __visible void do_softirq(void)
328 332
329 pending = local_softirq_pending(); 333 pending = local_softirq_pending();
330 334
331 if (pending && !ksoftirqd_running()) 335 if (pending && !ksoftirqd_running(pending))
332 do_softirq_own_stack(); 336 do_softirq_own_stack();
333 337
334 local_irq_restore(flags); 338 local_irq_restore(flags);
@@ -355,7 +359,7 @@ void irq_enter(void)
355 359
356static inline void invoke_softirq(void) 360static inline void invoke_softirq(void)
357{ 361{
358 if (ksoftirqd_running()) 362 if (ksoftirqd_running(local_softirq_pending()))
359 return; 363 return;
360 364
361 if (!force_irqthreads) { 365 if (!force_irqthreads) {
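
The softirq.c hunk above lets HI and TASKLET softirqs be handled inline even while ksoftirqd is runnable, by masking them out of the deferral decision. A compact userspace model of that gate; the softirq bit numbers follow the kernel enum, the rest is illustrative.

#include <stdbool.h>
#include <stdio.h>

#define HI_SOFTIRQ       0
#define NET_RX_SOFTIRQ   3
#define TASKLET_SOFTIRQ  6

#define SOFTIRQ_NOW_MASK ((1u << HI_SOFTIRQ) | (1u << TASKLET_SOFTIRQ))

static bool defer_to_ksoftirqd(unsigned int pending, bool ksoftirqd_runnable)
{
	/* Latency-sensitive softirqs (HI, TASKLET) are handled inline. */
	if (pending & SOFTIRQ_NOW_MASK)
		return false;
	return ksoftirqd_runnable;
}

int main(void)
{
	printf("%d\n", defer_to_ksoftirqd(1u << NET_RX_SOFTIRQ, true));   /* 1: defer */
	printf("%d\n", defer_to_ksoftirqd(1u << TASKLET_SOFTIRQ, true));  /* 0: run now */
	return 0;
}
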
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index b7005dd21ec1..14de3727b18e 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -277,8 +277,7 @@ static bool tick_check_preferred(struct clock_event_device *curdev,
277 */ 277 */
278 return !curdev || 278 return !curdev ||
279 newdev->rating > curdev->rating || 279 newdev->rating > curdev->rating ||
280 (!cpumask_equal(curdev->cpumask, newdev->cpumask) && 280 !cpumask_equal(curdev->cpumask, newdev->cpumask);
281 !tick_check_percpu(curdev, newdev, smp_processor_id()));
282} 281}
283 282
284/* 283/*
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index efed9c1cfb7e..caf9cbf35816 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -192,17 +192,6 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
192 op->saved_func(ip, parent_ip, op, regs); 192 op->saved_func(ip, parent_ip, op, regs);
193} 193}
194 194
195/**
196 * clear_ftrace_function - reset the ftrace function
197 *
198 * This NULLs the ftrace function and in essence stops
199 * tracing. There may be lag
200 */
201void clear_ftrace_function(void)
202{
203 ftrace_trace_function = ftrace_stub;
204}
205
206static void ftrace_sync(struct work_struct *work) 195static void ftrace_sync(struct work_struct *work)
207{ 196{
208 /* 197 /*
@@ -6689,7 +6678,7 @@ void ftrace_kill(void)
6689{ 6678{
6690 ftrace_disabled = 1; 6679 ftrace_disabled = 1;
6691 ftrace_enabled = 0; 6680 ftrace_enabled = 0;
6692 clear_ftrace_function(); 6681 ftrace_trace_function = ftrace_stub;
6693} 6682}
6694 6683
6695/** 6684/**
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a0079b4c7a49..87cf25171fb8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2953,6 +2953,7 @@ out_nobuffer:
2953} 2953}
2954EXPORT_SYMBOL_GPL(trace_vbprintk); 2954EXPORT_SYMBOL_GPL(trace_vbprintk);
2955 2955
2956__printf(3, 0)
2956static int 2957static int
2957__trace_array_vprintk(struct ring_buffer *buffer, 2958__trace_array_vprintk(struct ring_buffer *buffer,
2958 unsigned long ip, const char *fmt, va_list args) 2959 unsigned long ip, const char *fmt, va_list args)
@@ -3007,12 +3008,14 @@ out_nobuffer:
3007 return len; 3008 return len;
3008} 3009}
3009 3010
3011__printf(3, 0)
3010int trace_array_vprintk(struct trace_array *tr, 3012int trace_array_vprintk(struct trace_array *tr,
3011 unsigned long ip, const char *fmt, va_list args) 3013 unsigned long ip, const char *fmt, va_list args)
3012{ 3014{
3013 return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args); 3015 return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args);
3014} 3016}
3015 3017
3018__printf(3, 0)
3016int trace_array_printk(struct trace_array *tr, 3019int trace_array_printk(struct trace_array *tr,
3017 unsigned long ip, const char *fmt, ...) 3020 unsigned long ip, const char *fmt, ...)
3018{ 3021{
@@ -3028,6 +3031,7 @@ int trace_array_printk(struct trace_array *tr,
3028 return ret; 3031 return ret;
3029} 3032}
3030 3033
3034__printf(3, 4)
3031int trace_array_printk_buf(struct ring_buffer *buffer, 3035int trace_array_printk_buf(struct ring_buffer *buffer,
3032 unsigned long ip, const char *fmt, ...) 3036 unsigned long ip, const char *fmt, ...)
3033{ 3037{
@@ -3043,6 +3047,7 @@ int trace_array_printk_buf(struct ring_buffer *buffer,
3043 return ret; 3047 return ret;
3044} 3048}
3045 3049
3050__printf(2, 0)
3046int trace_vprintk(unsigned long ip, const char *fmt, va_list args) 3051int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
3047{ 3052{
3048 return trace_array_vprintk(&global_trace, ip, fmt, args); 3053 return trace_array_vprintk(&global_trace, ip, fmt, args);
@@ -3360,8 +3365,8 @@ static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m,
3360 3365
3361 print_event_info(buf, m); 3366 print_event_info(buf, m);
3362 3367
3363 seq_printf(m, "# TASK-PID CPU# %s TIMESTAMP FUNCTION\n", tgid ? "TGID " : ""); 3368 seq_printf(m, "# TASK-PID %s CPU# TIMESTAMP FUNCTION\n", tgid ? "TGID " : "");
3364 seq_printf(m, "# | | | %s | |\n", tgid ? " | " : ""); 3369 seq_printf(m, "# | | %s | | |\n", tgid ? " | " : "");
3365} 3370}
3366 3371
3367static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m, 3372static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m,
@@ -3381,9 +3386,9 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
3381 tgid ? tgid_space : space); 3386 tgid ? tgid_space : space);
3382 seq_printf(m, "# %s||| / delay\n", 3387 seq_printf(m, "# %s||| / delay\n",
3383 tgid ? tgid_space : space); 3388 tgid ? tgid_space : space);
3384 seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n", 3389 seq_printf(m, "# TASK-PID %sCPU# |||| TIMESTAMP FUNCTION\n",
3385 tgid ? " TGID " : space); 3390 tgid ? " TGID " : space);
3386 seq_printf(m, "# | | | %s|||| | |\n", 3391 seq_printf(m, "# | | %s | |||| | |\n",
3387 tgid ? " | " : space); 3392 tgid ? " | " : space);
3388} 3393}
3389 3394
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 630c5a24b2b2..f8f86231ad90 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -583,9 +583,7 @@ static __always_inline void trace_clear_recursion(int bit)
583static inline struct ring_buffer_iter * 583static inline struct ring_buffer_iter *
584trace_buffer_iter(struct trace_iterator *iter, int cpu) 584trace_buffer_iter(struct trace_iterator *iter, int cpu)
585{ 585{
586 if (iter->buffer_iter && iter->buffer_iter[cpu]) 586 return iter->buffer_iter ? iter->buffer_iter[cpu] : NULL;
587 return iter->buffer_iter[cpu];
588 return NULL;
589} 587}
590 588
591int tracer_init(struct tracer *t, struct trace_array *tr); 589int tracer_init(struct tracer *t, struct trace_array *tr);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 0dceb77d1d42..893a206bcba4 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1701,6 +1701,7 @@ static void create_filter_finish(struct filter_parse_error *pe)
1701 * @filter_str: filter string 1701 * @filter_str: filter string
1702 * @set_str: remember @filter_str and enable detailed error in filter 1702 * @set_str: remember @filter_str and enable detailed error in filter
1703 * @filterp: out param for created filter (always updated on return) 1703 * @filterp: out param for created filter (always updated on return)
1704 * Must be a pointer that references a NULL pointer.
1704 * 1705 *
1705 * Creates a filter for @call with @filter_str. If @set_str is %true, 1706 * Creates a filter for @call with @filter_str. If @set_str is %true,
1706 * @filter_str is copied and recorded in the new filter. 1707 * @filter_str is copied and recorded in the new filter.
@@ -1718,6 +1719,10 @@ static int create_filter(struct trace_event_call *call,
1718 struct filter_parse_error *pe = NULL; 1719 struct filter_parse_error *pe = NULL;
1719 int err; 1720 int err;
1720 1721
1722 /* filterp must point to NULL */
1723 if (WARN_ON(*filterp))
1724 *filterp = NULL;
1725
1721 err = create_filter_start(filter_string, set_str, &pe, filterp); 1726 err = create_filter_start(filter_string, set_str, &pe, filterp);
1722 if (err) 1727 if (err)
1723 return err; 1728 return err;
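
The create_filter() hunk above documents and enforces that @filterp must reference a NULL pointer on entry, since the out parameter is updated on return. A tiny userspace sketch of that calling convention follows; it models the out-parameter contract only, not the tracing filter API.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct filter {
	char *filter_string;
};

static int create_filter(const char *str, struct filter **filterp)
{
	/* Mirror of the WARN_ON(*filterp) guard: insist on a NULL slot. */
	assert(*filterp == NULL);

	struct filter *f = calloc(1, sizeof(*f));
	if (!f)
		return -1;
	f->filter_string = strdup(str);
	if (!f->filter_string) {
		free(f);
		return -1;
	}
	*filterp = f;                  /* hand the result back on success */
	return 0;
}

int main(void)
{
	struct filter *filter = NULL;  /* caller must start from NULL */

	if (!create_filter("pid == 42", &filter))
		printf("created: %s\n", filter->filter_string);

	free(filter->filter_string);
	free(filter);
	return 0;
}
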
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 046c716a6536..aae18af94c94 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -393,7 +393,7 @@ static void hist_err_event(char *str, char *system, char *event, char *var)
393 else if (system) 393 else if (system)
394 snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event); 394 snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event);
395 else 395 else
396 strncpy(err, var, MAX_FILTER_STR_VAL); 396 strscpy(err, var, MAX_FILTER_STR_VAL);
397 397
398 hist_err(str, err); 398 hist_err(str, err);
399} 399}
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 23c0b0cb5fb9..169b3c44ee97 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -831,6 +831,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
831 struct ftrace_graph_ret *graph_ret; 831 struct ftrace_graph_ret *graph_ret;
832 struct ftrace_graph_ent *call; 832 struct ftrace_graph_ent *call;
833 unsigned long long duration; 833 unsigned long long duration;
834 int cpu = iter->cpu;
834 int i; 835 int i;
835 836
836 graph_ret = &ret_entry->ret; 837 graph_ret = &ret_entry->ret;
@@ -839,7 +840,6 @@ print_graph_entry_leaf(struct trace_iterator *iter,
839 840
840 if (data) { 841 if (data) {
841 struct fgraph_cpu_data *cpu_data; 842 struct fgraph_cpu_data *cpu_data;
842 int cpu = iter->cpu;
843 843
844 cpu_data = per_cpu_ptr(data->cpu_data, cpu); 844 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
845 845
@@ -869,6 +869,9 @@ print_graph_entry_leaf(struct trace_iterator *iter,
869 869
870 trace_seq_printf(s, "%ps();\n", (void *)call->func); 870 trace_seq_printf(s, "%ps();\n", (void *)call->func);
871 871
872 print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET,
873 cpu, iter->ent->pid, flags);
874
872 return trace_handle_return(s); 875 return trace_handle_return(s);
873} 876}
874 877
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index daa81571b22a..21f718472942 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1480,8 +1480,10 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
1480 } 1480 }
1481 1481
1482 ret = __register_trace_kprobe(tk); 1482 ret = __register_trace_kprobe(tk);
1483 if (ret < 0) 1483 if (ret < 0) {
1484 kfree(tk->tp.call.print_fmt);
1484 goto error; 1485 goto error;
1486 }
1485 1487
1486 return &tk->tp.call; 1488 return &tk->tp.call;
1487error: 1489error:
@@ -1501,6 +1503,8 @@ void destroy_local_trace_kprobe(struct trace_event_call *event_call)
1501 } 1503 }
1502 1504
1503 __unregister_trace_kprobe(tk); 1505 __unregister_trace_kprobe(tk);
1506
1507 kfree(tk->tp.call.print_fmt);
1504 free_trace_kprobe(tk); 1508 free_trace_kprobe(tk);
1505} 1509}
1506#endif /* CONFIG_PERF_EVENTS */ 1510#endif /* CONFIG_PERF_EVENTS */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 90db994ac900..1c8e30fda46a 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -594,8 +594,7 @@ int trace_print_context(struct trace_iterator *iter)
594 594
595 trace_find_cmdline(entry->pid, comm); 595 trace_find_cmdline(entry->pid, comm);
596 596
597 trace_seq_printf(s, "%16s-%-5d [%03d] ", 597 trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
598 comm, entry->pid, iter->cpu);
599 598
600 if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { 599 if (tr->trace_flags & TRACE_ITER_RECORD_TGID) {
601 unsigned int tgid = trace_find_tgid(entry->pid); 600 unsigned int tgid = trace_find_tgid(entry->pid);
@@ -606,6 +605,8 @@ int trace_print_context(struct trace_iterator *iter)
606 trace_seq_printf(s, "(%5d) ", tgid); 605 trace_seq_printf(s, "(%5d) ", tgid);
607 } 606 }
608 607
608 trace_seq_printf(s, "[%03d] ", iter->cpu);
609
609 if (tr->trace_flags & TRACE_ITER_IRQ_INFO) 610 if (tr->trace_flags & TRACE_ITER_IRQ_INFO)
610 trace_print_lat_fmt(s, entry); 611 trace_print_lat_fmt(s, entry);
611 612