Diffstat (limited to 'kernel')

 kernel/bpf/btf.c                     |  30
 kernel/bpf/cgroup.c                  |  54
 kernel/bpf/core.c                    |  30
 kernel/bpf/devmap.c                  |   7
 kernel/bpf/hashtab.c                 |  16
 kernel/bpf/sockmap.c                 | 297
 kernel/bpf/syscall.c                 | 103
 kernel/bpf/verifier.c                |  11
 kernel/fork.c                        |  35
 kernel/kthread.c                     |  30
 kernel/rseq.c                        |  41
 kernel/sched/core.c                  |  67
 kernel/sched/cpufreq_schedutil.c     |   2
 kernel/sched/fair.c                  |  45
 kernel/sched/rt.c                    |  16
 kernel/sched/sched.h                 |  11
 kernel/softirq.c                     |  12
 kernel/time/tick-common.c            |   3
 kernel/trace/ftrace.c                |  13
 kernel/trace/trace.c                 |  13
 kernel/trace/trace.h                 |   4
 kernel/trace/trace_events_filter.c   |   5
 kernel/trace/trace_events_hist.c     |   2
 kernel/trace/trace_functions_graph.c |   5
 kernel/trace/trace_kprobe.c          |   6
 kernel/trace/trace_output.c          |   5
 26 files changed, 520 insertions(+), 343 deletions(-)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2d49d18b793a..e016ac3afa24 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -991,16 +991,13 @@ static void btf_int_bits_seq_show(const struct btf *btf,
 				  void *data, u8 bits_offset,
 				  struct seq_file *m)
 {
+	u16 left_shift_bits, right_shift_bits;
 	u32 int_data = btf_type_int(t);
 	u16 nr_bits = BTF_INT_BITS(int_data);
 	u16 total_bits_offset;
 	u16 nr_copy_bytes;
 	u16 nr_copy_bits;
-	u8 nr_upper_bits;
-	union {
-		u64 u64_num;
-		u8 u8_nums[8];
-	} print_num;
+	u64 print_num;
 
 	total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
 	data += BITS_ROUNDDOWN_BYTES(total_bits_offset);
@@ -1008,21 +1005,20 @@ static void btf_int_bits_seq_show(const struct btf *btf,
 	nr_copy_bits = nr_bits + bits_offset;
 	nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits);
 
-	print_num.u64_num = 0;
-	memcpy(&print_num.u64_num, data, nr_copy_bytes);
+	print_num = 0;
+	memcpy(&print_num, data, nr_copy_bytes);
 
-	/* Ditch the higher order bits */
-	nr_upper_bits = BITS_PER_BYTE_MASKED(nr_copy_bits);
-	if (nr_upper_bits) {
-		/* We need to mask out some bits of the upper byte. */
-		u8 mask = (1 << nr_upper_bits) - 1;
+#ifdef __BIG_ENDIAN_BITFIELD
+	left_shift_bits = bits_offset;
+#else
+	left_shift_bits = BITS_PER_U64 - nr_copy_bits;
+#endif
+	right_shift_bits = BITS_PER_U64 - nr_bits;
 
-		print_num.u8_nums[nr_copy_bytes - 1] &= mask;
-	}
-
-	print_num.u64_num >>= bits_offset;
+	print_num <<= left_shift_bits;
+	print_num >>= right_shift_bits;
 
-	seq_printf(m, "0x%llx", print_num.u64_num);
+	seq_printf(m, "0x%llx", print_num);
 }
 
 static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t,
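The btf.c fix above replaces byte-wise masking of the copied bitfield with a left shift that discards the bits above the field and a right shift that discards the bits below it, picking the left-shift count by endianness. A standalone sketch of the little-endian case follows; names are illustrative, not the kernel code, and it assumes nr_bits + bit_off <= 64:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define BITS_PER_U64 64

	static uint64_t extract_bits_le(const void *data, uint8_t bit_off,
					uint8_t nr_bits)
	{
		uint16_t nr_copy_bits = nr_bits + bit_off;
		uint16_t nr_copy_bytes = (nr_copy_bits + 7) / 8;
		uint64_t v = 0;

		memcpy(&v, data, nr_copy_bytes);	/* little-endian host */
		v <<= BITS_PER_U64 - nr_copy_bits;	/* drop bits above field */
		v >>= BITS_PER_U64 - nr_bits;		/* drop bits below field */
		return v;
	}

	int main(void)
	{
		uint8_t buf[1] = { 0xb4 };		/* bits: 1011 0100 */

		/* 6-bit field starting at bit 2: 0b101101, prints 0x2d */
		printf("0x%llx\n",
		       (unsigned long long)extract_bits_le(buf, 2, 6));
		return 0;
	}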
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index f7c00bd6f8e4..3d83ee7df381 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -428,6 +428,60 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 	return ret;
 }
 
+int cgroup_bpf_prog_attach(const union bpf_attr *attr,
+			   enum bpf_prog_type ptype, struct bpf_prog *prog)
+{
+	struct cgroup *cgrp;
+	int ret;
+
+	cgrp = cgroup_get_from_fd(attr->target_fd);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+
+	ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
+				attr->attach_flags);
+	cgroup_put(cgrp);
+	return ret;
+}
+
+int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
+{
+	struct bpf_prog *prog;
+	struct cgroup *cgrp;
+	int ret;
+
+	cgrp = cgroup_get_from_fd(attr->target_fd);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+
+	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+	if (IS_ERR(prog))
+		prog = NULL;
+
+	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
+	if (prog)
+		bpf_prog_put(prog);
+
+	cgroup_put(cgrp);
+	return ret;
+}
+
+int cgroup_bpf_prog_query(const union bpf_attr *attr,
+			  union bpf_attr __user *uattr)
+{
+	struct cgroup *cgrp;
+	int ret;
+
+	cgrp = cgroup_get_from_fd(attr->query.target_fd);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+
+	ret = cgroup_bpf_query(cgrp, attr, uattr);
+
+	cgroup_put(cgrp);
+	return ret;
+}
+
 /**
  * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
  * @sk: The socket sending or receiving traffic
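The three helpers added above share one shape: resolve the cgroup from an fd (errors travel back as ERR_PTR-encoded pointers), perform the operation, drop the reference. A minimal userspace sketch of the pointer-encoded-error idiom they rely on; everything here is illustrative:

	#include <errno.h>
	#include <stdio.h>

	#define MAX_ERRNO	4095

	static inline void *ERR_PTR(long error)
	{
		return (void *)error;
	}

	static inline long PTR_ERR(const void *ptr)
	{
		return (long)ptr;
	}

	static inline int IS_ERR(const void *ptr)
	{
		/* error codes live in the top 4095 values of the address space */
		return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
	}

	static int counter = 42;

	static void *lookup_object(int fd)
	{
		if (fd < 0)
			return ERR_PTR(-EBADF);	/* no object for this fd */
		return &counter;
	}

	int main(void)
	{
		void *obj = lookup_object(-1);

		if (IS_ERR(obj))
			printf("error: %ld\n", PTR_ERR(obj));	/* -9 */
		return 0;
	}

One return slot carries both the object and the error, which is why each caller above only needs a single IS_ERR() test before proceeding.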
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index a9e6c04d0f4a..1e5625d46414 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -598,8 +598,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 	bpf_fill_ill_insns(hdr, size);
 
 	hdr->pages = size / PAGE_SIZE;
-	hdr->locked = 0;
-
 	hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
 		     PAGE_SIZE - sizeof(*hdr));
 	start = (get_random_int() % hole) & ~(alignment - 1);
@@ -1450,22 +1448,6 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
 	return 0;
 }
 
-static int bpf_prog_check_pages_ro_locked(const struct bpf_prog *fp)
-{
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
-	int i, err;
-
-	for (i = 0; i < fp->aux->func_cnt; i++) {
-		err = bpf_prog_check_pages_ro_single(fp->aux->func[i]);
-		if (err)
-			return err;
-	}
-
-	return bpf_prog_check_pages_ro_single(fp);
-#endif
-	return 0;
-}
-
 static void bpf_prog_select_func(struct bpf_prog *fp)
 {
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
@@ -1524,17 +1506,7 @@ finalize:
 	 * all eBPF JITs might immediately support all features.
 	 */
 	*err = bpf_check_tail_call(fp);
-	if (*err)
-		return fp;
-
-	/* Checkpoint: at this point onwards any cBPF -> eBPF or
-	 * native eBPF program is read-only. If we failed to change
-	 * the page attributes (e.g. allocation failure from
-	 * splitting large pages), then reject the whole program
-	 * in order to guarantee not ending up with any W+X pages
-	 * from BPF side in kernel.
-	 */
-	*err = bpf_prog_check_pages_ro_locked(fp);
+
 	return fp;
 }
 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 642c97f6d1b8..d361fc1e3bf3 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -334,10 +334,15 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
 {
 	struct net_device *dev = dst->dev;
 	struct xdp_frame *xdpf;
+	int err;
 
 	if (!dev->netdev_ops->ndo_xdp_xmit)
 		return -EOPNOTSUPP;
 
+	err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
+	if (unlikely(err))
+		return err;
+
 	xdpf = convert_to_xdp_frame(xdp);
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
@@ -350,7 +355,7 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
 {
 	int err;
 
-	err = __xdp_generic_ok_fwd_dev(skb, dst->dev);
+	err = xdp_ok_fwd_dev(dst->dev, skb->len);
 	if (unlikely(err))
 		return err;
 	skb->dev = dst->dev;
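With this change, dev_map_enqueue() validates the frame against the egress device before converting it, using the same xdp_ok_fwd_dev() helper as the generic path. The sketch below is only a plausible shape for such a readiness check (device up, frame fits); the real checks live elsewhere in the networking core and may differ, and all names here are made up:

	#include <errno.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct fake_dev {
		bool		up;
		unsigned int	mtu;
	};

	static int fwd_dev_ok(const struct fake_dev *dev, unsigned int frame_len)
	{
		if (!dev->up)
			return -ENETDOWN;	/* device cannot transmit */
		if (frame_len > dev->mtu)
			return -EMSGSIZE;	/* frame would not fit */
		return 0;
	}

	int main(void)
	{
		struct fake_dev d = { .up = true, .mtu = 1500 };

		printf("%d\n", fwd_dev_ok(&d, 1400));	/* 0 */
		printf("%d\n", fwd_dev_ok(&d, 9000));	/* -90 (-EMSGSIZE) */
		return 0;
	}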
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3ca2198a6d22..513d9dfcf4ee 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -747,13 +747,15 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 			 * old element will be freed immediately.
 			 * Otherwise return an error
 			 */
-			atomic_dec(&htab->count);
-			return ERR_PTR(-E2BIG);
+			l_new = ERR_PTR(-E2BIG);
+			goto dec_count;
 		}
 		l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
 				     htab->map.numa_node);
-		if (!l_new)
-			return ERR_PTR(-ENOMEM);
+		if (!l_new) {
+			l_new = ERR_PTR(-ENOMEM);
+			goto dec_count;
+		}
 	}
 
 	memcpy(l_new->key, key, key_size);
@@ -766,7 +768,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 					    GFP_ATOMIC | __GFP_NOWARN);
 		if (!pptr) {
 			kfree(l_new);
-			return ERR_PTR(-ENOMEM);
+			l_new = ERR_PTR(-ENOMEM);
+			goto dec_count;
 		}
 	}
 
@@ -780,6 +783,9 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 
 	l_new->hash = hash;
 	return l_new;
+dec_count:
+	atomic_dec(&htab->count);
+	return l_new;
 }
 
 static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
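The hashtab.c fix funnels every failure path of alloc_htab_elem() through a single dec_count label, so the element count bumped on entry is always given back exactly once. A minimal userspace sketch of that single-exit pattern, with illustrative names:

	#include <stdio.h>
	#include <stdlib.h>

	static long count;

	static void *alloc_elem(size_t size, size_t limit)
	{
		void *p = NULL;

		count++;			/* reserve a slot up front */
		if (size > limit)
			goto dec_count;		/* over limit: release slot */
		p = malloc(size);
		if (!p)
			goto dec_count;		/* alloc failed: same cleanup */
		return p;			/* success keeps the slot */

	dec_count:
		count--;
		return NULL;
	}

	int main(void)
	{
		void *p = alloc_elem(64, 32);

		printf("p=%p count=%ld\n", p, count);	/* NULL, 0 */
		free(p);
		return 0;
	}

Adding a new error path later only needs a goto, not a reconstructed cleanup sequence — which is exactly the class of bug the original code had.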
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 52a91d816c0e..98fb7938beea 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -72,6 +72,7 @@ struct bpf_htab {
 	u32 n_buckets;
 	u32 elem_size;
 	struct bpf_sock_progs progs;
+	struct rcu_head rcu;
 };
 
 struct htab_elem {
@@ -89,8 +90,8 @@ enum smap_psock_state {
 struct smap_psock_map_entry {
 	struct list_head list;
 	struct sock **entry;
-	struct htab_elem *hash_link;
-	struct bpf_htab *htab;
+	struct htab_elem __rcu *hash_link;
+	struct bpf_htab __rcu *htab;
 };
 
 struct smap_psock {
@@ -120,6 +121,7 @@ struct smap_psock {
 	struct bpf_prog *bpf_parse;
 	struct bpf_prog *bpf_verdict;
 	struct list_head maps;
+	spinlock_t maps_lock;
 
 	/* Back reference used when sock callback trigger sockmap operations */
 	struct sock *sock;
@@ -140,6 +142,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
 static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
 			    int offset, size_t size, int flags);
+static void bpf_tcp_close(struct sock *sk, long timeout);
 
 static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
 {
@@ -161,7 +164,42 @@ out:
 	return !empty;
 }
 
-static struct proto tcp_bpf_proto;
+enum {
+	SOCKMAP_IPV4,
+	SOCKMAP_IPV6,
+	SOCKMAP_NUM_PROTS,
+};
+
+enum {
+	SOCKMAP_BASE,
+	SOCKMAP_TX,
+	SOCKMAP_NUM_CONFIGS,
+};
+
+static struct proto *saved_tcpv6_prot __read_mostly;
+static DEFINE_SPINLOCK(tcpv6_prot_lock);
+static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS];
+static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS],
+			 struct proto *base)
+{
+	prot[SOCKMAP_BASE] = *base;
+	prot[SOCKMAP_BASE].close = bpf_tcp_close;
+	prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg;
+	prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read;
+
+	prot[SOCKMAP_TX] = prot[SOCKMAP_BASE];
+	prot[SOCKMAP_TX].sendmsg = bpf_tcp_sendmsg;
+	prot[SOCKMAP_TX].sendpage = bpf_tcp_sendpage;
+}
+
+static void update_sk_prot(struct sock *sk, struct smap_psock *psock)
+{
+	int family = sk->sk_family == AF_INET6 ? SOCKMAP_IPV6 : SOCKMAP_IPV4;
+	int conf = psock->bpf_tx_msg ? SOCKMAP_TX : SOCKMAP_BASE;
+
+	sk->sk_prot = &bpf_tcp_prots[family][conf];
+}
+
 static int bpf_tcp_init(struct sock *sk)
 {
 	struct smap_psock *psock;
@@ -181,14 +219,17 @@ static int bpf_tcp_init(struct sock *sk)
 	psock->save_close = sk->sk_prot->close;
 	psock->sk_proto = sk->sk_prot;
 
-	if (psock->bpf_tx_msg) {
-		tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg;
-		tcp_bpf_proto.sendpage = bpf_tcp_sendpage;
-		tcp_bpf_proto.recvmsg = bpf_tcp_recvmsg;
-		tcp_bpf_proto.stream_memory_read = bpf_tcp_stream_read;
+	/* Build IPv6 sockmap whenever the address of tcpv6_prot changes */
+	if (sk->sk_family == AF_INET6 &&
+	    unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) {
+		spin_lock_bh(&tcpv6_prot_lock);
+		if (likely(sk->sk_prot != saved_tcpv6_prot)) {
+			build_protos(bpf_tcp_prots[SOCKMAP_IPV6], sk->sk_prot);
+			smp_store_release(&saved_tcpv6_prot, sk->sk_prot);
+		}
+		spin_unlock_bh(&tcpv6_prot_lock);
 	}
-
-	sk->sk_prot = &tcp_bpf_proto;
+	update_sk_prot(sk, psock);
 	rcu_read_unlock();
 	return 0;
 }
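The bpf_tcp_init() hunk above is a double-checked rebuild: a load-acquire fast path skips the lock when the cached tcpv6 proto pointer is current, and the test is repeated under the spinlock before rebuilding and publishing with store-release. A minimal userspace C11 sketch of that locking shape, with illustrative names (compile with -std=c11 -pthread):

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	static _Atomic(const void *) saved_base;
	static pthread_mutex_t rebuild_lock = PTHREAD_MUTEX_INITIALIZER;

	static void rebuild_table(const void *base)
	{
		/* expensive: derive a table from *base (elided) */
		printf("rebuilding for %p\n", base);
	}

	static void maybe_rebuild(const void *base)
	{
		if (atomic_load_explicit(&saved_base,
					 memory_order_acquire) == base)
			return;		/* common case: nothing to do */

		pthread_mutex_lock(&rebuild_lock);
		if (atomic_load_explicit(&saved_base,
					 memory_order_relaxed) != base) {
			rebuild_table(base);
			/* publish only after the table is fully built */
			atomic_store_explicit(&saved_base, base,
					      memory_order_release);
		}
		pthread_mutex_unlock(&rebuild_lock);
	}

	int main(void)
	{
		int proto;

		maybe_rebuild(&proto);	/* rebuilds once */
		maybe_rebuild(&proto);	/* fast path, no output */
		return 0;
	}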
@@ -219,24 +260,64 @@ out:
 	rcu_read_unlock();
 }
 
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
+					 u32 hash, void *key, u32 key_size)
+{
+	struct htab_elem *l;
+
+	hlist_for_each_entry_rcu(l, head, hash_node) {
+		if (l->hash == hash && !memcmp(&l->key, key, key_size))
+			return l;
+	}
+
+	return NULL;
+}
+
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
+{
+	return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+	return &__select_bucket(htab, hash)->head;
+}
+
 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 {
 	atomic_dec(&htab->count);
 	kfree_rcu(l, rcu);
 }
 
+static struct smap_psock_map_entry *psock_map_pop(struct sock *sk,
+						  struct smap_psock *psock)
+{
+	struct smap_psock_map_entry *e;
+
+	spin_lock_bh(&psock->maps_lock);
+	e = list_first_entry_or_null(&psock->maps,
+				     struct smap_psock_map_entry,
+				     list);
+	if (e)
+		list_del(&e->list);
+	spin_unlock_bh(&psock->maps_lock);
+	return e;
+}
+
 static void bpf_tcp_close(struct sock *sk, long timeout)
 {
 	void (*close_fun)(struct sock *sk, long timeout);
-	struct smap_psock_map_entry *e, *tmp;
+	struct smap_psock_map_entry *e;
 	struct sk_msg_buff *md, *mtmp;
 	struct smap_psock *psock;
 	struct sock *osk;
 
+	lock_sock(sk);
 	rcu_read_lock();
 	psock = smap_psock_sk(sk);
 	if (unlikely(!psock)) {
 		rcu_read_unlock();
+		release_sock(sk);
 		return sk->sk_prot->close(sk, timeout);
 	}
 
@@ -247,7 +328,6 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
 	 */
 	close_fun = psock->save_close;
 
-	write_lock_bh(&sk->sk_callback_lock);
 	if (psock->cork) {
 		free_start_sg(psock->sock, psock->cork);
 		kfree(psock->cork);
@@ -260,21 +340,40 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
 		kfree(md);
 	}
 
-	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+	e = psock_map_pop(sk, psock);
+	while (e) {
 		if (e->entry) {
 			osk = cmpxchg(e->entry, sk, NULL);
 			if (osk == sk) {
-				list_del(&e->list);
 				smap_release_sock(psock, sk);
 			}
 		} else {
-			hlist_del_rcu(&e->hash_link->hash_node);
-			smap_release_sock(psock, e->hash_link->sk);
-			free_htab_elem(e->htab, e->hash_link);
+			struct htab_elem *link = rcu_dereference(e->hash_link);
+			struct bpf_htab *htab = rcu_dereference(e->htab);
+			struct hlist_head *head;
+			struct htab_elem *l;
+			struct bucket *b;
+
+			b = __select_bucket(htab, link->hash);
+			head = &b->head;
+			raw_spin_lock_bh(&b->lock);
+			l = lookup_elem_raw(head,
+					    link->hash, link->key,
+					    htab->map.key_size);
+			/* If another thread deleted this object skip deletion.
+			 * The refcnt on psock may or may not be zero.
+			 */
+			if (l) {
+				hlist_del_rcu(&link->hash_node);
+				smap_release_sock(psock, link->sk);
+				free_htab_elem(htab, link);
+			}
+			raw_spin_unlock_bh(&b->lock);
 		}
+		e = psock_map_pop(sk, psock);
 	}
-	write_unlock_bh(&sk->sk_callback_lock);
 	rcu_read_unlock();
+	release_sock(sk);
 	close_fun(sk, timeout);
 }
 
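bpf_tcp_close() now detaches map entries one at a time through psock_map_pop() rather than walking the whole list while holding a lock across the heavyweight teardown work. A minimal userspace analogue of that pop-under-lock loop using a pthread mutex; names are illustrative (compile with -std=c11 -pthread):

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct entry {
		struct entry	*next;
		int		id;
	};

	static struct entry *head;
	static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

	static struct entry *pop_entry(void)
	{
		struct entry *e;

		pthread_mutex_lock(&list_lock);
		e = head;
		if (e)
			head = e->next;	/* unlink while holding the lock */
		pthread_mutex_unlock(&list_lock);
		return e;
	}

	int main(void)
	{
		for (int i = 0; i < 3; i++) {
			struct entry *e = malloc(sizeof(*e));

			e->id = i;
			e->next = head;
			head = e;	/* single-threaded setup */
		}

		for (struct entry *e = pop_entry(); e; e = pop_entry()) {
			printf("closing entry %d\n", e->id);	/* lock dropped */
			free(e);
		}
		return 0;
	}

Each entry is processed with the lock dropped, so teardown work never runs inside the critical section and concurrent removers stay safe.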
@@ -472,7 +571,8 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
 	while (sg[i].length) {
 		free += sg[i].length;
 		sk_mem_uncharge(sk, sg[i].length);
-		put_page(sg_page(&sg[i]));
+		if (!md->skb)
+			put_page(sg_page(&sg[i]));
 		sg[i].length = 0;
 		sg[i].page_link = 0;
 		sg[i].offset = 0;
@@ -481,6 +581,8 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
 		if (i == MAX_SKB_FRAGS)
 			i = 0;
 	}
+	if (md->skb)
+		consume_skb(md->skb);
 
 	return free;
 }
@@ -1111,8 +1213,7 @@ static void bpf_tcp_msg_add(struct smap_psock *psock,
 
 static int bpf_tcp_ulp_register(void)
 {
-	tcp_bpf_proto = tcp_prot;
-	tcp_bpf_proto.close = bpf_tcp_close;
+	build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot);
 	/* Once BPF TX ULP is registered it is never unregistered. It
 	 * will be in the ULP list for the lifetime of the system. Doing
 	 * duplicate registers is not a problem.
@@ -1135,7 +1236,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 	 */
 	TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
 	skb->sk = psock->sock;
-	bpf_compute_data_pointers(skb);
+	bpf_compute_data_end_sk_skb(skb);
 	preempt_disable();
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
 	preempt_enable();
@@ -1357,7 +1458,9 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
 {
 	if (refcount_dec_and_test(&psock->refcnt)) {
 		tcp_cleanup_ulp(sock);
+		write_lock_bh(&sock->sk_callback_lock);
 		smap_stop_sock(psock, sock);
+		write_unlock_bh(&sock->sk_callback_lock);
 		clear_bit(SMAP_TX_RUNNING, &psock->state);
 		rcu_assign_sk_user_data(sock, NULL);
 		call_rcu_sched(&psock->rcu, smap_destroy_psock);
@@ -1388,7 +1491,7 @@ static int smap_parse_func_strparser(struct strparser *strp,
 	 * any socket yet.
 	 */
 	skb->sk = psock->sock;
-	bpf_compute_data_pointers(skb);
+	bpf_compute_data_end_sk_skb(skb);
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
 	skb->sk = NULL;
 	rcu_read_unlock();
@@ -1508,6 +1611,7 @@ static struct smap_psock *smap_init_psock(struct sock *sock, int node)
 	INIT_LIST_HEAD(&psock->maps);
 	INIT_LIST_HEAD(&psock->ingress);
 	refcount_set(&psock->refcnt, 1);
+	spin_lock_init(&psock->maps_lock);
 
 	rcu_assign_sk_user_data(sock, psock);
 	sock_hold(sock);
@@ -1564,18 +1668,32 @@ free_stab:
 	return ERR_PTR(err);
 }
 
-static void smap_list_remove(struct smap_psock *psock,
-			     struct sock **entry,
-			     struct htab_elem *hash_link)
+static void smap_list_map_remove(struct smap_psock *psock,
+				 struct sock **entry)
 {
 	struct smap_psock_map_entry *e, *tmp;
 
+	spin_lock_bh(&psock->maps_lock);
 	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-		if (e->entry == entry || e->hash_link == hash_link) {
+		if (e->entry == entry)
+			list_del(&e->list);
+	}
+	spin_unlock_bh(&psock->maps_lock);
+}
+
+static void smap_list_hash_remove(struct smap_psock *psock,
+				  struct htab_elem *hash_link)
+{
+	struct smap_psock_map_entry *e, *tmp;
+
+	spin_lock_bh(&psock->maps_lock);
+	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+		struct htab_elem *c = rcu_dereference(e->hash_link);
+
+		if (c == hash_link)
 			list_del(&e->list);
-			break;
-		}
 	}
+	spin_unlock_bh(&psock->maps_lock);
 }
 
 static void sock_map_free(struct bpf_map *map)
@@ -1601,7 +1719,6 @@ static void sock_map_free(struct bpf_map *map)
 		if (!sock)
 			continue;
 
-		write_lock_bh(&sock->sk_callback_lock);
 		psock = smap_psock_sk(sock);
 		/* This check handles a racing sock event that can get the
 		 * sk_callback_lock before this case but after xchg happens
@@ -1609,10 +1726,9 @@ static void sock_map_free(struct bpf_map *map)
 		 * to be null and queued for garbage collection.
 		 */
 		if (likely(psock)) {
-			smap_list_remove(psock, &stab->sock_map[i], NULL);
+			smap_list_map_remove(psock, &stab->sock_map[i]);
 			smap_release_sock(psock, sock);
 		}
-		write_unlock_bh(&sock->sk_callback_lock);
 	}
 	rcu_read_unlock();
 
@@ -1661,17 +1777,15 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
 	if (!sock)
 		return -EINVAL;
 
-	write_lock_bh(&sock->sk_callback_lock);
 	psock = smap_psock_sk(sock);
 	if (!psock)
 		goto out;
 
 	if (psock->bpf_parse)
 		smap_stop_sock(psock, sock);
-	smap_list_remove(psock, &stab->sock_map[k], NULL);
+	smap_list_map_remove(psock, &stab->sock_map[k]);
 	smap_release_sock(psock, sock);
 out:
-	write_unlock_bh(&sock->sk_callback_lock);
 	return 0;
 }
 
@@ -1752,7 +1866,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
 		}
 	}
 
-	write_lock_bh(&sock->sk_callback_lock);
 	psock = smap_psock_sk(sock);
 
 	/* 2. Do not allow inheriting programs if psock exists and has
@@ -1789,7 +1902,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
 		e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
 		if (!e) {
 			err = -ENOMEM;
-			goto out_progs;
+			goto out_free;
 		}
 	}
 
@@ -1809,7 +1922,9 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
 		if (err)
 			goto out_free;
 		smap_init_progs(psock, verdict, parse);
+		write_lock_bh(&sock->sk_callback_lock);
 		smap_start_sock(psock, sock);
+		write_unlock_bh(&sock->sk_callback_lock);
 	}
 
 	/* 4. Place psock in sockmap for use and stop any programs on
@@ -1819,9 +1934,10 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
 	 */
 	if (map_link) {
 		e->entry = map_link;
+		spin_lock_bh(&psock->maps_lock);
 		list_add_tail(&e->list, &psock->maps);
+		spin_unlock_bh(&psock->maps_lock);
 	}
-	write_unlock_bh(&sock->sk_callback_lock);
 	return err;
 out_free:
 	smap_release_sock(psock, sock);
@@ -1832,7 +1948,6 @@ out_progs:
 	}
 	if (tx_msg)
 		bpf_prog_put(tx_msg);
-	write_unlock_bh(&sock->sk_callback_lock);
 	kfree(e);
 	return err;
 }
@@ -1869,10 +1984,8 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 	if (osock) {
 		struct smap_psock *opsock = smap_psock_sk(osock);
 
-		write_lock_bh(&osock->sk_callback_lock);
-		smap_list_remove(opsock, &stab->sock_map[i], NULL);
+		smap_list_map_remove(opsock, &stab->sock_map[i]);
 		smap_release_sock(opsock, osock);
-		write_unlock_bh(&osock->sk_callback_lock);
 	}
 out:
 	return err;
@@ -1915,6 +2028,24 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
 	return 0;
 }
 
+int sockmap_get_from_fd(const union bpf_attr *attr, int type,
+			struct bpf_prog *prog)
+{
+	int ufd = attr->target_fd;
+	struct bpf_map *map;
+	struct fd f;
+	int err;
+
+	f = fdget(ufd);
+	map = __bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	err = sock_map_prog(map, prog, attr->attach_type);
+	fdput(f);
+	return err;
+}
+
 static void *sock_map_lookup(struct bpf_map *map, void *key)
 {
 	return NULL;
@@ -1944,7 +2075,13 @@ static int sock_map_update_elem(struct bpf_map *map,
 		return -EOPNOTSUPP;
 	}
 
+	lock_sock(skops.sk);
+	preempt_disable();
+	rcu_read_lock();
 	err = sock_map_ctx_update_elem(&skops, map, key, flags);
+	rcu_read_unlock();
+	preempt_enable();
+	release_sock(skops.sk);
 	fput(socket->file);
 	return err;
 }
@@ -2043,14 +2180,13 @@ free_htab:
 	return ERR_PTR(err);
 }
 
-static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
+static void __bpf_htab_free(struct rcu_head *rcu)
 {
-	return &htab->buckets[hash & (htab->n_buckets - 1)];
-}
+	struct bpf_htab *htab;
 
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
-{
-	return &__select_bucket(htab, hash)->head;
+	htab = container_of(rcu, struct bpf_htab, rcu);
+	bpf_map_area_free(htab->buckets);
+	kfree(htab);
 }
 
 static void sock_hash_free(struct bpf_map *map)
@@ -2069,16 +2205,18 @@ static void sock_hash_free(struct bpf_map *map)
 	 */
 	rcu_read_lock();
 	for (i = 0; i < htab->n_buckets; i++) {
-		struct hlist_head *head = select_bucket(htab, i);
+		struct bucket *b = __select_bucket(htab, i);
+		struct hlist_head *head;
 		struct hlist_node *n;
 		struct htab_elem *l;
 
+		raw_spin_lock_bh(&b->lock);
+		head = &b->head;
 		hlist_for_each_entry_safe(l, n, head, hash_node) {
 			struct sock *sock = l->sk;
 			struct smap_psock *psock;
 
 			hlist_del_rcu(&l->hash_node);
-			write_lock_bh(&sock->sk_callback_lock);
 			psock = smap_psock_sk(sock);
 			/* This check handles a racing sock event that can get
 			 * the sk_callback_lock before this case but after xchg
@@ -2086,16 +2224,15 @@ static void sock_hash_free(struct bpf_map *map)
 			 * (psock) to be null and queued for garbage collection.
 			 */
 			if (likely(psock)) {
-				smap_list_remove(psock, NULL, l);
+				smap_list_hash_remove(psock, l);
 				smap_release_sock(psock, sock);
 			}
-			write_unlock_bh(&sock->sk_callback_lock);
-			kfree(l);
+			free_htab_elem(htab, l);
 		}
+		raw_spin_unlock_bh(&b->lock);
 	}
 	rcu_read_unlock();
-	bpf_map_area_free(htab->buckets);
-	kfree(htab);
+	call_rcu(&htab->rcu, __bpf_htab_free);
 }
 
 static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
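The new __bpf_htab_free() above receives only a pointer to the rcu_head embedded in the map and recovers the enclosing structure with container_of(). A tiny standalone sketch of just that recovery step (the deferred-invocation machinery is elided; names are illustrative):

	#include <stddef.h>
	#include <stdio.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct cb_head {
		void (*func)(struct cb_head *head);
	};

	struct table {
		int		n_buckets;
		struct cb_head	head;	/* embedded callback handle */
	};

	static void table_free(struct cb_head *head)
	{
		struct table *t = container_of(head, struct table, head);

		printf("freeing table with %d buckets\n", t->n_buckets);
	}

	int main(void)
	{
		struct table t = {
			.n_buckets = 16,
			.head = { .func = table_free },
		};

		/* deferred-free machinery would call this after a grace period */
		t.head.func(&t.head);
		return 0;
	}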
@@ -2122,19 +2259,6 @@ static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
 	return l_new;
 }
 
-static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
-					 u32 hash, void *key, u32 key_size)
-{
-	struct htab_elem *l;
-
-	hlist_for_each_entry_rcu(l, head, hash_node) {
-		if (l->hash == hash && !memcmp(&l->key, key, key_size))
-			return l;
-	}
-
-	return NULL;
-}
-
 static inline u32 htab_map_hash(const void *key, u32 key_len)
 {
 	return jhash(key, key_len, 0);
@@ -2230,7 +2354,10 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 	if (err)
 		goto err;
 
-	/* bpf_map_update_elem() can be called in_irq() */
+	/* psock is valid here because otherwise above *ctx_update_elem would
+	 * have thrown an error. It is safe to skip error check.
+	 */
+	psock = smap_psock_sk(sock);
 	raw_spin_lock_bh(&b->lock);
 	l_old = lookup_elem_raw(head, hash, key, key_size);
 	if (l_old && map_flags == BPF_NOEXIST) {
@@ -2248,15 +2375,12 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 		goto bucket_err;
 	}
 
-	psock = smap_psock_sk(sock);
-	if (unlikely(!psock)) {
-		err = -EINVAL;
-		goto bucket_err;
-	}
-
-	e->hash_link = l_new;
-	e->htab = container_of(map, struct bpf_htab, map);
+	rcu_assign_pointer(e->hash_link, l_new);
+	rcu_assign_pointer(e->htab,
+			   container_of(map, struct bpf_htab, map));
+	spin_lock_bh(&psock->maps_lock);
 	list_add_tail(&e->list, &psock->maps);
+	spin_unlock_bh(&psock->maps_lock);
 
 	/* add new element to the head of the list, so that
 	 * concurrent search will find it before old elem
@@ -2266,19 +2390,17 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 		psock = smap_psock_sk(l_old->sk);
 
 		hlist_del_rcu(&l_old->hash_node);
-		smap_list_remove(psock, NULL, l_old);
+		smap_list_hash_remove(psock, l_old);
 		smap_release_sock(psock, l_old->sk);
 		free_htab_elem(htab, l_old);
 	}
 	raw_spin_unlock_bh(&b->lock);
 	return 0;
 bucket_err:
+	smap_release_sock(psock, sock);
 	raw_spin_unlock_bh(&b->lock);
 err:
 	kfree(e);
-	psock = smap_psock_sk(sock);
-	if (psock)
-		smap_release_sock(psock, sock);
 	return err;
 }
 
@@ -2300,7 +2422,13 @@ static int sock_hash_update_elem(struct bpf_map *map,
 		return -EINVAL;
 	}
 
+	lock_sock(skops.sk);
+	preempt_disable();
+	rcu_read_lock();
 	err = sock_hash_ctx_update_elem(&skops, map, key, flags);
+	rcu_read_unlock();
+	preempt_enable();
+	release_sock(skops.sk);
 	fput(socket->file);
 	return err;
 }
@@ -2326,7 +2454,6 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key)
 		struct smap_psock *psock;
 
 		hlist_del_rcu(&l->hash_node);
-		write_lock_bh(&sock->sk_callback_lock);
 		psock = smap_psock_sk(sock);
 		/* This check handles a racing sock event that can get the
 		 * sk_callback_lock before this case but after xchg happens
@@ -2334,10 +2461,9 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key)
 		 * to be null and queued for garbage collection.
 		 */
 		if (likely(psock)) {
-			smap_list_remove(psock, NULL, l);
+			smap_list_hash_remove(psock, l);
 			smap_release_sock(psock, sock);
 		}
-		write_unlock_bh(&sock->sk_callback_lock);
 		free_htab_elem(htab, l);
 		ret = 0;
 	}
@@ -2359,10 +2485,8 @@ struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
 	b = __select_bucket(htab, hash);
 	head = &b->head;
 
-	raw_spin_lock_bh(&b->lock);
 	l = lookup_elem_raw(head, hash, key, key_size);
 	sk = l ? l->sk : NULL;
-	raw_spin_unlock_bh(&b->lock);
 	return sk;
 }
 
@@ -2383,6 +2507,7 @@ const struct bpf_map_ops sock_hash_ops = {
 	.map_get_next_key = sock_hash_get_next_key,
 	.map_update_elem = sock_hash_update_elem,
 	.map_delete_elem = sock_hash_delete_elem,
+	.map_release_uref = sock_map_release,
 };
 
 BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
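With the map_update_elem() change in syscall.c below, sockmap and sockhash updates from the bpf(2) syscall are routed straight to map->ops->map_update_elem(), which is what the lock_sock()-wrapped update paths above serve. A hedged userspace sketch of such an update — for these map types the value passed is, to my understanding, the socket's file descriptor; the fds are placeholders and error handling is trimmed:

	#include <linux/bpf.h>
	#include <stdint.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int bpf_update_sock(int map_fd, uint32_t key, int sock_fd)
	{
		union bpf_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.map_fd = map_fd;
		attr.key = (uint64_t)(unsigned long)&key;
		attr.value = (uint64_t)(unsigned long)&sock_fd;
		attr.flags = BPF_ANY;

		return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr,
			       sizeof(attr));
	}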
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 35dc466641f2..a31a1ba0f8ea 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -735,7 +735,9 @@ static int map_update_elem(union bpf_attr *attr)
 	if (bpf_map_is_dev_bound(map)) {
 		err = bpf_map_offload_update_elem(map, key, value, attr->flags);
 		goto out;
-	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
+		   map->map_type == BPF_MAP_TYPE_SOCKHASH ||
+		   map->map_type == BPF_MAP_TYPE_SOCKMAP) {
 		err = map->ops->map_update_elem(map, key, value, attr->flags);
 		goto out;
 	}
@@ -1483,8 +1485,6 @@ out_free_tp:
 	return err;
 }
 
-#ifdef CONFIG_CGROUP_BPF
-
 static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 					     enum bpf_attach_type attach_type)
 {
@@ -1499,40 +1499,6 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 
 #define BPF_PROG_ATTACH_LAST_FIELD attach_flags
 
-static int sockmap_get_from_fd(const union bpf_attr *attr,
-			       int type, bool attach)
-{
-	struct bpf_prog *prog = NULL;
-	int ufd = attr->target_fd;
-	struct bpf_map *map;
-	struct fd f;
-	int err;
-
-	f = fdget(ufd);
-	map = __bpf_map_get(f);
-	if (IS_ERR(map))
-		return PTR_ERR(map);
-
-	if (attach) {
-		prog = bpf_prog_get_type(attr->attach_bpf_fd, type);
-		if (IS_ERR(prog)) {
-			fdput(f);
-			return PTR_ERR(prog);
-		}
-	}
-
-	err = sock_map_prog(map, prog, attr->attach_type);
-	if (err) {
-		fdput(f);
-		if (prog)
-			bpf_prog_put(prog);
-		return err;
-	}
-
-	fdput(f);
-	return 0;
-}
-
 #define BPF_F_ATTACH_MASK \
 	(BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)
 
@@ -1540,7 +1506,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 {
 	enum bpf_prog_type ptype;
 	struct bpf_prog *prog;
-	struct cgroup *cgrp;
 	int ret;
 
 	if (!capable(CAP_NET_ADMIN))
@@ -1577,12 +1542,15 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 		ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
 		break;
 	case BPF_SK_MSG_VERDICT:
-		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, true);
+		ptype = BPF_PROG_TYPE_SK_MSG;
+		break;
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
-		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true);
+		ptype = BPF_PROG_TYPE_SK_SKB;
+		break;
 	case BPF_LIRC_MODE2:
-		return lirc_prog_attach(attr);
+		ptype = BPF_PROG_TYPE_LIRC_MODE2;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -1596,18 +1564,20 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 		return -EINVAL;
 	}
 
-	cgrp = cgroup_get_from_fd(attr->target_fd);
-	if (IS_ERR(cgrp)) {
-		bpf_prog_put(prog);
-		return PTR_ERR(cgrp);
+	switch (ptype) {
+	case BPF_PROG_TYPE_SK_SKB:
+	case BPF_PROG_TYPE_SK_MSG:
+		ret = sockmap_get_from_fd(attr, ptype, prog);
+		break;
+	case BPF_PROG_TYPE_LIRC_MODE2:
+		ret = lirc_prog_attach(attr, prog);
+		break;
+	default:
+		ret = cgroup_bpf_prog_attach(attr, ptype, prog);
 	}
 
-	ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
-				attr->attach_flags);
 	if (ret)
 		bpf_prog_put(prog);
-	cgroup_put(cgrp);
-
 	return ret;
 }
 
@@ -1616,9 +1586,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 static int bpf_prog_detach(const union bpf_attr *attr)
 {
 	enum bpf_prog_type ptype;
-	struct bpf_prog *prog;
-	struct cgroup *cgrp;
-	int ret;
 
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
@@ -1651,29 +1618,17 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 		ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
 		break;
 	case BPF_SK_MSG_VERDICT:
-		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, false);
+		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL);
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
-		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false);
+		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL);
 	case BPF_LIRC_MODE2:
 		return lirc_prog_detach(attr);
 	default:
 		return -EINVAL;
 	}
 
-	cgrp = cgroup_get_from_fd(attr->target_fd);
-	if (IS_ERR(cgrp))
-		return PTR_ERR(cgrp);
-
-	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
-	if (IS_ERR(prog))
-		prog = NULL;
-
-	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
-	if (prog)
-		bpf_prog_put(prog);
-	cgroup_put(cgrp);
-	return ret;
+	return cgroup_bpf_prog_detach(attr, ptype);
 }
 
 #define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
@@ -1681,9 +1636,6 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 static int bpf_prog_query(const union bpf_attr *attr,
 			  union bpf_attr __user *uattr)
 {
-	struct cgroup *cgrp;
-	int ret;
-
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
 	if (CHECK_ATTR(BPF_PROG_QUERY))
@@ -1711,14 +1663,9 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	default:
 		return -EINVAL;
 	}
-	cgrp = cgroup_get_from_fd(attr->query.target_fd);
-	if (IS_ERR(cgrp))
-		return PTR_ERR(cgrp);
-	ret = cgroup_bpf_query(cgrp, attr, uattr);
-	cgroup_put(cgrp);
-	return ret;
+
+	return cgroup_bpf_prog_query(attr, uattr);
 }
-#endif /* CONFIG_CGROUP_BPF */
 
 #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
 
@@ -2365,7 +2312,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_OBJ_GET:
 		err = bpf_obj_get(&attr);
 		break;
-#ifdef CONFIG_CGROUP_BPF
 	case BPF_PROG_ATTACH:
 		err = bpf_prog_attach(&attr);
 		break;
@@ -2375,7 +2321,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_PROG_QUERY:
 		err = bpf_prog_query(&attr, uattr);
 		break;
-#endif
 	case BPF_PROG_TEST_RUN:
 		err = bpf_prog_test_run(&attr, uattr);
 		break;
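After this refactor, a single BPF_PROG_ATTACH command fans out to the cgroup, sockmap, or lirc handler based on attach_type, so the command no longer depends on CONFIG_CGROUP_BPF. A hedged userspace sketch of the invocation; only the attr fields involved are shown and the fds are placeholders:

	#include <linux/bpf.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int bpf_prog_attach_fd(int prog_fd, int target_fd,
				      enum bpf_attach_type type)
	{
		union bpf_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.target_fd = target_fd;	/* cgroup fd or sockmap fd */
		attr.attach_bpf_fd = prog_fd;
		attr.attach_type = type;

		return syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
	}

The same attr layout serves all three families; only the interpretation of target_fd changes with the attach type.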
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9e2bf834f13a..63aaac52a265 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c | |||
| @@ -5430,6 +5430,10 @@ static int jit_subprogs(struct bpf_verifier_env *env) | |||
| 5430 | if (insn->code != (BPF_JMP | BPF_CALL) || | 5430 | if (insn->code != (BPF_JMP | BPF_CALL) || |
| 5431 | insn->src_reg != BPF_PSEUDO_CALL) | 5431 | insn->src_reg != BPF_PSEUDO_CALL) |
| 5432 | continue; | 5432 | continue; |
| 5433 | /* Upon error here we cannot fall back to interpreter but | ||
| 5434 | * need a hard reject of the program. Thus -EFAULT is | ||
| 5435 | * propagated in any case. | ||
| 5436 | */ | ||
| 5433 | subprog = find_subprog(env, i + insn->imm + 1); | 5437 | subprog = find_subprog(env, i + insn->imm + 1); |
| 5434 | if (subprog < 0) { | 5438 | if (subprog < 0) { |
| 5435 | WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", | 5439 | WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", |
| @@ -5450,7 +5454,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) | |||
| 5450 | 5454 | ||
| 5451 | func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL); | 5455 | func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL); |
| 5452 | if (!func) | 5456 | if (!func) |
| 5453 | return -ENOMEM; | 5457 | goto out_undo_insn; |
| 5454 | 5458 | ||
| 5455 | for (i = 0; i < env->subprog_cnt; i++) { | 5459 | for (i = 0; i < env->subprog_cnt; i++) { |
| 5456 | subprog_start = subprog_end; | 5460 | subprog_start = subprog_end; |
| @@ -5515,7 +5519,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) | |||
| 5515 | tmp = bpf_int_jit_compile(func[i]); | 5519 | tmp = bpf_int_jit_compile(func[i]); |
| 5516 | if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { | 5520 | if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { |
| 5517 | verbose(env, "JIT doesn't support bpf-to-bpf calls\n"); | 5521 | verbose(env, "JIT doesn't support bpf-to-bpf calls\n"); |
| 5518 | err = -EFAULT; | 5522 | err = -ENOTSUPP; |
| 5519 | goto out_free; | 5523 | goto out_free; |
| 5520 | } | 5524 | } |
| 5521 | cond_resched(); | 5525 | cond_resched(); |
| @@ -5552,6 +5556,7 @@ out_free: | |||
| 5552 | if (func[i]) | 5556 | if (func[i]) |
| 5553 | bpf_jit_free(func[i]); | 5557 | bpf_jit_free(func[i]); |
| 5554 | kfree(func); | 5558 | kfree(func); |
| 5559 | out_undo_insn: | ||
| 5555 | /* cleanup main prog to be interpreted */ | 5560 | /* cleanup main prog to be interpreted */ |
| 5556 | prog->jit_requested = 0; | 5561 | prog->jit_requested = 0; |
| 5557 | for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { | 5562 | for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { |
| @@ -5578,6 +5583,8 @@ static int fixup_call_args(struct bpf_verifier_env *env) | |||
| 5578 | err = jit_subprogs(env); | 5583 | err = jit_subprogs(env); |
| 5579 | if (err == 0) | 5584 | if (err == 0) |
| 5580 | return 0; | 5585 | return 0; |
| 5586 | if (err == -EFAULT) | ||
| 5587 | return err; | ||
| 5581 | } | 5588 | } |
| 5582 | #ifndef CONFIG_BPF_JIT_ALWAYS_ON | 5589 | #ifndef CONFIG_BPF_JIT_ALWAYS_ON |
| 5583 | for (i = 0; i < prog->len; i++, insn++) { | 5590 | for (i = 0; i < prog->len; i++, insn++) { |
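
Read together, the hunks above change the error contract of jit_subprogs(): -EFAULT now means a verifier-internal inconsistency that must hard-reject the program, while a JIT limitation reports -ENOTSUPP and may fall back to the interpreter. A condensed view of the resulting triage in fixup_call_args(), restating the hunks rather than adding new logic:

	err = jit_subprogs(env);
	if (err == 0)
		return 0;	/* every subprog JITed, calls patched */
	if (err == -EFAULT)
		return err;	/* verifier-internal bug: hard reject */
	/*
	 * Any other error (-ENOTSUPP from a JIT without bpf-to-bpf call
	 * support, -ENOMEM from the func[] allocation) falls through to the
	 * interpreter path below, which the new out_undo_insn label keeps
	 * safe by restoring the call instructions even when func[] was
	 * never allocated.
	 */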
diff --git a/kernel/fork.c b/kernel/fork.c index 9440d61b925c..a191c05e757d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -303,11 +303,38 @@ struct kmem_cache *files_cachep; | |||
| 303 | struct kmem_cache *fs_cachep; | 303 | struct kmem_cache *fs_cachep; |
| 304 | 304 | ||
| 305 | /* SLAB cache for vm_area_struct structures */ | 305 | /* SLAB cache for vm_area_struct structures */ |
| 306 | struct kmem_cache *vm_area_cachep; | 306 | static struct kmem_cache *vm_area_cachep; |
| 307 | 307 | ||
| 308 | /* SLAB cache for mm_struct structures (tsk->mm) */ | 308 | /* SLAB cache for mm_struct structures (tsk->mm) */ |
| 309 | static struct kmem_cache *mm_cachep; | 309 | static struct kmem_cache *mm_cachep; |
| 310 | 310 | ||
| 311 | struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) | ||
| 312 | { | ||
| 313 | struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); | ||
| 314 | |||
| 315 | if (vma) { | ||
| 316 | vma->vm_mm = mm; | ||
| 317 | INIT_LIST_HEAD(&vma->anon_vma_chain); | ||
| 318 | } | ||
| 319 | return vma; | ||
| 320 | } | ||
| 321 | |||
| 322 | struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) | ||
| 323 | { | ||
| 324 | struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | ||
| 325 | |||
| 326 | if (new) { | ||
| 327 | *new = *orig; | ||
| 328 | INIT_LIST_HEAD(&new->anon_vma_chain); | ||
| 329 | } | ||
| 330 | return new; | ||
| 331 | } | ||
| 332 | |||
| 333 | void vm_area_free(struct vm_area_struct *vma) | ||
| 334 | { | ||
| 335 | kmem_cache_free(vm_area_cachep, vma); | ||
| 336 | } | ||
| 337 | |||
| 311 | static void account_kernel_stack(struct task_struct *tsk, int account) | 338 | static void account_kernel_stack(struct task_struct *tsk, int account) |
| 312 | { | 339 | { |
| 313 | void *stack = task_stack_page(tsk); | 340 | void *stack = task_stack_page(tsk); |
| @@ -455,11 +482,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, | |||
| 455 | goto fail_nomem; | 482 | goto fail_nomem; |
| 456 | charge = len; | 483 | charge = len; |
| 457 | } | 484 | } |
| 458 | tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | 485 | tmp = vm_area_dup(mpnt); |
| 459 | if (!tmp) | 486 | if (!tmp) |
| 460 | goto fail_nomem; | 487 | goto fail_nomem; |
| 461 | *tmp = *mpnt; | ||
| 462 | INIT_LIST_HEAD(&tmp->anon_vma_chain); | ||
| 463 | retval = vma_dup_policy(mpnt, tmp); | 488 | retval = vma_dup_policy(mpnt, tmp); |
| 464 | if (retval) | 489 | if (retval) |
| 465 | goto fail_nomem_policy; | 490 | goto fail_nomem_policy; |
| @@ -539,7 +564,7 @@ fail_uprobe_end: | |||
| 539 | fail_nomem_anon_vma_fork: | 564 | fail_nomem_anon_vma_fork: |
| 540 | mpol_put(vma_policy(tmp)); | 565 | mpol_put(vma_policy(tmp)); |
| 541 | fail_nomem_policy: | 566 | fail_nomem_policy: |
| 542 | kmem_cache_free(vm_area_cachep, tmp); | 567 | vm_area_free(tmp); |
| 543 | fail_nomem: | 568 | fail_nomem: |
| 544 | retval = -ENOMEM; | 569 | retval = -ENOMEM; |
| 545 | vm_unacct_memory(charge); | 570 | vm_unacct_memory(charge); |
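
With vm_area_cachep now static, code outside kernel/fork.c that used to touch the slab cache directly has to go through the three accessors. A minimal sketch of a converted caller; only the three helper names come from the hunks above, the field setup and insert_vm_struct() error path are illustrative:

	struct vm_area_struct *vma;

	vma = vm_area_alloc(mm);	/* zeroed; vm_mm and anon_vma_chain set */
	if (!vma)
		return -ENOMEM;

	vma->vm_start = addr;		/* illustrative setup */
	vma->vm_end = addr + len;

	if (insert_vm_struct(mm, vma)) {
		vm_area_free(vma);	/* replaces kmem_cache_free(vm_area_cachep, vma) */
		return -ENOMEM;
	}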
diff --git a/kernel/kthread.c b/kernel/kthread.c index 481951bf091d..750cb8082694 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
| @@ -177,9 +177,20 @@ void *kthread_probe_data(struct task_struct *task) | |||
| 177 | static void __kthread_parkme(struct kthread *self) | 177 | static void __kthread_parkme(struct kthread *self) |
| 178 | { | 178 | { |
| 179 | for (;;) { | 179 | for (;;) { |
| 180 | set_current_state(TASK_PARKED); | 180 | /* |
| 181 | * TASK_PARKED is a special state; we must serialize against | ||
| 182 | * possible pending wakeups to avoid store-store collisions on | ||
| 183 | * task->state. | ||
| 184 | * | ||
| 185 | * Such a collision might possibly result in the task state | ||
| 186 | * changing from TASK_PARKED and us failing the | ||
| 187 | * wait_task_inactive() in kthread_park(). | ||
| 188 | */ | ||
| 189 | set_special_state(TASK_PARKED); | ||
| 181 | if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) | 190 | if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) |
| 182 | break; | 191 | break; |
| 192 | |||
| 193 | complete_all(&self->parked); | ||
| 183 | schedule(); | 194 | schedule(); |
| 184 | } | 195 | } |
| 185 | __set_current_state(TASK_RUNNING); | 196 | __set_current_state(TASK_RUNNING); |
| @@ -191,11 +202,6 @@ void kthread_parkme(void) | |||
| 191 | } | 202 | } |
| 192 | EXPORT_SYMBOL_GPL(kthread_parkme); | 203 | EXPORT_SYMBOL_GPL(kthread_parkme); |
| 193 | 204 | ||
| 194 | void kthread_park_complete(struct task_struct *k) | ||
| 195 | { | ||
| 196 | complete_all(&to_kthread(k)->parked); | ||
| 197 | } | ||
| 198 | |||
| 199 | static int kthread(void *_create) | 205 | static int kthread(void *_create) |
| 200 | { | 206 | { |
| 201 | /* Copy data: it's on kthread's stack */ | 207 | /* Copy data: it's on kthread's stack */ |
| @@ -461,6 +467,9 @@ void kthread_unpark(struct task_struct *k) | |||
| 461 | 467 | ||
| 462 | reinit_completion(&kthread->parked); | 468 | reinit_completion(&kthread->parked); |
| 463 | clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); | 469 | clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); |
| 470 | /* | ||
| 471 | * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. | ||
| 472 | */ | ||
| 464 | wake_up_state(k, TASK_PARKED); | 473 | wake_up_state(k, TASK_PARKED); |
| 465 | } | 474 | } |
| 466 | EXPORT_SYMBOL_GPL(kthread_unpark); | 475 | EXPORT_SYMBOL_GPL(kthread_unpark); |
| @@ -487,7 +496,16 @@ int kthread_park(struct task_struct *k) | |||
| 487 | set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); | 496 | set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); |
| 488 | if (k != current) { | 497 | if (k != current) { |
| 489 | wake_up_process(k); | 498 | wake_up_process(k); |
| 499 | /* | ||
| 500 | * Wait for __kthread_parkme() to complete(); this means we | ||
| 501 | * _will_ have TASK_PARKED and are about to call schedule(). | ||
| 502 | */ | ||
| 490 | wait_for_completion(&kthread->parked); | 503 | wait_for_completion(&kthread->parked); |
| 504 | /* | ||
| 505 | * Now wait for that schedule() to complete and the task to | ||
| 506 | * get scheduled out. | ||
| 507 | */ | ||
| 508 | WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED)); | ||
| 491 | } | 509 | } |
| 492 | 510 | ||
| 493 | return 0; | 511 | return 0; |
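
The comments above describe a handshake whose two halves live in separate hunks; laid out side by side as a reading aid, assuming nothing beyond the code shown:

	/*
	 * kthread_park() caller              parked kthread
	 * ---------------------              --------------
	 * set_bit(KTHREAD_SHOULD_PARK)
	 * wake_up_process(k)
	 *                                    set_special_state(TASK_PARKED)
	 *                                    test_bit(KTHREAD_SHOULD_PARK)
	 *                                    complete_all(&self->parked)
	 * wait_for_completion(&parked)
	 *                                    schedule()
	 * wait_task_inactive(k, TASK_PARKED)
	 *
	 * The completion now fires before schedule(), so kthread_park()
	 * adds wait_task_inactive() to guarantee the task is actually off
	 * the CPU before returning; the old kthread_park_complete() call
	 * from the scheduler tail (removed from finish_task_switch() in
	 * the sched/core.c diff below) is no longer needed.
	 */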
diff --git a/kernel/rseq.c b/kernel/rseq.c index 22b6acf1ad63..c6242d8594dc 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c | |||
| @@ -85,9 +85,9 @@ static int rseq_update_cpu_id(struct task_struct *t) | |||
| 85 | { | 85 | { |
| 86 | u32 cpu_id = raw_smp_processor_id(); | 86 | u32 cpu_id = raw_smp_processor_id(); |
| 87 | 87 | ||
| 88 | if (__put_user(cpu_id, &t->rseq->cpu_id_start)) | 88 | if (put_user(cpu_id, &t->rseq->cpu_id_start)) |
| 89 | return -EFAULT; | 89 | return -EFAULT; |
| 90 | if (__put_user(cpu_id, &t->rseq->cpu_id)) | 90 | if (put_user(cpu_id, &t->rseq->cpu_id)) |
| 91 | return -EFAULT; | 91 | return -EFAULT; |
| 92 | trace_rseq_update(t); | 92 | trace_rseq_update(t); |
| 93 | return 0; | 93 | return 0; |
| @@ -100,14 +100,14 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t) | |||
| 100 | /* | 100 | /* |
| 101 | * Reset cpu_id_start to its initial state (0). | 101 | * Reset cpu_id_start to its initial state (0). |
| 102 | */ | 102 | */ |
| 103 | if (__put_user(cpu_id_start, &t->rseq->cpu_id_start)) | 103 | if (put_user(cpu_id_start, &t->rseq->cpu_id_start)) |
| 104 | return -EFAULT; | 104 | return -EFAULT; |
| 105 | /* | 105 | /* |
| 106 | * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming | 106 | * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming |
| 107 | * in after unregistration can figure out that rseq needs to be | 107 | * in after unregistration can figure out that rseq needs to be |
| 108 | * registered again. | 108 | * registered again. |
| 109 | */ | 109 | */ |
| 110 | if (__put_user(cpu_id, &t->rseq->cpu_id)) | 110 | if (put_user(cpu_id, &t->rseq->cpu_id)) |
| 111 | return -EFAULT; | 111 | return -EFAULT; |
| 112 | return 0; | 112 | return 0; |
| 113 | } | 113 | } |
| @@ -115,29 +115,36 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t) | |||
| 115 | static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) | 115 | static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) |
| 116 | { | 116 | { |
| 117 | struct rseq_cs __user *urseq_cs; | 117 | struct rseq_cs __user *urseq_cs; |
| 118 | unsigned long ptr; | 118 | u64 ptr; |
| 119 | u32 __user *usig; | 119 | u32 __user *usig; |
| 120 | u32 sig; | 120 | u32 sig; |
| 121 | int ret; | 121 | int ret; |
| 122 | 122 | ||
| 123 | ret = __get_user(ptr, &t->rseq->rseq_cs); | 123 | if (copy_from_user(&ptr, &t->rseq->rseq_cs.ptr64, sizeof(ptr))) |
| 124 | if (ret) | 124 | return -EFAULT; |
| 125 | return ret; | ||
| 126 | if (!ptr) { | 125 | if (!ptr) { |
| 127 | memset(rseq_cs, 0, sizeof(*rseq_cs)); | 126 | memset(rseq_cs, 0, sizeof(*rseq_cs)); |
| 128 | return 0; | 127 | return 0; |
| 129 | } | 128 | } |
| 130 | urseq_cs = (struct rseq_cs __user *)ptr; | 129 | if (ptr >= TASK_SIZE) |
| 130 | return -EINVAL; | ||
| 131 | urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; | ||
| 131 | if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) | 132 | if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) |
| 132 | return -EFAULT; | 133 | return -EFAULT; |
| 133 | if (rseq_cs->version > 0) | ||
| 134 | return -EINVAL; | ||
| 135 | 134 | ||
| 135 | if (rseq_cs->start_ip >= TASK_SIZE || | ||
| 136 | rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || | ||
| 137 | rseq_cs->abort_ip >= TASK_SIZE || | ||
| 138 | rseq_cs->version > 0) | ||
| 139 | return -EINVAL; | ||
| 140 | /* Check for overflow. */ | ||
| 141 | if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) | ||
| 142 | return -EINVAL; | ||
| 136 | /* Ensure that abort_ip is not in the critical section. */ | 143 | /* Ensure that abort_ip is not in the critical section. */ |
| 137 | if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) | 144 | if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) |
| 138 | return -EINVAL; | 145 | return -EINVAL; |
| 139 | 146 | ||
| 140 | usig = (u32 __user *)(rseq_cs->abort_ip - sizeof(u32)); | 147 | usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); |
| 141 | ret = get_user(sig, usig); | 148 | ret = get_user(sig, usig); |
| 142 | if (ret) | 149 | if (ret) |
| 143 | return ret; | 150 | return ret; |
| @@ -146,7 +153,7 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) | |||
| 146 | printk_ratelimited(KERN_WARNING | 153 | printk_ratelimited(KERN_WARNING |
| 147 | "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", | 154 | "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", |
| 148 | sig, current->rseq_sig, current->pid, usig); | 155 | sig, current->rseq_sig, current->pid, usig); |
| 149 | return -EPERM; | 156 | return -EINVAL; |
| 150 | } | 157 | } |
| 151 | return 0; | 158 | return 0; |
| 152 | } | 159 | } |
| @@ -157,7 +164,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags) | |||
| 157 | int ret; | 164 | int ret; |
| 158 | 165 | ||
| 159 | /* Get thread flags. */ | 166 | /* Get thread flags. */ |
| 160 | ret = __get_user(flags, &t->rseq->flags); | 167 | ret = get_user(flags, &t->rseq->flags); |
| 161 | if (ret) | 168 | if (ret) |
| 162 | return ret; | 169 | return ret; |
| 163 | 170 | ||
| @@ -195,9 +202,11 @@ static int clear_rseq_cs(struct task_struct *t) | |||
| 195 | * of code outside of the rseq assembly block. This performs | 202 | * of code outside of the rseq assembly block. This performs |
| 196 | * a lazy clear of the rseq_cs field. | 203 | * a lazy clear of the rseq_cs field. |
| 197 | * | 204 | * |
| 198 | * Set rseq_cs to NULL with single-copy atomicity. | 205 | * Set rseq_cs to NULL. |
| 199 | */ | 206 | */ |
| 200 | return __put_user(0UL, &t->rseq->rseq_cs); | 207 | if (clear_user(&t->rseq->rseq_cs.ptr64, sizeof(t->rseq->rseq_cs.ptr64))) |
| 208 | return -EFAULT; | ||
| 209 | return 0; | ||
| 201 | } | 210 | } |
| 202 | 211 | ||
| 203 | /* | 212 | /* |
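
The new checks in rseq_get_rseq_cs() are spread across two hunks; collected into one illustrative validator (field names follow the hunks, but this helper itself is not in the patch):

	static int rseq_cs_valid(const struct rseq_cs *cs)
	{
		/* only version 0 is defined */
		if (cs->version > 0)
			return -EINVAL;
		/* all three code addresses must be userspace addresses */
		if (cs->start_ip >= TASK_SIZE ||
		    cs->start_ip + cs->post_commit_offset >= TASK_SIZE ||
		    cs->abort_ip >= TASK_SIZE)
			return -EINVAL;
		/* start_ip + post_commit_offset must not wrap around */
		if (cs->start_ip + cs->post_commit_offset < cs->start_ip)
			return -EINVAL;
		/* abort handler must sit outside [start_ip, start_ip + offset) */
		if (cs->abort_ip - cs->start_ip < cs->post_commit_offset)
			return -EINVAL;
		return 0;
	}

The rseq_cs pointer itself gets the same TASK_SIZE test before being dereferenced, and it is now read as an explicit u64 (rseq_cs.ptr64) so a 32-bit kernel cannot silently truncate a 64-bit value written by userspace.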
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 78d8facba456..fe365c9a08e9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -7,7 +7,6 @@ | |||
| 7 | */ | 7 | */ |
| 8 | #include "sched.h" | 8 | #include "sched.h" |
| 9 | 9 | ||
| 10 | #include <linux/kthread.h> | ||
| 11 | #include <linux/nospec.h> | 10 | #include <linux/nospec.h> |
| 12 | 11 | ||
| 13 | #include <linux/kcov.h> | 12 | #include <linux/kcov.h> |
| @@ -2724,28 +2723,20 @@ static struct rq *finish_task_switch(struct task_struct *prev) | |||
| 2724 | membarrier_mm_sync_core_before_usermode(mm); | 2723 | membarrier_mm_sync_core_before_usermode(mm); |
| 2725 | mmdrop(mm); | 2724 | mmdrop(mm); |
| 2726 | } | 2725 | } |
| 2727 | if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) { | 2726 | if (unlikely(prev_state == TASK_DEAD)) { |
| 2728 | switch (prev_state) { | 2727 | if (prev->sched_class->task_dead) |
| 2729 | case TASK_DEAD: | 2728 | prev->sched_class->task_dead(prev); |
| 2730 | if (prev->sched_class->task_dead) | ||
| 2731 | prev->sched_class->task_dead(prev); | ||
| 2732 | 2729 | ||
| 2733 | /* | 2730 | /* |
| 2734 | * Remove function-return probe instances associated with this | 2731 | * Remove function-return probe instances associated with this |
| 2735 | * task and put them back on the free list. | 2732 | * task and put them back on the free list. |
| 2736 | */ | 2733 | */ |
| 2737 | kprobe_flush_task(prev); | 2734 | kprobe_flush_task(prev); |
| 2738 | |||
| 2739 | /* Task is done with its stack. */ | ||
| 2740 | put_task_stack(prev); | ||
| 2741 | 2735 | ||
| 2742 | put_task_struct(prev); | 2736 | /* Task is done with its stack. */ |
| 2743 | break; | 2737 | put_task_stack(prev); |
| 2744 | 2738 | ||
| 2745 | case TASK_PARKED: | 2739 | put_task_struct(prev); |
| 2746 | kthread_park_complete(prev); | ||
| 2747 | break; | ||
| 2748 | } | ||
| 2749 | } | 2740 | } |
| 2750 | 2741 | ||
| 2751 | tick_nohz_task_switch(); | 2742 | tick_nohz_task_switch(); |
| @@ -3113,7 +3104,9 @@ static void sched_tick_remote(struct work_struct *work) | |||
| 3113 | struct tick_work *twork = container_of(dwork, struct tick_work, work); | 3104 | struct tick_work *twork = container_of(dwork, struct tick_work, work); |
| 3114 | int cpu = twork->cpu; | 3105 | int cpu = twork->cpu; |
| 3115 | struct rq *rq = cpu_rq(cpu); | 3106 | struct rq *rq = cpu_rq(cpu); |
| 3107 | struct task_struct *curr; | ||
| 3116 | struct rq_flags rf; | 3108 | struct rq_flags rf; |
| 3109 | u64 delta; | ||
| 3117 | 3110 | ||
| 3118 | /* | 3111 | /* |
| 3119 | * Handle the tick only if it appears the remote CPU is running in full | 3112 | * Handle the tick only if it appears the remote CPU is running in full |
| @@ -3122,24 +3115,28 @@ static void sched_tick_remote(struct work_struct *work) | |||
| 3122 | * statistics and checks timeslices in a time-independent way, regardless | 3115 | * statistics and checks timeslices in a time-independent way, regardless |
| 3123 | * of when exactly it is running. | 3116 | * of when exactly it is running. |
| 3124 | */ | 3117 | */ |
| 3125 | if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) { | 3118 | if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) |
| 3126 | struct task_struct *curr; | 3119 | goto out_requeue; |
| 3127 | u64 delta; | ||
| 3128 | 3120 | ||
| 3129 | rq_lock_irq(rq, &rf); | 3121 | rq_lock_irq(rq, &rf); |
| 3130 | update_rq_clock(rq); | 3122 | curr = rq->curr; |
| 3131 | curr = rq->curr; | 3123 | if (is_idle_task(curr)) |
| 3132 | delta = rq_clock_task(rq) - curr->se.exec_start; | 3124 | goto out_unlock; |
| 3133 | 3125 | ||
| 3134 | /* | 3126 | update_rq_clock(rq); |
| 3135 | * Make sure the next tick runs within a reasonable | 3127 | delta = rq_clock_task(rq) - curr->se.exec_start; |
| 3136 | * amount of time. | 3128 | |
| 3137 | */ | 3129 | /* |
| 3138 | WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); | 3130 | * Make sure the next tick runs within a reasonable |
| 3139 | curr->sched_class->task_tick(rq, curr, 0); | 3131 | * amount of time. |
| 3140 | rq_unlock_irq(rq, &rf); | 3132 | */ |
| 3141 | } | 3133 | WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); |
| 3134 | curr->sched_class->task_tick(rq, curr, 0); | ||
| 3135 | |||
| 3136 | out_unlock: | ||
| 3137 | rq_unlock_irq(rq, &rf); | ||
| 3142 | 3138 | ||
| 3139 | out_requeue: | ||
| 3143 | /* | 3140 | /* |
| 3144 | * Run the remote tick once per second (1Hz). This arbitrary | 3141 | * Run the remote tick once per second (1Hz). This arbitrary |
| 3145 | * frequency is large enough to avoid overload but short enough | 3142 | * frequency is large enough to avoid overload but short enough |
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 3cde46483f0a..c907fde01eaa 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c | |||
| @@ -192,7 +192,7 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) | |||
| 192 | { | 192 | { |
| 193 | struct rq *rq = cpu_rq(sg_cpu->cpu); | 193 | struct rq *rq = cpu_rq(sg_cpu->cpu); |
| 194 | 194 | ||
| 195 | if (rq->rt.rt_nr_running) | 195 | if (rt_rq_is_runnable(&rq->rt)) |
| 196 | return sg_cpu->max; | 196 | return sg_cpu->max; |
| 197 | 197 | ||
| 198 | /* | 198 | /* |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1866e64792a7..2f0a0be4d344 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -3982,18 +3982,10 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) | |||
| 3982 | if (!sched_feat(UTIL_EST)) | 3982 | if (!sched_feat(UTIL_EST)) |
| 3983 | return; | 3983 | return; |
| 3984 | 3984 | ||
| 3985 | /* | 3985 | /* Update root cfs_rq's estimated utilization */ |
| 3986 | * Update root cfs_rq's estimated utilization | 3986 | ue.enqueued = cfs_rq->avg.util_est.enqueued; |
| 3987 | * | 3987 | ue.enqueued -= min_t(unsigned int, ue.enqueued, |
| 3988 | * If *p is the last task then the root cfs_rq's estimated utilization | 3988 | (_task_util_est(p) | UTIL_AVG_UNCHANGED)); |
| 3989 | * of a CPU is 0 by definition. | ||
| 3990 | */ | ||
| 3991 | ue.enqueued = 0; | ||
| 3992 | if (cfs_rq->nr_running) { | ||
| 3993 | ue.enqueued = cfs_rq->avg.util_est.enqueued; | ||
| 3994 | ue.enqueued -= min_t(unsigned int, ue.enqueued, | ||
| 3995 | (_task_util_est(p) | UTIL_AVG_UNCHANGED)); | ||
| 3996 | } | ||
| 3997 | WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); | 3989 | WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); |
| 3998 | 3990 | ||
| 3999 | /* | 3991 | /* |
| @@ -4590,6 +4582,7 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) | |||
| 4590 | now = sched_clock_cpu(smp_processor_id()); | 4582 | now = sched_clock_cpu(smp_processor_id()); |
| 4591 | cfs_b->runtime = cfs_b->quota; | 4583 | cfs_b->runtime = cfs_b->quota; |
| 4592 | cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); | 4584 | cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); |
| 4585 | cfs_b->expires_seq++; | ||
| 4593 | } | 4586 | } |
| 4594 | 4587 | ||
| 4595 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | 4588 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) |
| @@ -4612,6 +4605,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) | |||
| 4612 | struct task_group *tg = cfs_rq->tg; | 4605 | struct task_group *tg = cfs_rq->tg; |
| 4613 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | 4606 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); |
| 4614 | u64 amount = 0, min_amount, expires; | 4607 | u64 amount = 0, min_amount, expires; |
| 4608 | int expires_seq; | ||
| 4615 | 4609 | ||
| 4616 | /* note: this is a positive sum as runtime_remaining <= 0 */ | 4610 | /* note: this is a positive sum as runtime_remaining <= 0 */ |
| 4617 | min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; | 4611 | min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; |
| @@ -4628,6 +4622,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) | |||
| 4628 | cfs_b->idle = 0; | 4622 | cfs_b->idle = 0; |
| 4629 | } | 4623 | } |
| 4630 | } | 4624 | } |
| 4625 | expires_seq = cfs_b->expires_seq; | ||
| 4631 | expires = cfs_b->runtime_expires; | 4626 | expires = cfs_b->runtime_expires; |
| 4632 | raw_spin_unlock(&cfs_b->lock); | 4627 | raw_spin_unlock(&cfs_b->lock); |
| 4633 | 4628 | ||
| @@ -4637,8 +4632,10 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) | |||
| 4637 | * spread between our sched_clock and the one on which runtime was | 4632 | * spread between our sched_clock and the one on which runtime was |
| 4638 | * issued. | 4633 | * issued. |
| 4639 | */ | 4634 | */ |
| 4640 | if ((s64)(expires - cfs_rq->runtime_expires) > 0) | 4635 | if (cfs_rq->expires_seq != expires_seq) { |
| 4636 | cfs_rq->expires_seq = expires_seq; | ||
| 4641 | cfs_rq->runtime_expires = expires; | 4637 | cfs_rq->runtime_expires = expires; |
| 4638 | } | ||
| 4642 | 4639 | ||
| 4643 | return cfs_rq->runtime_remaining > 0; | 4640 | return cfs_rq->runtime_remaining > 0; |
| 4644 | } | 4641 | } |
| @@ -4664,12 +4661,9 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) | |||
| 4664 | * has not truly expired. | 4661 | * has not truly expired. |
| 4665 | * | 4662 | * |
| 4666 | * Fortunately we can determine whether this is the case by checking | 4663 | * whether the global deadline (cfs_b->expires_seq) has advanced. |
| 4667 | * whether the global deadline has advanced. It is valid to compare | 4664 | * whether the global deadline(cfs_b->expires_seq) has advanced. |
| 4668 | * cfs_b->runtime_expires without any locks since we only care about | ||
| 4669 | * exact equality, so a partial write will still work. | ||
| 4670 | */ | 4665 | */ |
| 4671 | 4666 | if (cfs_rq->expires_seq == cfs_b->expires_seq) { | |
| 4672 | if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { | ||
| 4673 | /* extend local deadline, drift is bounded above by 2 ticks */ | 4667 | /* extend local deadline, drift is bounded above by 2 ticks */ |
| 4674 | cfs_rq->runtime_expires += TICK_NSEC; | 4668 | cfs_rq->runtime_expires += TICK_NSEC; |
| 4675 | } else { | 4669 | } else { |
| @@ -5202,13 +5196,18 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) | |||
| 5202 | 5196 | ||
| 5203 | void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | 5197 | void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) |
| 5204 | { | 5198 | { |
| 5199 | u64 overrun; | ||
| 5200 | |||
| 5205 | lockdep_assert_held(&cfs_b->lock); | 5201 | lockdep_assert_held(&cfs_b->lock); |
| 5206 | 5202 | ||
| 5207 | if (!cfs_b->period_active) { | 5203 | if (cfs_b->period_active) |
| 5208 | cfs_b->period_active = 1; | 5204 | return; |
| 5209 | hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); | 5205 | |
| 5210 | hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); | 5206 | cfs_b->period_active = 1; |
| 5211 | } | 5207 | overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); |
| 5208 | cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period); | ||
| 5209 | cfs_b->expires_seq++; | ||
| 5210 | hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); | ||
| 5212 | } | 5211 | } |
| 5213 | 5212 | ||
| 5214 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | 5213 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) |
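
The sequence counter replaces a cross-CPU clock comparison, and the pairing is easier to follow outside diff form. A sketch of the protocol using only fields from the hunks above:

	/* Writer (global pool), under cfs_b->lock: */
	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
	cfs_b->expires_seq++;			/* new deadline generation */

	/* Reader (per-rq cfs_rq), when runtime is handed out: */
	if (cfs_rq->expires_seq != expires_seq) {
		cfs_rq->expires_seq = expires_seq;	/* adopt new generation */
		cfs_rq->runtime_expires = expires;
	}

	/*
	 * Expiry check: equal sequence numbers mean the same period, so an
	 * apparently expired local deadline can only be sched_clock drift
	 * between CPUs; it is then extended by TICK_NSEC instead of the
	 * runtime being expired.
	 */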
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 47556b0c9a95..572567078b60 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -508,8 +508,11 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) | |||
| 508 | 508 | ||
| 509 | rt_se = rt_rq->tg->rt_se[cpu]; | 509 | rt_se = rt_rq->tg->rt_se[cpu]; |
| 510 | 510 | ||
| 511 | if (!rt_se) | 511 | if (!rt_se) { |
| 512 | dequeue_top_rt_rq(rt_rq); | 512 | dequeue_top_rt_rq(rt_rq); |
| 513 | /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ | ||
| 514 | cpufreq_update_util(rq_of_rt_rq(rt_rq), 0); | ||
| 515 | } | ||
| 513 | else if (on_rt_rq(rt_se)) | 516 | else if (on_rt_rq(rt_se)) |
| 514 | dequeue_rt_entity(rt_se, 0); | 517 | dequeue_rt_entity(rt_se, 0); |
| 515 | } | 518 | } |
| @@ -1001,8 +1004,6 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq) | |||
| 1001 | sub_nr_running(rq, rt_rq->rt_nr_running); | 1004 | sub_nr_running(rq, rt_rq->rt_nr_running); |
| 1002 | rt_rq->rt_queued = 0; | 1005 | rt_rq->rt_queued = 0; |
| 1003 | 1006 | ||
| 1004 | /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ | ||
| 1005 | cpufreq_update_util(rq, 0); | ||
| 1006 | } | 1007 | } |
| 1007 | 1008 | ||
| 1008 | static void | 1009 | static void |
| @@ -1014,11 +1015,14 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq) | |||
| 1014 | 1015 | ||
| 1015 | if (rt_rq->rt_queued) | 1016 | if (rt_rq->rt_queued) |
| 1016 | return; | 1017 | return; |
| 1017 | if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running) | 1018 | |
| 1019 | if (rt_rq_throttled(rt_rq)) | ||
| 1018 | return; | 1020 | return; |
| 1019 | 1021 | ||
| 1020 | add_nr_running(rq, rt_rq->rt_nr_running); | 1022 | if (rt_rq->rt_nr_running) { |
| 1021 | rt_rq->rt_queued = 1; | 1023 | add_nr_running(rq, rt_rq->rt_nr_running); |
| 1024 | rt_rq->rt_queued = 1; | ||
| 1025 | } | ||
| 1022 | 1026 | ||
| 1023 | /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ | 1027 | /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ |
| 1024 | cpufreq_update_util(rq, 0); | 1028 | cpufreq_update_util(rq, 0); |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6601baf2361c..c7742dcc136c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -334,9 +334,10 @@ struct cfs_bandwidth { | |||
| 334 | u64 runtime; | 334 | u64 runtime; |
| 335 | s64 hierarchical_quota; | 335 | s64 hierarchical_quota; |
| 336 | u64 runtime_expires; | 336 | u64 runtime_expires; |
| 337 | int expires_seq; | ||
| 337 | 338 | ||
| 338 | int idle; | 339 | short idle; |
| 339 | int period_active; | 340 | short period_active; |
| 340 | struct hrtimer period_timer; | 341 | struct hrtimer period_timer; |
| 341 | struct hrtimer slack_timer; | 342 | struct hrtimer slack_timer; |
| 342 | struct list_head throttled_cfs_rq; | 343 | struct list_head throttled_cfs_rq; |
| @@ -551,6 +552,7 @@ struct cfs_rq { | |||
| 551 | 552 | ||
| 552 | #ifdef CONFIG_CFS_BANDWIDTH | 553 | #ifdef CONFIG_CFS_BANDWIDTH |
| 553 | int runtime_enabled; | 554 | int runtime_enabled; |
| 555 | int expires_seq; | ||
| 554 | u64 runtime_expires; | 556 | u64 runtime_expires; |
| 555 | s64 runtime_remaining; | 557 | s64 runtime_remaining; |
| 556 | 558 | ||
| @@ -609,6 +611,11 @@ struct rt_rq { | |||
| 609 | #endif | 611 | #endif |
| 610 | }; | 612 | }; |
| 611 | 613 | ||
| 614 | static inline bool rt_rq_is_runnable(struct rt_rq *rt_rq) | ||
| 615 | { | ||
| 616 | return rt_rq->rt_queued && rt_rq->rt_nr_running; | ||
| 617 | } | ||
| 618 | |||
| 612 | /* Deadline class' related fields in a runqueue */ | 619 | /* Deadline class' related fields in a runqueue */ |
| 613 | struct dl_rq { | 620 | struct dl_rq { |
| 614 | /* runqueue is an rbtree, ordered by deadline */ | 621 | /* runqueue is an rbtree, ordered by deadline */ |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 900dcfee542c..75ffc1d1a2e0 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -79,12 +79,16 @@ static void wakeup_softirqd(void) | |||
| 79 | 79 | ||
| 80 | /* | 80 | /* |
| 81 | * If ksoftirqd is scheduled, we do not want to process pending softirqs | 81 | * If ksoftirqd is scheduled, we do not want to process pending softirqs |
| 82 | * right now. Let ksoftirqd handle this at its own rate, to get fairness. | 82 | * right now. Let ksoftirqd handle this at its own rate, to get fairness, |
| 83 | * unless we're doing some of the synchronous softirqs. | ||
| 83 | */ | 84 | */ |
| 84 | static bool ksoftirqd_running(void) | 85 | #define SOFTIRQ_NOW_MASK ((1 << HI_SOFTIRQ) | (1 << TASKLET_SOFTIRQ)) |
| 86 | static bool ksoftirqd_running(unsigned long pending) | ||
| 85 | { | 87 | { |
| 86 | struct task_struct *tsk = __this_cpu_read(ksoftirqd); | 88 | struct task_struct *tsk = __this_cpu_read(ksoftirqd); |
| 87 | 89 | ||
| 90 | if (pending & SOFTIRQ_NOW_MASK) | ||
| 91 | return false; | ||
| 88 | return tsk && (tsk->state == TASK_RUNNING); | 92 | return tsk && (tsk->state == TASK_RUNNING); |
| 89 | } | 93 | } |
| 90 | 94 | ||
| @@ -328,7 +332,7 @@ asmlinkage __visible void do_softirq(void) | |||
| 328 | 332 | ||
| 329 | pending = local_softirq_pending(); | 333 | pending = local_softirq_pending(); |
| 330 | 334 | ||
| 331 | if (pending && !ksoftirqd_running()) | 335 | if (pending && !ksoftirqd_running(pending)) |
| 332 | do_softirq_own_stack(); | 336 | do_softirq_own_stack(); |
| 333 | 337 | ||
| 334 | local_irq_restore(flags); | 338 | local_irq_restore(flags); |
| @@ -355,7 +359,7 @@ void irq_enter(void) | |||
| 355 | 359 | ||
| 356 | static inline void invoke_softirq(void) | 360 | static inline void invoke_softirq(void) |
| 357 | { | 361 | { |
| 358 | if (ksoftirqd_running()) | 362 | if (ksoftirqd_running(local_softirq_pending())) |
| 359 | return; | 363 | return; |
| 360 | 364 | ||
| 361 | if (!force_irqthreads) { | 365 | if (!force_irqthreads) { |
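
The effect of SOFTIRQ_NOW_MASK reduces to a small decision table; spelled out here as a restatement of the hunks, not new behaviour:

	/*
	 * pending & (HI|TASKLET)   ksoftirqd on this CPU   result
	 * ----------------------   ---------------------   ------------------
	 * set                      running or not          handled inline now
	 * clear                    TASK_RUNNING            left to ksoftirqd
	 * clear                    sleeping/absent         handled inline now
	 *
	 * i.e. ksoftirqd_running(pending) short-circuits to false whenever a
	 * "synchronous" softirq (HI_SOFTIRQ or TASKLET_SOFTIRQ) is pending,
	 * so those no longer suffer ksoftirqd scheduling latency.
	 */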
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b7005dd21ec1..14de3727b18e 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
| @@ -277,8 +277,7 @@ static bool tick_check_preferred(struct clock_event_device *curdev, | |||
| 277 | */ | 277 | */ |
| 278 | return !curdev || | 278 | return !curdev || |
| 279 | newdev->rating > curdev->rating || | 279 | newdev->rating > curdev->rating || |
| 280 | (!cpumask_equal(curdev->cpumask, newdev->cpumask) && | 280 | !cpumask_equal(curdev->cpumask, newdev->cpumask); |
| 281 | !tick_check_percpu(curdev, newdev, smp_processor_id())); | ||
| 282 | } | 281 | } |
| 283 | 282 | ||
| 284 | /* | 283 | /* |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index efed9c1cfb7e..caf9cbf35816 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
| @@ -192,17 +192,6 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, | |||
| 192 | op->saved_func(ip, parent_ip, op, regs); | 192 | op->saved_func(ip, parent_ip, op, regs); |
| 193 | } | 193 | } |
| 194 | 194 | ||
| 195 | /** | ||
| 196 | * clear_ftrace_function - reset the ftrace function | ||
| 197 | * | ||
| 198 | * This NULLs the ftrace function and in essence stops | ||
| 199 | * tracing. There may be lag | ||
| 200 | */ | ||
| 201 | void clear_ftrace_function(void) | ||
| 202 | { | ||
| 203 | ftrace_trace_function = ftrace_stub; | ||
| 204 | } | ||
| 205 | |||
| 206 | static void ftrace_sync(struct work_struct *work) | 195 | static void ftrace_sync(struct work_struct *work) |
| 207 | { | 196 | { |
| 208 | /* | 197 | /* |
| @@ -6689,7 +6678,7 @@ void ftrace_kill(void) | |||
| 6689 | { | 6678 | { |
| 6690 | ftrace_disabled = 1; | 6679 | ftrace_disabled = 1; |
| 6691 | ftrace_enabled = 0; | 6680 | ftrace_enabled = 0; |
| 6692 | clear_ftrace_function(); | 6681 | ftrace_trace_function = ftrace_stub; |
| 6693 | } | 6682 | } |
| 6694 | 6683 | ||
| 6695 | /** | 6684 | /** |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a0079b4c7a49..87cf25171fb8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -2953,6 +2953,7 @@ out_nobuffer: | |||
| 2953 | } | 2953 | } |
| 2954 | EXPORT_SYMBOL_GPL(trace_vbprintk); | 2954 | EXPORT_SYMBOL_GPL(trace_vbprintk); |
| 2955 | 2955 | ||
| 2956 | __printf(3, 0) | ||
| 2956 | static int | 2957 | static int |
| 2957 | __trace_array_vprintk(struct ring_buffer *buffer, | 2958 | __trace_array_vprintk(struct ring_buffer *buffer, |
| 2958 | unsigned long ip, const char *fmt, va_list args) | 2959 | unsigned long ip, const char *fmt, va_list args) |
| @@ -3007,12 +3008,14 @@ out_nobuffer: | |||
| 3007 | return len; | 3008 | return len; |
| 3008 | } | 3009 | } |
| 3009 | 3010 | ||
| 3011 | __printf(3, 0) | ||
| 3010 | int trace_array_vprintk(struct trace_array *tr, | 3012 | int trace_array_vprintk(struct trace_array *tr, |
| 3011 | unsigned long ip, const char *fmt, va_list args) | 3013 | unsigned long ip, const char *fmt, va_list args) |
| 3012 | { | 3014 | { |
| 3013 | return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args); | 3015 | return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args); |
| 3014 | } | 3016 | } |
| 3015 | 3017 | ||
| 3018 | __printf(3, 0) | ||
| 3016 | int trace_array_printk(struct trace_array *tr, | 3019 | int trace_array_printk(struct trace_array *tr, |
| 3017 | unsigned long ip, const char *fmt, ...) | 3020 | unsigned long ip, const char *fmt, ...) |
| 3018 | { | 3021 | { |
| @@ -3028,6 +3031,7 @@ int trace_array_printk(struct trace_array *tr, | |||
| 3028 | return ret; | 3031 | return ret; |
| 3029 | } | 3032 | } |
| 3030 | 3033 | ||
| 3034 | __printf(3, 4) | ||
| 3031 | int trace_array_printk_buf(struct ring_buffer *buffer, | 3035 | int trace_array_printk_buf(struct ring_buffer *buffer, |
| 3032 | unsigned long ip, const char *fmt, ...) | 3036 | unsigned long ip, const char *fmt, ...) |
| 3033 | { | 3037 | { |
| @@ -3043,6 +3047,7 @@ int trace_array_printk_buf(struct ring_buffer *buffer, | |||
| 3043 | return ret; | 3047 | return ret; |
| 3044 | } | 3048 | } |
| 3045 | 3049 | ||
| 3050 | __printf(2, 0) | ||
| 3046 | int trace_vprintk(unsigned long ip, const char *fmt, va_list args) | 3051 | int trace_vprintk(unsigned long ip, const char *fmt, va_list args) |
| 3047 | { | 3052 | { |
| 3048 | return trace_array_vprintk(&global_trace, ip, fmt, args); | 3053 | return trace_array_vprintk(&global_trace, ip, fmt, args); |
| @@ -3360,8 +3365,8 @@ static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m, | |||
| 3360 | 3365 | ||
| 3361 | print_event_info(buf, m); | 3366 | print_event_info(buf, m); |
| 3362 | 3367 | ||
| 3363 | seq_printf(m, "# TASK-PID CPU# %s TIMESTAMP FUNCTION\n", tgid ? "TGID " : ""); | 3368 | seq_printf(m, "# TASK-PID %s CPU# TIMESTAMP FUNCTION\n", tgid ? "TGID " : ""); |
| 3364 | seq_printf(m, "# | | | %s | |\n", tgid ? " | " : ""); | 3369 | seq_printf(m, "# | | %s | | |\n", tgid ? " | " : ""); |
| 3365 | } | 3370 | } |
| 3366 | 3371 | ||
| 3367 | static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m, | 3372 | static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m, |
| @@ -3381,9 +3386,9 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file | |||
| 3381 | tgid ? tgid_space : space); | 3386 | tgid ? tgid_space : space); |
| 3382 | seq_printf(m, "# %s||| / delay\n", | 3387 | seq_printf(m, "# %s||| / delay\n", |
| 3383 | tgid ? tgid_space : space); | 3388 | tgid ? tgid_space : space); |
| 3384 | seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n", | 3389 | seq_printf(m, "# TASK-PID %sCPU# |||| TIMESTAMP FUNCTION\n", |
| 3385 | tgid ? " TGID " : space); | 3390 | tgid ? " TGID " : space); |
| 3386 | seq_printf(m, "# | | | %s|||| | |\n", | 3391 | seq_printf(m, "# | | %s | |||| | |\n", |
| 3387 | tgid ? " | " : space); | 3392 | tgid ? " | " : space); |
| 3388 | } | 3393 | } |
| 3389 | 3394 | ||
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 630c5a24b2b2..f8f86231ad90 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
| @@ -583,9 +583,7 @@ static __always_inline void trace_clear_recursion(int bit) | |||
| 583 | static inline struct ring_buffer_iter * | 583 | static inline struct ring_buffer_iter * |
| 584 | trace_buffer_iter(struct trace_iterator *iter, int cpu) | 584 | trace_buffer_iter(struct trace_iterator *iter, int cpu) |
| 585 | { | 585 | { |
| 586 | if (iter->buffer_iter && iter->buffer_iter[cpu]) | 586 | return iter->buffer_iter ? iter->buffer_iter[cpu] : NULL; |
| 587 | return iter->buffer_iter[cpu]; | ||
| 588 | return NULL; | ||
| 589 | } | 587 | } |
| 590 | 588 | ||
| 591 | int tracer_init(struct tracer *t, struct trace_array *tr); | 589 | int tracer_init(struct tracer *t, struct trace_array *tr); |
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 0dceb77d1d42..893a206bcba4 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
| @@ -1701,6 +1701,7 @@ static void create_filter_finish(struct filter_parse_error *pe) | |||
| 1701 | * @filter_str: filter string | 1701 | * @filter_str: filter string |
| 1702 | * @set_str: remember @filter_str and enable detailed error in filter | 1702 | * @set_str: remember @filter_str and enable detailed error in filter |
| 1703 | * @filterp: out param for created filter (always updated on return) | 1703 | * @filterp: out param for created filter (always updated on return) |
| 1704 | * Must be a pointer that references a NULL pointer. | ||
| 1704 | * | 1705 | * |
| 1705 | * Creates a filter for @call with @filter_str. If @set_str is %true, | 1706 | * Creates a filter for @call with @filter_str. If @set_str is %true, |
| 1706 | * @filter_str is copied and recorded in the new filter. | 1707 | * @filter_str is copied and recorded in the new filter. |
| @@ -1718,6 +1719,10 @@ static int create_filter(struct trace_event_call *call, | |||
| 1718 | struct filter_parse_error *pe = NULL; | 1719 | struct filter_parse_error *pe = NULL; |
| 1719 | int err; | 1720 | int err; |
| 1720 | 1721 | ||
| 1722 | /* filterp must point to NULL */ | ||
| 1723 | if (WARN_ON(*filterp)) | ||
| 1724 | *filterp = NULL; | ||
| 1725 | |||
| 1721 | err = create_filter_start(filter_string, set_str, &pe, filterp); | 1726 | err = create_filter_start(filter_string, set_str, &pe, filterp); |
| 1722 | if (err) | 1727 | if (err) |
| 1723 | return err; | 1728 | return err; |
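
The strengthened precondition is easy to miss from the doc comment alone; a minimal caller sketch, with a placeholder filter string:

	struct event_filter *filter = NULL;	/* must reference NULL on entry */
	int err;

	err = create_filter(call, "common_pid != 0", true, &filter);
	/*
	 * *filterp is always updated on return, success or failure, so the
	 * caller owns (and eventually frees) whatever it points to now.
	 */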
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 046c716a6536..aae18af94c94 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c | |||
| @@ -393,7 +393,7 @@ static void hist_err_event(char *str, char *system, char *event, char *var) | |||
| 393 | else if (system) | 393 | else if (system) |
| 394 | snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event); | 394 | snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event); |
| 395 | else | 395 | else |
| 396 | strncpy(err, var, MAX_FILTER_STR_VAL); | 396 | strscpy(err, var, MAX_FILTER_STR_VAL); |
| 397 | 397 | ||
| 398 | hist_err(str, err); | 398 | hist_err(str, err); |
| 399 | } | 399 | } |
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 23c0b0cb5fb9..169b3c44ee97 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
| @@ -831,6 +831,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, | |||
| 831 | struct ftrace_graph_ret *graph_ret; | 831 | struct ftrace_graph_ret *graph_ret; |
| 832 | struct ftrace_graph_ent *call; | 832 | struct ftrace_graph_ent *call; |
| 833 | unsigned long long duration; | 833 | unsigned long long duration; |
| 834 | int cpu = iter->cpu; | ||
| 834 | int i; | 835 | int i; |
| 835 | 836 | ||
| 836 | graph_ret = &ret_entry->ret; | 837 | graph_ret = &ret_entry->ret; |
| @@ -839,7 +840,6 @@ print_graph_entry_leaf(struct trace_iterator *iter, | |||
| 839 | 840 | ||
| 840 | if (data) { | 841 | if (data) { |
| 841 | struct fgraph_cpu_data *cpu_data; | 842 | struct fgraph_cpu_data *cpu_data; |
| 842 | int cpu = iter->cpu; | ||
| 843 | 843 | ||
| 844 | cpu_data = per_cpu_ptr(data->cpu_data, cpu); | 844 | cpu_data = per_cpu_ptr(data->cpu_data, cpu); |
| 845 | 845 | ||
| @@ -869,6 +869,9 @@ print_graph_entry_leaf(struct trace_iterator *iter, | |||
| 869 | 869 | ||
| 870 | trace_seq_printf(s, "%ps();\n", (void *)call->func); | 870 | trace_seq_printf(s, "%ps();\n", (void *)call->func); |
| 871 | 871 | ||
| 872 | print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET, | ||
| 873 | cpu, iter->ent->pid, flags); | ||
| 874 | |||
| 872 | return trace_handle_return(s); | 875 | return trace_handle_return(s); |
| 873 | } | 876 | } |
| 874 | 877 | ||
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index daa81571b22a..21f718472942 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
| @@ -1480,8 +1480,10 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, | |||
| 1480 | } | 1480 | } |
| 1481 | 1481 | ||
| 1482 | ret = __register_trace_kprobe(tk); | 1482 | ret = __register_trace_kprobe(tk); |
| 1483 | if (ret < 0) | 1483 | if (ret < 0) { |
| 1484 | kfree(tk->tp.call.print_fmt); | ||
| 1484 | goto error; | 1485 | goto error; |
| 1486 | } | ||
| 1485 | 1487 | ||
| 1486 | return &tk->tp.call; | 1488 | return &tk->tp.call; |
| 1487 | error: | 1489 | error: |
| @@ -1501,6 +1503,8 @@ void destroy_local_trace_kprobe(struct trace_event_call *event_call) | |||
| 1501 | } | 1503 | } |
| 1502 | 1504 | ||
| 1503 | __unregister_trace_kprobe(tk); | 1505 | __unregister_trace_kprobe(tk); |
| 1506 | |||
| 1507 | kfree(tk->tp.call.print_fmt); | ||
| 1504 | free_trace_kprobe(tk); | 1508 | free_trace_kprobe(tk); |
| 1505 | } | 1509 | } |
| 1506 | #endif /* CONFIG_PERF_EVENTS */ | 1510 | #endif /* CONFIG_PERF_EVENTS */ |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 90db994ac900..1c8e30fda46a 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
| @@ -594,8 +594,7 @@ int trace_print_context(struct trace_iterator *iter) | |||
| 594 | 594 | ||
| 595 | trace_find_cmdline(entry->pid, comm); | 595 | trace_find_cmdline(entry->pid, comm); |
| 596 | 596 | ||
| 597 | trace_seq_printf(s, "%16s-%-5d [%03d] ", | 597 | trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); |
| 598 | comm, entry->pid, iter->cpu); | ||
| 599 | 598 | ||
| 600 | if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { | 599 | if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { |
| 601 | unsigned int tgid = trace_find_tgid(entry->pid); | 600 | unsigned int tgid = trace_find_tgid(entry->pid); |
| @@ -606,6 +605,8 @@ int trace_print_context(struct trace_iterator *iter) | |||
| 606 | trace_seq_printf(s, "(%5d) ", tgid); | 605 | trace_seq_printf(s, "(%5d) ", tgid); |
| 607 | } | 606 | } |
| 608 | 607 | ||
| 608 | trace_seq_printf(s, "[%03d] ", iter->cpu); | ||
| 609 | |||
| 609 | if (tr->trace_flags & TRACE_ITER_IRQ_INFO) | 610 | if (tr->trace_flags & TRACE_ITER_IRQ_INFO) |
| 610 | trace_print_lat_fmt(s, entry); | 611 | trace_print_lat_fmt(s, entry); |
| 611 | 612 | ||
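
With TRACE_ITER_RECORD_TGID set, the TGID column now sits between the PID and CPU columns, matching the reordered headers in the trace.c hunks above. An illustrative line (spacing approximate, not captured output):

	#           TASK-PID    TGID   CPU#   TIMESTAMP  FUNCTION
	#              | |        |      |       |          |
	            bash-1977  ( 1977) [000]  17.473832: sched_switch: ...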
