diff options
author | Stanislav Fomichev <sdf@google.com> | 2019-08-14 13:37:49 -0400 |
---|---|---|
committer | Daniel Borkmann <daniel@iogearbox.net> | 2019-08-17 17:18:54 -0400 |
commit | 8f51dfc73bf181f2304e1498f55d5f452e060cbe (patch) | |
tree | 1b3877adff06cd12dbeac725521cef44d2ebf7d7 /net/core | |
parent | b0e4701ce15d0381cdea0643c7f0a35dc529cec2 (diff) |
bpf: support cloning sk storage on accept()
Add a new helper, bpf_sk_storage_clone(), which optionally clones sk
storage, and call it from sk_clone_lock().
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Yonghong Song <yhs@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/bpf_sk_storage.c | 104 | ||||
-rw-r--r-- | net/core/sock.c | 9 |
2 files changed, 107 insertions, 6 deletions
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 94c7f77ecb6b..da5639a5bd3b 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c | |||
@@ -12,6 +12,9 @@ | |||
12 | 12 | ||
13 | static atomic_t cache_idx; | 13 | static atomic_t cache_idx; |
14 | 14 | ||
15 | #define SK_STORAGE_CREATE_FLAG_MASK \ | ||
16 | (BPF_F_NO_PREALLOC | BPF_F_CLONE) | ||
17 | |||
15 | struct bucket { | 18 | struct bucket { |
16 | struct hlist_head list; | 19 | struct hlist_head list; |
17 | raw_spinlock_t lock; | 20 | raw_spinlock_t lock; |
@@ -209,7 +212,6 @@ static void selem_unlink_sk(struct bpf_sk_storage_elem *selem) | |||
209 | kfree_rcu(sk_storage, rcu); | 212 | kfree_rcu(sk_storage, rcu); |
210 | } | 213 | } |
211 | 214 | ||
212 | /* sk_storage->lock must be held and sk_storage->list cannot be empty */ | ||
213 | static void __selem_link_sk(struct bpf_sk_storage *sk_storage, | 215 | static void __selem_link_sk(struct bpf_sk_storage *sk_storage, |
214 | struct bpf_sk_storage_elem *selem) | 216 | struct bpf_sk_storage_elem *selem) |
215 | { | 217 | { |
@@ -509,7 +511,7 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map) | |||
509 | return 0; | 511 | return 0; |
510 | } | 512 | } |
511 | 513 | ||
512 | /* Called by __sk_destruct() */ | 514 | /* Called by __sk_destruct() & bpf_sk_storage_clone() */ |
513 | void bpf_sk_storage_free(struct sock *sk) | 515 | void bpf_sk_storage_free(struct sock *sk) |
514 | { | 516 | { |
515 | struct bpf_sk_storage_elem *selem; | 517 | struct bpf_sk_storage_elem *selem; |
@@ -557,6 +559,11 @@ static void bpf_sk_storage_map_free(struct bpf_map *map) | |||
557 | 559 | ||
558 | smap = (struct bpf_sk_storage_map *)map; | 560 | smap = (struct bpf_sk_storage_map *)map; |
559 | 561 | ||
562 | /* Note that this map might be concurrently cloned from | ||
563 | * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone | ||
564 | * RCU read section to finish before proceeding. New RCU | ||
565 | * read sections should be prevented via bpf_map_inc_not_zero. | ||
566 | */ | ||
560 | synchronize_rcu(); | 567 | synchronize_rcu(); |
561 | 568 | ||
562 | /* bpf prog and the userspace can no longer access this map | 569 | /* bpf prog and the userspace can no longer access this map |
@@ -601,7 +608,9 @@ static void bpf_sk_storage_map_free(struct bpf_map *map) | |||
601 | 608 | ||
602 | static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) | 609 | static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) |
603 | { | 610 | { |
604 | if (attr->map_flags != BPF_F_NO_PREALLOC || attr->max_entries || | 611 | if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK || |
612 | !(attr->map_flags & BPF_F_NO_PREALLOC) || | ||
613 | attr->max_entries || | ||
605 | attr->key_size != sizeof(int) || !attr->value_size || | 614 | attr->key_size != sizeof(int) || !attr->value_size || |
606 | /* Enforce BTF for userspace sk dumping */ | 615 | /* Enforce BTF for userspace sk dumping */ |
607 | !attr->btf_key_type_id || !attr->btf_value_type_id) | 616 | !attr->btf_key_type_id || !attr->btf_value_type_id) |
@@ -739,6 +748,95 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) | |||
739 | return err; | 748 | return err; |
740 | } | 749 | } |
741 | 750 | ||
751 | static struct bpf_sk_storage_elem * | ||
752 | bpf_sk_storage_clone_elem(struct sock *newsk, | ||
753 | struct bpf_sk_storage_map *smap, | ||
754 | struct bpf_sk_storage_elem *selem) | ||
755 | { | ||
756 | struct bpf_sk_storage_elem *copy_selem; | ||
757 | |||
758 | copy_selem = selem_alloc(smap, newsk, NULL, true); | ||
759 | if (!copy_selem) | ||
760 | return NULL; | ||
761 | |||
762 | if (map_value_has_spin_lock(&smap->map)) | ||
763 | copy_map_value_locked(&smap->map, SDATA(copy_selem)->data, | ||
764 | SDATA(selem)->data, true); | ||
765 | else | ||
766 | copy_map_value(&smap->map, SDATA(copy_selem)->data, | ||
767 | SDATA(selem)->data); | ||
768 | |||
769 | return copy_selem; | ||
770 | } | ||
771 | |||
772 | int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) | ||
773 | { | ||
774 | struct bpf_sk_storage *new_sk_storage = NULL; | ||
775 | struct bpf_sk_storage *sk_storage; | ||
776 | struct bpf_sk_storage_elem *selem; | ||
777 | int ret = 0; | ||
778 | |||
779 | RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); | ||
780 | |||
781 | rcu_read_lock(); | ||
782 | sk_storage = rcu_dereference(sk->sk_bpf_storage); | ||
783 | |||
784 | if (!sk_storage || hlist_empty(&sk_storage->list)) | ||
785 | goto out; | ||
786 | |||
787 | hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { | ||
788 | struct bpf_sk_storage_elem *copy_selem; | ||
789 | struct bpf_sk_storage_map *smap; | ||
790 | struct bpf_map *map; | ||
791 | |||
792 | smap = rcu_dereference(SDATA(selem)->smap); | ||
793 | if (!(smap->map.map_flags & BPF_F_CLONE)) | ||
794 | continue; | ||
795 | |||
796 | /* Note that for lockless listeners adding new element | ||
797 | * here can race with cleanup in bpf_sk_storage_map_free. | ||
798 | * Try to grab map refcnt to make sure that it's still | ||
799 | * alive and prevent concurrent removal. | ||
800 | */ | ||
801 | map = bpf_map_inc_not_zero(&smap->map, false); | ||
802 | if (IS_ERR(map)) | ||
803 | continue; | ||
804 | |||
805 | copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem); | ||
806 | if (!copy_selem) { | ||
807 | ret = -ENOMEM; | ||
808 | bpf_map_put(map); | ||
809 | goto out; | ||
810 | } | ||
811 | |||
812 | if (new_sk_storage) { | ||
813 | selem_link_map(smap, copy_selem); | ||
814 | __selem_link_sk(new_sk_storage, copy_selem); | ||
815 | } else { | ||
816 | ret = sk_storage_alloc(newsk, smap, copy_selem); | ||
817 | if (ret) { | ||
818 | kfree(copy_selem); | ||
819 | atomic_sub(smap->elem_size, | ||
820 | &newsk->sk_omem_alloc); | ||
821 | bpf_map_put(map); | ||
822 | goto out; | ||
823 | } | ||
824 | |||
825 | new_sk_storage = rcu_dereference(copy_selem->sk_storage); | ||
826 | } | ||
827 | bpf_map_put(map); | ||
828 | } | ||
829 | |||
830 | out: | ||
831 | rcu_read_unlock(); | ||
832 | |||
833 | /* In case of an error, don't free anything explicitly here; the | ||
834 | * caller is responsible for calling bpf_sk_storage_free. | ||
835 | */ | ||
836 | |||
837 | return ret; | ||
838 | } | ||
839 | |||
742 | BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, | 840 | BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, |
743 | void *, value, u64, flags) | 841 | void *, value, u64, flags) |
744 | { | 842 | { |
diff --git a/net/core/sock.c b/net/core/sock.c index d57b0cc995a0..f5e801a9cea4 100644 --- a/net/core/sock.c +++ b/net/core/sock.c | |||
@@ -1851,9 +1851,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) | |||
1851 | goto out; | 1851 | goto out; |
1852 | } | 1852 | } |
1853 | RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); | 1853 | RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); |
1854 | #ifdef CONFIG_BPF_SYSCALL | 1854 | |
1855 | RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); | 1855 | if (bpf_sk_storage_clone(sk, newsk)) { |
1856 | #endif | 1856 | sk_free_unlock_clone(newsk); |
1857 | newsk = NULL; | ||
1858 | goto out; | ||
1859 | } | ||
1857 | 1860 | ||
1858 | newsk->sk_err = 0; | 1861 | newsk->sk_err = 0; |
1859 | newsk->sk_err_soft = 0; | 1862 | newsk->sk_err_soft = 0; |