 include/linux/bpf.h          |   2 +
 include/linux/bpf_types.h    |   1 +
 include/net/bpf_sk_storage.h |  13 ++
 include/net/sock.h           |   5 +
 include/uapi/linux/bpf.h     |  44 ++++-
 kernel/bpf/syscall.c         |   3 +-
 kernel/bpf/verifier.c        |  27 ++++-
 net/bpf/test_run.c           |   2 +
 net/core/Makefile            |   1 +
 net/core/bpf_sk_storage.c    | 804 ++++++++++++++++++++++++++++++++++++++++++
 net/core/filter.c            |  12 ++
 net/core/sock.c              |   5 +
 12 files changed, 914 insertions(+), 5 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cd6341eabd74..9a21848fdb07 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -184,6 +184,7 @@ enum bpf_arg_type { | |||
184 | ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ | 184 | ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ |
185 | ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ | 185 | ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ |
186 | ARG_PTR_TO_UNINIT_MAP_VALUE, /* pointer to valid memory used to store a map value */ | 186 | ARG_PTR_TO_UNINIT_MAP_VALUE, /* pointer to valid memory used to store a map value */ |
187 | ARG_PTR_TO_MAP_VALUE_OR_NULL, /* pointer to stack used as map value or NULL */ | ||
187 | 188 | ||
188 | /* the following constraints used to prototype bpf_memcmp() and other | 189 | /* the following constraints used to prototype bpf_memcmp() and other |
189 | * functions that access data on eBPF program stack | 190 | * functions that access data on eBPF program stack |
@@ -204,6 +205,7 @@ enum bpf_arg_type { | |||
204 | ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ | 205 | ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ |
205 | ARG_PTR_TO_INT, /* pointer to int */ | 206 | ARG_PTR_TO_INT, /* pointer to int */ |
206 | ARG_PTR_TO_LONG, /* pointer to long */ | 207 | ARG_PTR_TO_LONG, /* pointer to long */ |
208 | ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ | ||
207 | }; | 209 | }; |
208 | 210 | ||
209 | /* type of values returned from helper functions */ | 211 | /* type of values returned from helper functions */ |
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index a10d37bce364..5a9975678d6f 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -61,6 +61,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) | |||
61 | BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) | 61 | BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) |
62 | #ifdef CONFIG_NET | 62 | #ifdef CONFIG_NET |
63 | BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) | 63 | BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) |
64 | BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops) | ||
64 | #if defined(CONFIG_BPF_STREAM_PARSER) | 65 | #if defined(CONFIG_BPF_STREAM_PARSER) |
65 | BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) | 66 | BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) |
66 | BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) | 67 | BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) |
diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
new file mode 100644
index 000000000000..b9dcb02e756b
--- /dev/null
+++ b/include/net/bpf_sk_storage.h
@@ -0,0 +1,13 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
2 | /* Copyright (c) 2019 Facebook */ | ||
3 | #ifndef _BPF_SK_STORAGE_H | ||
4 | #define _BPF_SK_STORAGE_H | ||
5 | |||
6 | struct sock; | ||
7 | |||
8 | void bpf_sk_storage_free(struct sock *sk); | ||
9 | |||
10 | extern const struct bpf_func_proto bpf_sk_storage_get_proto; | ||
11 | extern const struct bpf_func_proto bpf_sk_storage_delete_proto; | ||
12 | |||
13 | #endif /* _BPF_SK_STORAGE_H */ | ||
diff --git a/include/net/sock.h b/include/net/sock.h
index 784cd19d5ff7..4d208c0f9c14 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -236,6 +236,8 @@ struct sock_common { | |||
236 | /* public: */ | 236 | /* public: */ |
237 | }; | 237 | }; |
238 | 238 | ||
239 | struct bpf_sk_storage; | ||
240 | |||
239 | /** | 241 | /** |
240 | * struct sock - network layer representation of sockets | 242 | * struct sock - network layer representation of sockets |
241 | * @__sk_common: shared layout with inet_timewait_sock | 243 | * @__sk_common: shared layout with inet_timewait_sock |
@@ -510,6 +512,9 @@ struct sock { | |||
510 | #endif | 512 | #endif |
511 | void (*sk_destruct)(struct sock *sk); | 513 | void (*sk_destruct)(struct sock *sk); |
512 | struct sock_reuseport __rcu *sk_reuseport_cb; | 514 | struct sock_reuseport __rcu *sk_reuseport_cb; |
515 | #ifdef CONFIG_BPF_SYSCALL | ||
516 | struct bpf_sk_storage __rcu *sk_bpf_storage; | ||
517 | #endif | ||
513 | struct rcu_head sk_rcu; | 518 | struct rcu_head sk_rcu; |
514 | }; | 519 | }; |
515 | 520 | ||
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f7fa7a34a62d..72336bac7573 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -133,6 +133,7 @@ enum bpf_map_type { | |||
133 | BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, | 133 | BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, |
134 | BPF_MAP_TYPE_QUEUE, | 134 | BPF_MAP_TYPE_QUEUE, |
135 | BPF_MAP_TYPE_STACK, | 135 | BPF_MAP_TYPE_STACK, |
136 | BPF_MAP_TYPE_SK_STORAGE, | ||
136 | }; | 137 | }; |
137 | 138 | ||
138 | /* Note that tracing related programs such as | 139 | /* Note that tracing related programs such as |
@@ -2630,6 +2631,42 @@ union bpf_attr { | |||
2630 | * was provided. | 2631 | * was provided. |
2631 | * | 2632 | * |
2632 | * **-ERANGE** if resulting value was out of range. | 2633 | * **-ERANGE** if resulting value was out of range. |
2634 | * | ||
2635 | * void *bpf_sk_storage_get(struct bpf_map *map, struct bpf_sock *sk, void *value, u64 flags) | ||
2636 | * Description | ||
2637 | * Get a bpf-local-storage from a sk. | ||
2638 | * | ||
2639 | * Logically, it could be thought of as getting the value from | ||
2640 | * a *map* with *sk* as the **key**. From this | ||
2641 | * perspective, the usage is not much different from | ||
2642 | * **bpf_map_lookup_elem(map, &sk)** except that this | ||
2643 | * helper enforces that the key must be a full socket | ||
2644 | * and the map must be of type **BPF_MAP_TYPE_SK_STORAGE**. | ||
2645 | * | ||
2646 | * Underneath, the value is stored locally at *sk* instead of | ||
2647 | * the map. The *map* is used as the bpf-local-storage **type**. | ||
2648 | * The bpf-local-storage **type** (i.e. the *map*) is searched | ||
2649 | * against all bpf-local-storages residing at sk. | ||
2650 | * | ||
2651 | * An optional *flags* (BPF_SK_STORAGE_GET_F_CREATE) can be | ||
2652 | * used such that a new bpf-local-storage will be | ||
2653 | * created if one does not exist. *value* can be used | ||
2654 | * together with BPF_SK_STORAGE_GET_F_CREATE to specify | ||
2655 | * the initial value of a bpf-local-storage. If *value* is | ||
2656 | * NULL, the new bpf-local-storage will be zero initialized. | ||
2657 | * Return | ||
2658 | * A bpf-local-storage pointer is returned on success. | ||
2659 | * | ||
2660 | * **NULL** if not found or there was an error in adding | ||
2661 | * a new bpf-local-storage. | ||
2662 | * | ||
2663 | * int bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) | ||
2664 | * Description | ||
2665 | * Delete a bpf-local-storage from a sk. | ||
2666 | * Return | ||
2667 | * 0 on success. | ||
2668 | * | ||
2669 | * **-ENOENT** if the bpf-local-storage cannot be found. | ||
2633 | */ | 2670 | */ |
2634 | #define __BPF_FUNC_MAPPER(FN) \ | 2671 | #define __BPF_FUNC_MAPPER(FN) \ |
2635 | FN(unspec), \ | 2672 | FN(unspec), \ |
@@ -2738,7 +2775,9 @@ union bpf_attr { | |||
2738 | FN(sysctl_get_new_value), \ | 2775 | FN(sysctl_get_new_value), \ |
2739 | FN(sysctl_set_new_value), \ | 2776 | FN(sysctl_set_new_value), \ |
2740 | FN(strtol), \ | 2777 | FN(strtol), \ |
2741 | FN(strtoul), | 2778 | FN(strtoul), \ |
2779 | FN(sk_storage_get), \ | ||
2780 | FN(sk_storage_delete), | ||
2742 | 2781 | ||
2743 | /* integer value in 'imm' field of BPF_CALL instruction selects which helper | 2782 | /* integer value in 'imm' field of BPF_CALL instruction selects which helper |
2744 | * function eBPF program intends to call | 2783 | * function eBPF program intends to call |
@@ -2814,6 +2853,9 @@ enum bpf_func_id { | |||
2814 | /* BPF_FUNC_sysctl_get_name flags. */ | 2853 | /* BPF_FUNC_sysctl_get_name flags. */ |
2815 | #define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) | 2854 | #define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) |
2816 | 2855 | ||
2856 | /* BPF_FUNC_sk_storage_get flags */ | ||
2857 | #define BPF_SK_STORAGE_GET_F_CREATE (1ULL << 0) | ||
2858 | |||
2817 | /* Mode for BPF_FUNC_skb_adjust_room helper. */ | 2859 | /* Mode for BPF_FUNC_skb_adjust_room helper. */ |
2818 | enum bpf_adj_room_mode { | 2860 | enum bpf_adj_room_mode { |
2819 | BPF_ADJ_ROOM_NET, | 2861 | BPF_ADJ_ROOM_NET, |
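
For illustration, a minimal sketch of a cgroup/skb program using the two new helpers. It assumes a libbpf-style bpf_helpers.h that declares bpf_sk_fullsock()/bpf_sk_storage_get(), BTF-defined map support, and uapi headers that already carry this patch; the map name, value struct, and section name are illustrative only, not part of this patch:

/* Sketch: count per-socket egress traffic in a BPF_MAP_TYPE_SK_STORAGE map.
 * All names below (sk_stats, struct pkt_stats, count_egress) are examples.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct pkt_stats {
	__u64 pkts;
	__u64 bytes;
};

struct {
	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);	/* required by map_alloc_check() */
	__type(key, int);			/* key_size must be sizeof(int) */
	__type(value, struct pkt_stats);	/* BTF value type is mandatory */
} sk_stats SEC(".maps");

SEC("cgroup_skb/egress")
int count_egress(struct __sk_buff *skb)
{
	struct bpf_sock *sk = skb->sk;
	struct pkt_stats *stats;

	if (!sk)
		return 1;

	/* ARG_PTR_TO_SOCKET: the helper only accepts a fullsock */
	sk = bpf_sk_fullsock(sk);
	if (!sk)
		return 1;

	/* Create the storage on first use; a NULL value means the new
	 * bpf-local-storage is zero initialized.
	 */
	stats = bpf_sk_storage_get(&sk_stats, sk, NULL,
				   BPF_SK_STORAGE_GET_F_CREATE);
	if (!stats)
		return 1;

	stats->pkts++;
	stats->bytes += skb->len;
	return 1;
}

char _license[] SEC("license") = "GPL";
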
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ae141e745f92..ad3ccf82f31d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -526,7 +526,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, | |||
526 | return -EACCES; | 526 | return -EACCES; |
527 | if (map->map_type != BPF_MAP_TYPE_HASH && | 527 | if (map->map_type != BPF_MAP_TYPE_HASH && |
528 | map->map_type != BPF_MAP_TYPE_ARRAY && | 528 | map->map_type != BPF_MAP_TYPE_ARRAY && |
529 | map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) | 529 | map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && |
530 | map->map_type != BPF_MAP_TYPE_SK_STORAGE) | ||
530 | return -ENOTSUPP; | 531 | return -ENOTSUPP; |
531 | if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > | 532 | if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > |
532 | map->value_size) { | 533 | map->value_size) { |
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2ef442c62c0e..271717246af3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2543,10 +2543,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, | |||
2543 | 2543 | ||
2544 | if (arg_type == ARG_PTR_TO_MAP_KEY || | 2544 | if (arg_type == ARG_PTR_TO_MAP_KEY || |
2545 | arg_type == ARG_PTR_TO_MAP_VALUE || | 2545 | arg_type == ARG_PTR_TO_MAP_VALUE || |
2546 | arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { | 2546 | arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || |
2547 | arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { | ||
2547 | expected_type = PTR_TO_STACK; | 2548 | expected_type = PTR_TO_STACK; |
2548 | if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && | 2549 | if (register_is_null(reg) && |
2549 | type != expected_type) | 2550 | arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) |
2551 | /* final test in check_stack_boundary() */; | ||
2552 | else if (!type_is_pkt_pointer(type) && | ||
2553 | type != PTR_TO_MAP_VALUE && | ||
2554 | type != expected_type) | ||
2550 | goto err_type; | 2555 | goto err_type; |
2551 | } else if (arg_type == ARG_CONST_SIZE || | 2556 | } else if (arg_type == ARG_CONST_SIZE || |
2552 | arg_type == ARG_CONST_SIZE_OR_ZERO) { | 2557 | arg_type == ARG_CONST_SIZE_OR_ZERO) { |
@@ -2578,6 +2583,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, | |||
2578 | } | 2583 | } |
2579 | meta->ref_obj_id = reg->ref_obj_id; | 2584 | meta->ref_obj_id = reg->ref_obj_id; |
2580 | } | 2585 | } |
2586 | } else if (arg_type == ARG_PTR_TO_SOCKET) { | ||
2587 | expected_type = PTR_TO_SOCKET; | ||
2588 | if (type != expected_type) | ||
2589 | goto err_type; | ||
2581 | } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { | 2590 | } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { |
2582 | if (meta->func_id == BPF_FUNC_spin_lock) { | 2591 | if (meta->func_id == BPF_FUNC_spin_lock) { |
2583 | if (process_spin_lock(env, regno, true)) | 2592 | if (process_spin_lock(env, regno, true)) |
@@ -2635,6 +2644,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, | |||
2635 | meta->map_ptr->key_size, false, | 2644 | meta->map_ptr->key_size, false, |
2636 | NULL); | 2645 | NULL); |
2637 | } else if (arg_type == ARG_PTR_TO_MAP_VALUE || | 2646 | } else if (arg_type == ARG_PTR_TO_MAP_VALUE || |
2647 | (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL && | ||
2648 | !register_is_null(reg)) || | ||
2638 | arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { | 2649 | arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { |
2639 | /* bpf_map_xxx(..., map_ptr, ..., value) call: | 2650 | /* bpf_map_xxx(..., map_ptr, ..., value) call: |
2640 | * check [value, value + map->value_size) validity | 2651 | * check [value, value + map->value_size) validity |
@@ -2784,6 +2795,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, | |||
2784 | func_id != BPF_FUNC_map_push_elem) | 2795 | func_id != BPF_FUNC_map_push_elem) |
2785 | goto error; | 2796 | goto error; |
2786 | break; | 2797 | break; |
2798 | case BPF_MAP_TYPE_SK_STORAGE: | ||
2799 | if (func_id != BPF_FUNC_sk_storage_get && | ||
2800 | func_id != BPF_FUNC_sk_storage_delete) | ||
2801 | goto error; | ||
2802 | break; | ||
2787 | default: | 2803 | default: |
2788 | break; | 2804 | break; |
2789 | } | 2805 | } |
@@ -2847,6 +2863,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, | |||
2847 | map->map_type != BPF_MAP_TYPE_STACK) | 2863 | map->map_type != BPF_MAP_TYPE_STACK) |
2848 | goto error; | 2864 | goto error; |
2849 | break; | 2865 | break; |
2866 | case BPF_FUNC_sk_storage_get: | ||
2867 | case BPF_FUNC_sk_storage_delete: | ||
2868 | if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) | ||
2869 | goto error; | ||
2870 | break; | ||
2850 | default: | 2871 | default: |
2851 | break; | 2872 | break; |
2852 | } | 2873 | } |
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 6c4694ae4241..33e0dc168c16 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/etherdevice.h> | 10 | #include <linux/etherdevice.h> |
11 | #include <linux/filter.h> | 11 | #include <linux/filter.h> |
12 | #include <linux/sched/signal.h> | 12 | #include <linux/sched/signal.h> |
13 | #include <net/bpf_sk_storage.h> | ||
13 | #include <net/sock.h> | 14 | #include <net/sock.h> |
14 | #include <net/tcp.h> | 15 | #include <net/tcp.h> |
15 | 16 | ||
@@ -335,6 +336,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, | |||
335 | sizeof(struct __sk_buff)); | 336 | sizeof(struct __sk_buff)); |
336 | out: | 337 | out: |
337 | kfree_skb(skb); | 338 | kfree_skb(skb); |
339 | bpf_sk_storage_free(sk); | ||
338 | kfree(sk); | 340 | kfree(sk); |
339 | kfree(ctx); | 341 | kfree(ctx); |
340 | return ret; | 342 | return ret; |
diff --git a/net/core/Makefile b/net/core/Makefile
index f97d6254e564..a104dc8faafc 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -34,3 +34,4 @@ obj-$(CONFIG_HWBM) += hwbm.o | |||
34 | obj-$(CONFIG_NET_DEVLINK) += devlink.o | 34 | obj-$(CONFIG_NET_DEVLINK) += devlink.o |
35 | obj-$(CONFIG_GRO_CELLS) += gro_cells.o | 35 | obj-$(CONFIG_GRO_CELLS) += gro_cells.o |
36 | obj-$(CONFIG_FAILOVER) += failover.o | 36 | obj-$(CONFIG_FAILOVER) += failover.o |
37 | obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o | ||
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
new file mode 100644
index 000000000000..a8e9ac71b22d
--- /dev/null
+++ b/net/core/bpf_sk_storage.c
@@ -0,0 +1,804 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* Copyright (c) 2019 Facebook */ | ||
3 | #include <linux/rculist.h> | ||
4 | #include <linux/list.h> | ||
5 | #include <linux/hash.h> | ||
6 | #include <linux/types.h> | ||
7 | #include <linux/spinlock.h> | ||
8 | #include <linux/bpf.h> | ||
9 | #include <net/bpf_sk_storage.h> | ||
10 | #include <net/sock.h> | ||
11 | #include <uapi/linux/btf.h> | ||
12 | |||
13 | static atomic_t cache_idx; | ||
14 | |||
15 | struct bucket { | ||
16 | struct hlist_head list; | ||
17 | raw_spinlock_t lock; | ||
18 | }; | ||
19 | |||
20 | /* The map is not the primary owner of a bpf_sk_storage_elem. | ||
21 | * Instead, the sk->sk_bpf_storage is. | ||
22 | * | ||
23 | * The map (bpf_sk_storage_map) is for two purposes | ||
24 | * 1. Define the size of the "sk local storage". It is | ||
25 | * the map's value_size. | ||
26 | * | ||
27 | * 2. Maintain a list to keep track of all elems such | ||
28 | * that they can be cleaned up during the map destruction. | ||
29 | * | ||
30 | * When a bpf local storage is being looked up for a | ||
31 | * particular sk, the "bpf_map" pointer is actually used | ||
32 | * as the "key" to search in the list of elem in | ||
33 | * sk->sk_bpf_storage. | ||
34 | * | ||
35 | * Hence, consider sk->sk_bpf_storage is the mini-map | ||
36 | * with the "bpf_map" pointer as the searching key. | ||
37 | */ | ||
38 | struct bpf_sk_storage_map { | ||
39 | struct bpf_map map; | ||
40 | /* Lookup elem does not require accessing the map. | ||
41 | * | ||
42 | * Updating/Deleting requires a bucket lock to | ||
43 | * link/unlink the elem from the map. Multiple | ||
44 | * buckets are used to reduce lock contention. | ||
45 | */ | ||
46 | struct bucket *buckets; | ||
47 | u32 bucket_log; | ||
48 | u16 elem_size; | ||
49 | u16 cache_idx; | ||
50 | }; | ||
51 | |||
52 | struct bpf_sk_storage_data { | ||
53 | /* smap is used as the searching key when looking up | ||
54 | * from sk->sk_bpf_storage. | ||
55 | * | ||
56 | * Put it in the same cacheline as the data to minimize | ||
57 | * the number of cacheline accesses during the cache hit case. | ||
58 | */ | ||
59 | struct bpf_sk_storage_map __rcu *smap; | ||
60 | u8 data[0] __aligned(8); | ||
61 | }; | ||
62 | |||
63 | /* Linked to bpf_sk_storage and bpf_sk_storage_map */ | ||
64 | struct bpf_sk_storage_elem { | ||
65 | struct hlist_node map_node; /* Linked to bpf_sk_storage_map */ | ||
66 | struct hlist_node snode; /* Linked to bpf_sk_storage */ | ||
67 | struct bpf_sk_storage __rcu *sk_storage; | ||
68 | struct rcu_head rcu; | ||
69 | /* 8 bytes hole */ | ||
70 | /* The data is stored in another cacheline to minimize | ||
71 | * the number of cacheline accesses during a cache hit. | ||
72 | */ | ||
73 | struct bpf_sk_storage_data sdata ____cacheline_aligned; | ||
74 | }; | ||
75 | |||
76 | #define SELEM(_SDATA) container_of((_SDATA), struct bpf_sk_storage_elem, sdata) | ||
77 | #define SDATA(_SELEM) (&(_SELEM)->sdata) | ||
78 | #define BPF_SK_STORAGE_CACHE_SIZE 16 | ||
79 | |||
80 | struct bpf_sk_storage { | ||
81 | struct bpf_sk_storage_data __rcu *cache[BPF_SK_STORAGE_CACHE_SIZE]; | ||
82 | struct hlist_head list; /* List of bpf_sk_storage_elem */ | ||
83 | struct sock *sk; /* The sk that owns the above "list" of | ||
84 | * bpf_sk_storage_elem. | ||
85 | */ | ||
86 | struct rcu_head rcu; | ||
87 | raw_spinlock_t lock; /* Protect adding/removing from the "list" */ | ||
88 | }; | ||
89 | |||
90 | static struct bucket *select_bucket(struct bpf_sk_storage_map *smap, | ||
91 | struct bpf_sk_storage_elem *selem) | ||
92 | { | ||
93 | return &smap->buckets[hash_ptr(selem, smap->bucket_log)]; | ||
94 | } | ||
95 | |||
96 | static int omem_charge(struct sock *sk, unsigned int size) | ||
97 | { | ||
98 | /* same check as in sock_kmalloc() */ | ||
99 | if (size <= sysctl_optmem_max && | ||
100 | atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { | ||
101 | atomic_add(size, &sk->sk_omem_alloc); | ||
102 | return 0; | ||
103 | } | ||
104 | |||
105 | return -ENOMEM; | ||
106 | } | ||
107 | |||
108 | static bool selem_linked_to_sk(const struct bpf_sk_storage_elem *selem) | ||
109 | { | ||
110 | return !hlist_unhashed(&selem->snode); | ||
111 | } | ||
112 | |||
113 | static bool selem_linked_to_map(const struct bpf_sk_storage_elem *selem) | ||
114 | { | ||
115 | return !hlist_unhashed(&selem->map_node); | ||
116 | } | ||
117 | |||
118 | static struct bpf_sk_storage_elem *selem_alloc(struct bpf_sk_storage_map *smap, | ||
119 | struct sock *sk, void *value, | ||
120 | bool charge_omem) | ||
121 | { | ||
122 | struct bpf_sk_storage_elem *selem; | ||
123 | |||
124 | if (charge_omem && omem_charge(sk, smap->elem_size)) | ||
125 | return NULL; | ||
126 | |||
127 | selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN); | ||
128 | if (selem) { | ||
129 | if (value) | ||
130 | memcpy(SDATA(selem)->data, value, smap->map.value_size); | ||
131 | return selem; | ||
132 | } | ||
133 | |||
134 | if (charge_omem) | ||
135 | atomic_sub(smap->elem_size, &sk->sk_omem_alloc); | ||
136 | |||
137 | return NULL; | ||
138 | } | ||
139 | |||
140 | /* sk_storage->lock must be held and selem->sk_storage == sk_storage. | ||
141 | * The caller must ensure selem->smap is still valid to be | ||
142 | * dereferenced for its smap->elem_size and smap->cache_idx. | ||
143 | */ | ||
144 | static bool __selem_unlink_sk(struct bpf_sk_storage *sk_storage, | ||
145 | struct bpf_sk_storage_elem *selem, | ||
146 | bool uncharge_omem) | ||
147 | { | ||
148 | struct bpf_sk_storage_map *smap; | ||
149 | bool free_sk_storage; | ||
150 | struct sock *sk; | ||
151 | |||
152 | smap = rcu_dereference(SDATA(selem)->smap); | ||
153 | sk = sk_storage->sk; | ||
154 | |||
155 | /* All uncharging on sk->sk_omem_alloc must be done first. | ||
156 | * sk may be freed once the last selem is unlinked from sk_storage. | ||
157 | */ | ||
158 | if (uncharge_omem) | ||
159 | atomic_sub(smap->elem_size, &sk->sk_omem_alloc); | ||
160 | |||
161 | free_sk_storage = hlist_is_singular_node(&selem->snode, | ||
162 | &sk_storage->list); | ||
163 | if (free_sk_storage) { | ||
164 | atomic_sub(sizeof(struct bpf_sk_storage), &sk->sk_omem_alloc); | ||
165 | sk_storage->sk = NULL; | ||
166 | /* After this RCU_INIT, sk may be freed and cannot be used */ | ||
167 | RCU_INIT_POINTER(sk->sk_bpf_storage, NULL); | ||
168 | |||
169 | /* sk_storage is not freed now. sk_storage->lock is | ||
170 | * still held and raw_spin_unlock_bh(&sk_storage->lock) | ||
171 | * will be done by the caller. | ||
172 | * | ||
173 | * Although the unlock will be done under | ||
174 | * rcu_read_lock(), it is more intuitive to | ||
175 | * read if kfree_rcu(sk_storage, rcu) is done | ||
176 | * after the raw_spin_unlock_bh(&sk_storage->lock). | ||
177 | * | ||
178 | * Hence, a "bool free_sk_storage" is returned | ||
179 | * to the caller which then calls the kfree_rcu() | ||
180 | * after unlock. | ||
181 | */ | ||
182 | } | ||
183 | hlist_del_init_rcu(&selem->snode); | ||
184 | if (rcu_access_pointer(sk_storage->cache[smap->cache_idx]) == | ||
185 | SDATA(selem)) | ||
186 | RCU_INIT_POINTER(sk_storage->cache[smap->cache_idx], NULL); | ||
187 | |||
188 | kfree_rcu(selem, rcu); | ||
189 | |||
190 | return free_sk_storage; | ||
191 | } | ||
192 | |||
193 | static void selem_unlink_sk(struct bpf_sk_storage_elem *selem) | ||
194 | { | ||
195 | struct bpf_sk_storage *sk_storage; | ||
196 | bool free_sk_storage = false; | ||
197 | |||
198 | if (unlikely(!selem_linked_to_sk(selem))) | ||
199 | /* selem has already been unlinked from sk */ | ||
200 | return; | ||
201 | |||
202 | sk_storage = rcu_dereference(selem->sk_storage); | ||
203 | raw_spin_lock_bh(&sk_storage->lock); | ||
204 | if (likely(selem_linked_to_sk(selem))) | ||
205 | free_sk_storage = __selem_unlink_sk(sk_storage, selem, true); | ||
206 | raw_spin_unlock_bh(&sk_storage->lock); | ||
207 | |||
208 | if (free_sk_storage) | ||
209 | kfree_rcu(sk_storage, rcu); | ||
210 | } | ||
211 | |||
212 | /* sk_storage->lock must be held and sk_storage->list cannot be empty */ | ||
213 | static void __selem_link_sk(struct bpf_sk_storage *sk_storage, | ||
214 | struct bpf_sk_storage_elem *selem) | ||
215 | { | ||
216 | RCU_INIT_POINTER(selem->sk_storage, sk_storage); | ||
217 | hlist_add_head(&selem->snode, &sk_storage->list); | ||
218 | } | ||
219 | |||
220 | static void selem_unlink_map(struct bpf_sk_storage_elem *selem) | ||
221 | { | ||
222 | struct bpf_sk_storage_map *smap; | ||
223 | struct bucket *b; | ||
224 | |||
225 | if (unlikely(!selem_linked_to_map(selem))) | ||
226 | /* selem has already been unlinked from smap */ | ||
227 | return; | ||
228 | |||
229 | smap = rcu_dereference(SDATA(selem)->smap); | ||
230 | b = select_bucket(smap, selem); | ||
231 | raw_spin_lock_bh(&b->lock); | ||
232 | if (likely(selem_linked_to_map(selem))) | ||
233 | hlist_del_init_rcu(&selem->map_node); | ||
234 | raw_spin_unlock_bh(&b->lock); | ||
235 | } | ||
236 | |||
237 | static void selem_link_map(struct bpf_sk_storage_map *smap, | ||
238 | struct bpf_sk_storage_elem *selem) | ||
239 | { | ||
240 | struct bucket *b = select_bucket(smap, selem); | ||
241 | |||
242 | raw_spin_lock_bh(&b->lock); | ||
243 | RCU_INIT_POINTER(SDATA(selem)->smap, smap); | ||
244 | hlist_add_head_rcu(&selem->map_node, &b->list); | ||
245 | raw_spin_unlock_bh(&b->lock); | ||
246 | } | ||
247 | |||
248 | static void selem_unlink(struct bpf_sk_storage_elem *selem) | ||
249 | { | ||
250 | /* Always unlink from map before unlinking from sk_storage | ||
251 | * because selem will be freed after it is successfully unlinked from | ||
252 | * the sk_storage. | ||
253 | */ | ||
254 | selem_unlink_map(selem); | ||
255 | selem_unlink_sk(selem); | ||
256 | } | ||
257 | |||
258 | static struct bpf_sk_storage_data * | ||
259 | __sk_storage_lookup(struct bpf_sk_storage *sk_storage, | ||
260 | struct bpf_sk_storage_map *smap, | ||
261 | bool cacheit_lockit) | ||
262 | { | ||
263 | struct bpf_sk_storage_data *sdata; | ||
264 | struct bpf_sk_storage_elem *selem; | ||
265 | |||
266 | /* Fast path (cache hit) */ | ||
267 | sdata = rcu_dereference(sk_storage->cache[smap->cache_idx]); | ||
268 | if (sdata && rcu_access_pointer(sdata->smap) == smap) | ||
269 | return sdata; | ||
270 | |||
271 | /* Slow path (cache miss) */ | ||
272 | hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) | ||
273 | if (rcu_access_pointer(SDATA(selem)->smap) == smap) | ||
274 | break; | ||
275 | |||
276 | if (!selem) | ||
277 | return NULL; | ||
278 | |||
279 | sdata = SDATA(selem); | ||
280 | if (cacheit_lockit) { | ||
281 | /* spinlock is needed to avoid racing with the | ||
282 | * parallel delete. Otherwise, publishing an already | ||
283 | * deleted sdata to the cache will become a use-after-free | ||
284 | * problem in the next __sk_storage_lookup(). | ||
285 | */ | ||
286 | raw_spin_lock_bh(&sk_storage->lock); | ||
287 | if (selem_linked_to_sk(selem)) | ||
288 | rcu_assign_pointer(sk_storage->cache[smap->cache_idx], | ||
289 | sdata); | ||
290 | raw_spin_unlock_bh(&sk_storage->lock); | ||
291 | } | ||
292 | |||
293 | return sdata; | ||
294 | } | ||
295 | |||
296 | static struct bpf_sk_storage_data * | ||
297 | sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) | ||
298 | { | ||
299 | struct bpf_sk_storage *sk_storage; | ||
300 | struct bpf_sk_storage_map *smap; | ||
301 | |||
302 | sk_storage = rcu_dereference(sk->sk_bpf_storage); | ||
303 | if (!sk_storage) | ||
304 | return NULL; | ||
305 | |||
306 | smap = (struct bpf_sk_storage_map *)map; | ||
307 | return __sk_storage_lookup(sk_storage, smap, cacheit_lockit); | ||
308 | } | ||
309 | |||
310 | static int check_flags(const struct bpf_sk_storage_data *old_sdata, | ||
311 | u64 map_flags) | ||
312 | { | ||
313 | if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) | ||
314 | /* elem already exists */ | ||
315 | return -EEXIST; | ||
316 | |||
317 | if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) | ||
318 | /* elem doesn't exist, cannot update it */ | ||
319 | return -ENOENT; | ||
320 | |||
321 | return 0; | ||
322 | } | ||
323 | |||
324 | static int sk_storage_alloc(struct sock *sk, | ||
325 | struct bpf_sk_storage_map *smap, | ||
326 | struct bpf_sk_storage_elem *first_selem) | ||
327 | { | ||
328 | struct bpf_sk_storage *prev_sk_storage, *sk_storage; | ||
329 | int err; | ||
330 | |||
331 | err = omem_charge(sk, sizeof(*sk_storage)); | ||
332 | if (err) | ||
333 | return err; | ||
334 | |||
335 | sk_storage = kzalloc(sizeof(*sk_storage), GFP_ATOMIC | __GFP_NOWARN); | ||
336 | if (!sk_storage) { | ||
337 | err = -ENOMEM; | ||
338 | goto uncharge; | ||
339 | } | ||
340 | INIT_HLIST_HEAD(&sk_storage->list); | ||
341 | raw_spin_lock_init(&sk_storage->lock); | ||
342 | sk_storage->sk = sk; | ||
343 | |||
344 | __selem_link_sk(sk_storage, first_selem); | ||
345 | selem_link_map(smap, first_selem); | ||
346 | /* Publish sk_storage to sk. sk->sk_lock cannot be acquired. | ||
347 | * Hence, atomic ops is used to set sk->sk_bpf_storage | ||
348 | * from NULL to the newly allocated sk_storage ptr. | ||
349 | * | ||
350 | * From now on, the sk->sk_bpf_storage pointer is protected | ||
351 | * by the sk_storage->lock. Hence, when freeing | ||
352 | * the sk->sk_bpf_storage, the sk_storage->lock must | ||
353 | * be held before setting sk->sk_bpf_storage to NULL. | ||
354 | */ | ||
355 | prev_sk_storage = cmpxchg((struct bpf_sk_storage **)&sk->sk_bpf_storage, | ||
356 | NULL, sk_storage); | ||
357 | if (unlikely(prev_sk_storage)) { | ||
358 | selem_unlink_map(first_selem); | ||
359 | err = -EAGAIN; | ||
360 | goto uncharge; | ||
361 | |||
362 | /* Note that even though first_selem was linked to smap's | ||
363 | * bucket->list, first_selem can be freed immediately | ||
364 | * (instead of kfree_rcu) because | ||
365 | * bpf_sk_storage_map_free() does a | ||
366 | * synchronize_rcu() before walking the bucket->list. | ||
367 | * Hence, no one is accessing selem from the | ||
368 | * bucket->list under rcu_read_lock(). | ||
369 | */ | ||
370 | } | ||
371 | |||
372 | return 0; | ||
373 | |||
374 | uncharge: | ||
375 | kfree(sk_storage); | ||
376 | atomic_sub(sizeof(*sk_storage), &sk->sk_omem_alloc); | ||
377 | return err; | ||
378 | } | ||
379 | |||
380 | /* sk must not be going away when a new elem is being linked | ||
381 | * to sk->sk_bpf_storage (i.e. sk->sk_refcnt cannot be 0). | ||
382 | * Otherwise, it will become a leak (and other memory issues | ||
383 | * during map destruction). | ||
384 | */ | ||
385 | static struct bpf_sk_storage_data *sk_storage_update(struct sock *sk, | ||
386 | struct bpf_map *map, | ||
387 | void *value, | ||
388 | u64 map_flags) | ||
389 | { | ||
390 | struct bpf_sk_storage_data *old_sdata = NULL; | ||
391 | struct bpf_sk_storage_elem *selem; | ||
392 | struct bpf_sk_storage *sk_storage; | ||
393 | struct bpf_sk_storage_map *smap; | ||
394 | int err; | ||
395 | |||
396 | /* BPF_EXIST and BPF_NOEXIST cannot be both set */ | ||
397 | if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) || | ||
398 | /* BPF_F_LOCK can only be used in a value with spin_lock */ | ||
399 | unlikely((map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) | ||
400 | return ERR_PTR(-EINVAL); | ||
401 | |||
402 | smap = (struct bpf_sk_storage_map *)map; | ||
403 | sk_storage = rcu_dereference(sk->sk_bpf_storage); | ||
404 | if (!sk_storage || hlist_empty(&sk_storage->list)) { | ||
405 | /* Very first elem for this sk */ | ||
406 | err = check_flags(NULL, map_flags); | ||
407 | if (err) | ||
408 | return ERR_PTR(err); | ||
409 | |||
410 | selem = selem_alloc(smap, sk, value, true); | ||
411 | if (!selem) | ||
412 | return ERR_PTR(-ENOMEM); | ||
413 | |||
414 | err = sk_storage_alloc(sk, smap, selem); | ||
415 | if (err) { | ||
416 | kfree(selem); | ||
417 | atomic_sub(smap->elem_size, &sk->sk_omem_alloc); | ||
418 | return ERR_PTR(err); | ||
419 | } | ||
420 | |||
421 | return SDATA(selem); | ||
422 | } | ||
423 | |||
424 | if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) { | ||
425 | /* Hoping to find an old_sdata to do inline update | ||
426 | * such that it can avoid taking the sk_storage->lock | ||
427 | * and changing the lists. | ||
428 | */ | ||
429 | old_sdata = __sk_storage_lookup(sk_storage, smap, false); | ||
430 | err = check_flags(old_sdata, map_flags); | ||
431 | if (err) | ||
432 | return ERR_PTR(err); | ||
433 | if (old_sdata && selem_linked_to_sk(SELEM(old_sdata))) { | ||
434 | copy_map_value_locked(map, old_sdata->data, | ||
435 | value, false); | ||
436 | return old_sdata; | ||
437 | } | ||
438 | } | ||
439 | |||
440 | raw_spin_lock_bh(&sk_storage->lock); | ||
441 | |||
442 | /* Recheck sk_storage->list under sk_storage->lock */ | ||
443 | if (unlikely(hlist_empty(&sk_storage->list))) { | ||
444 | /* A parallel del is happening and sk_storage is going | ||
445 | * away. It has just been checked before, so very | ||
446 | * unlikely. Return instead of retry to keep things | ||
447 | * simple. | ||
448 | */ | ||
449 | err = -EAGAIN; | ||
450 | goto unlock_err; | ||
451 | } | ||
452 | |||
453 | old_sdata = __sk_storage_lookup(sk_storage, smap, false); | ||
454 | err = check_flags(old_sdata, map_flags); | ||
455 | if (err) | ||
456 | goto unlock_err; | ||
457 | |||
458 | if (old_sdata && (map_flags & BPF_F_LOCK)) { | ||
459 | copy_map_value_locked(map, old_sdata->data, value, false); | ||
460 | selem = SELEM(old_sdata); | ||
461 | goto unlock; | ||
462 | } | ||
463 | |||
464 | /* sk_storage->lock is held. Hence, we are sure | ||
465 | * we can unlink and uncharge the old_sdata successfully | ||
466 | * later. Hence, instead of charging the new selem now | ||
467 | * and then uncharge the old selem later (which may cause | ||
468 | * a potential but unnecessary charge failure), avoid taking | ||
469 | * a charge at all here (the "!old_sdata" check) and the | ||
470 | * old_sdata will not be uncharged later during __selem_unlink_sk(). | ||
471 | */ | ||
472 | selem = selem_alloc(smap, sk, value, !old_sdata); | ||
473 | if (!selem) { | ||
474 | err = -ENOMEM; | ||
475 | goto unlock_err; | ||
476 | } | ||
477 | |||
478 | /* First, link the new selem to the map */ | ||
479 | selem_link_map(smap, selem); | ||
480 | |||
481 | /* Second, link (and publish) the new selem to sk_storage */ | ||
482 | __selem_link_sk(sk_storage, selem); | ||
483 | |||
484 | /* Third, remove old selem, SELEM(old_sdata) */ | ||
485 | if (old_sdata) { | ||
486 | selem_unlink_map(SELEM(old_sdata)); | ||
487 | __selem_unlink_sk(sk_storage, SELEM(old_sdata), false); | ||
488 | } | ||
489 | |||
490 | unlock: | ||
491 | raw_spin_unlock_bh(&sk_storage->lock); | ||
492 | return SDATA(selem); | ||
493 | |||
494 | unlock_err: | ||
495 | raw_spin_unlock_bh(&sk_storage->lock); | ||
496 | return ERR_PTR(err); | ||
497 | } | ||
498 | |||
499 | static int sk_storage_delete(struct sock *sk, struct bpf_map *map) | ||
500 | { | ||
501 | struct bpf_sk_storage_data *sdata; | ||
502 | |||
503 | sdata = sk_storage_lookup(sk, map, false); | ||
504 | if (!sdata) | ||
505 | return -ENOENT; | ||
506 | |||
507 | selem_unlink(SELEM(sdata)); | ||
508 | |||
509 | return 0; | ||
510 | } | ||
511 | |||
512 | /* Called by __sk_destruct() */ | ||
513 | void bpf_sk_storage_free(struct sock *sk) | ||
514 | { | ||
515 | struct bpf_sk_storage_elem *selem; | ||
516 | struct bpf_sk_storage *sk_storage; | ||
517 | bool free_sk_storage = false; | ||
518 | struct hlist_node *n; | ||
519 | |||
520 | rcu_read_lock(); | ||
521 | sk_storage = rcu_dereference(sk->sk_bpf_storage); | ||
522 | if (!sk_storage) { | ||
523 | rcu_read_unlock(); | ||
524 | return; | ||
525 | } | ||
526 | |||
527 | /* Neither the bpf_prog nor the bpf-map's syscall | ||
528 | * could be modifying the sk_storage->list now. | ||
529 | * Thus, no elem can be added-to or deleted-from the | ||
530 | * sk_storage->list by the bpf_prog or by the bpf-map's syscall. | ||
531 | * | ||
532 | * It is racing with bpf_sk_storage_map_free() alone | ||
533 | * when unlinking elem from the sk_storage->list and | ||
534 | * the map's bucket->list. | ||
535 | */ | ||
536 | raw_spin_lock_bh(&sk_storage->lock); | ||
537 | hlist_for_each_entry_safe(selem, n, &sk_storage->list, snode) { | ||
538 | /* Always unlink from map before unlinking from | ||
539 | * sk_storage. | ||
540 | */ | ||
541 | selem_unlink_map(selem); | ||
542 | free_sk_storage = __selem_unlink_sk(sk_storage, selem, true); | ||
543 | } | ||
544 | raw_spin_unlock_bh(&sk_storage->lock); | ||
545 | rcu_read_unlock(); | ||
546 | |||
547 | if (free_sk_storage) | ||
548 | kfree_rcu(sk_storage, rcu); | ||
549 | } | ||
550 | |||
551 | static void bpf_sk_storage_map_free(struct bpf_map *map) | ||
552 | { | ||
553 | struct bpf_sk_storage_elem *selem; | ||
554 | struct bpf_sk_storage_map *smap; | ||
555 | struct bucket *b; | ||
556 | unsigned int i; | ||
557 | |||
558 | smap = (struct bpf_sk_storage_map *)map; | ||
559 | |||
560 | synchronize_rcu(); | ||
561 | |||
562 | /* bpf prog and the userspace can no longer access this map | ||
563 | * now. No new selem (of this map) can be added | ||
564 | * to the sk->sk_bpf_storage or to the map bucket's list. | ||
565 | * | ||
566 | * The elem of this map can be cleaned up here | ||
567 | * or | ||
568 | * by bpf_sk_storage_free() during __sk_destruct(). | ||
569 | */ | ||
570 | for (i = 0; i < (1U << smap->bucket_log); i++) { | ||
571 | b = &smap->buckets[i]; | ||
572 | |||
573 | rcu_read_lock(); | ||
574 | /* No one is adding to b->list now */ | ||
575 | while ((selem = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(&b->list)), | ||
576 | struct bpf_sk_storage_elem, | ||
577 | map_node))) { | ||
578 | selem_unlink(selem); | ||
579 | cond_resched_rcu(); | ||
580 | } | ||
581 | rcu_read_unlock(); | ||
582 | } | ||
583 | |||
584 | /* bpf_sk_storage_free() may still need to access the map. | ||
585 | * e.g. bpf_sk_storage_free() has unlinked selem from the map | ||
586 | * which then made the above while((selem = ...)) loop | ||
587 | * exit immediately. | ||
588 | * | ||
589 | * However, the bpf_sk_storage_free() still needs to access | ||
590 | * the smap->elem_size to do the uncharging in | ||
591 | * __selem_unlink_sk(). | ||
592 | * | ||
593 | * Hence, wait another rcu grace period for the | ||
594 | * bpf_sk_storage_free() to finish. | ||
595 | */ | ||
596 | synchronize_rcu(); | ||
597 | |||
598 | kvfree(smap->buckets); | ||
599 | kfree(map); | ||
600 | } | ||
601 | |||
602 | static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) | ||
603 | { | ||
604 | if (attr->map_flags != BPF_F_NO_PREALLOC || attr->max_entries || | ||
605 | attr->key_size != sizeof(int) || !attr->value_size || | ||
606 | /* Enforce BTF for userspace sk dumping */ | ||
607 | !attr->btf_key_type_id || !attr->btf_value_type_id) | ||
608 | return -EINVAL; | ||
609 | |||
610 | if (!capable(CAP_SYS_ADMIN)) | ||
611 | return -EPERM; | ||
612 | |||
613 | if (attr->value_size >= KMALLOC_MAX_SIZE - | ||
614 | MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem) || | ||
615 | /* U16_MAX is much more than enough for sk local storage | ||
616 | * considering a tcp_sock is ~2k. | ||
617 | */ | ||
618 | attr->value_size > U16_MAX - sizeof(struct bpf_sk_storage_elem)) | ||
619 | return -E2BIG; | ||
620 | |||
621 | return 0; | ||
622 | } | ||
623 | |||
624 | static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) | ||
625 | { | ||
626 | struct bpf_sk_storage_map *smap; | ||
627 | unsigned int i; | ||
628 | u32 nbuckets; | ||
629 | u64 cost; | ||
630 | |||
631 | smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN); | ||
632 | if (!smap) | ||
633 | return ERR_PTR(-ENOMEM); | ||
634 | bpf_map_init_from_attr(&smap->map, attr); | ||
635 | |||
636 | smap->bucket_log = ilog2(roundup_pow_of_two(num_possible_cpus())); | ||
637 | nbuckets = 1U << smap->bucket_log; | ||
638 | smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, | ||
639 | GFP_USER | __GFP_NOWARN); | ||
640 | if (!smap->buckets) { | ||
641 | kfree(smap); | ||
642 | return ERR_PTR(-ENOMEM); | ||
643 | } | ||
644 | cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); | ||
645 | |||
646 | for (i = 0; i < nbuckets; i++) { | ||
647 | INIT_HLIST_HEAD(&smap->buckets[i].list); | ||
648 | raw_spin_lock_init(&smap->buckets[i].lock); | ||
649 | } | ||
650 | |||
651 | smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size; | ||
652 | smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) % | ||
653 | BPF_SK_STORAGE_CACHE_SIZE; | ||
654 | smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; | ||
655 | |||
656 | return &smap->map; | ||
657 | } | ||
658 | |||
659 | static int notsupp_get_next_key(struct bpf_map *map, void *key, | ||
660 | void *next_key) | ||
661 | { | ||
662 | return -ENOTSUPP; | ||
663 | } | ||
664 | |||
665 | static int bpf_sk_storage_map_check_btf(const struct bpf_map *map, | ||
666 | const struct btf *btf, | ||
667 | const struct btf_type *key_type, | ||
668 | const struct btf_type *value_type) | ||
669 | { | ||
670 | u32 int_data; | ||
671 | |||
672 | if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) | ||
673 | return -EINVAL; | ||
674 | |||
675 | int_data = *(u32 *)(key_type + 1); | ||
676 | if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) | ||
677 | return -EINVAL; | ||
678 | |||
679 | return 0; | ||
680 | } | ||
681 | |||
682 | static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key) | ||
683 | { | ||
684 | struct bpf_sk_storage_data *sdata; | ||
685 | struct socket *sock; | ||
686 | int fd, err; | ||
687 | |||
688 | fd = *(int *)key; | ||
689 | sock = sockfd_lookup(fd, &err); | ||
690 | if (sock) { | ||
691 | sdata = sk_storage_lookup(sock->sk, map, true); | ||
692 | sockfd_put(sock); | ||
693 | return sdata ? sdata->data : NULL; | ||
694 | } | ||
695 | |||
696 | return ERR_PTR(err); | ||
697 | } | ||
698 | |||
699 | static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, | ||
700 | void *value, u64 map_flags) | ||
701 | { | ||
702 | struct bpf_sk_storage_data *sdata; | ||
703 | struct socket *sock; | ||
704 | int fd, err; | ||
705 | |||
706 | fd = *(int *)key; | ||
707 | sock = sockfd_lookup(fd, &err); | ||
708 | if (sock) { | ||
709 | sdata = sk_storage_update(sock->sk, map, value, map_flags); | ||
710 | sockfd_put(sock); | ||
711 | return IS_ERR(sdata) ? PTR_ERR(sdata) : 0; | ||
712 | } | ||
713 | |||
714 | return err; | ||
715 | } | ||
716 | |||
717 | static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) | ||
718 | { | ||
719 | struct socket *sock; | ||
720 | int fd, err; | ||
721 | |||
722 | fd = *(int *)key; | ||
723 | sock = sockfd_lookup(fd, &err); | ||
724 | if (sock) { | ||
725 | err = sk_storage_delete(sock->sk, map); | ||
726 | sockfd_put(sock); | ||
727 | return err; | ||
728 | } | ||
729 | |||
730 | return err; | ||
731 | } | ||
732 | |||
733 | BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, | ||
734 | void *, value, u64, flags) | ||
735 | { | ||
736 | struct bpf_sk_storage_data *sdata; | ||
737 | |||
738 | if (flags > BPF_SK_STORAGE_GET_F_CREATE) | ||
739 | return (unsigned long)NULL; | ||
740 | |||
741 | sdata = sk_storage_lookup(sk, map, true); | ||
742 | if (sdata) | ||
743 | return (unsigned long)sdata->data; | ||
744 | |||
745 | if (flags == BPF_SK_STORAGE_GET_F_CREATE && | ||
746 | /* Cannot add new elem to a going away sk. | ||
747 | * Otherwise, the new elem may become a leak | ||
748 | * (and also other memory issues during map | ||
749 | * destruction). | ||
750 | */ | ||
751 | refcount_inc_not_zero(&sk->sk_refcnt)) { | ||
752 | sdata = sk_storage_update(sk, map, value, BPF_NOEXIST); | ||
753 | /* sk must be a fullsock (guaranteed by verifier), | ||
754 | * so sock_gen_put() is unnecessary. | ||
755 | */ | ||
756 | sock_put(sk); | ||
757 | return IS_ERR(sdata) ? | ||
758 | (unsigned long)NULL : (unsigned long)sdata->data; | ||
759 | } | ||
760 | |||
761 | return (unsigned long)NULL; | ||
762 | } | ||
763 | |||
764 | BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) | ||
765 | { | ||
766 | if (refcount_inc_not_zero(&sk->sk_refcnt)) { | ||
767 | int err; | ||
768 | |||
769 | err = sk_storage_delete(sk, map); | ||
770 | sock_put(sk); | ||
771 | return err; | ||
772 | } | ||
773 | |||
774 | return -ENOENT; | ||
775 | } | ||
776 | |||
777 | const struct bpf_map_ops sk_storage_map_ops = { | ||
778 | .map_alloc_check = bpf_sk_storage_map_alloc_check, | ||
779 | .map_alloc = bpf_sk_storage_map_alloc, | ||
780 | .map_free = bpf_sk_storage_map_free, | ||
781 | .map_get_next_key = notsupp_get_next_key, | ||
782 | .map_lookup_elem = bpf_fd_sk_storage_lookup_elem, | ||
783 | .map_update_elem = bpf_fd_sk_storage_update_elem, | ||
784 | .map_delete_elem = bpf_fd_sk_storage_delete_elem, | ||
785 | .map_check_btf = bpf_sk_storage_map_check_btf, | ||
786 | }; | ||
787 | |||
788 | const struct bpf_func_proto bpf_sk_storage_get_proto = { | ||
789 | .func = bpf_sk_storage_get, | ||
790 | .gpl_only = false, | ||
791 | .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, | ||
792 | .arg1_type = ARG_CONST_MAP_PTR, | ||
793 | .arg2_type = ARG_PTR_TO_SOCKET, | ||
794 | .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, | ||
795 | .arg4_type = ARG_ANYTHING, | ||
796 | }; | ||
797 | |||
798 | const struct bpf_func_proto bpf_sk_storage_delete_proto = { | ||
799 | .func = bpf_sk_storage_delete, | ||
800 | .gpl_only = false, | ||
801 | .ret_type = RET_INTEGER, | ||
802 | .arg1_type = ARG_CONST_MAP_PTR, | ||
803 | .arg2_type = ARG_PTR_TO_SOCKET, | ||
804 | }; | ||
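
The map_ops above also hook the storage into the normal bpf(2) map commands, with a socket fd as the key (resolved via sockfd_lookup()); map_get_next_key is not supported, so the storage cannot be enumerated from user space. A rough user-space sketch, assuming libbpf and the illustrative struct pkt_stats from the sketch after the uapi changes:

/* Sketch: read one socket's storage through bpf_fd_sk_storage_lookup_elem().
 * The key passed to the syscall is a socket fd owned by this process.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <linux/types.h>
#include <bpf/bpf.h>

struct pkt_stats {
	__u64 pkts;
	__u64 bytes;
};

static int dump_sk_storage(int map_fd, int sock_fd)
{
	struct pkt_stats stats;

	if (bpf_map_lookup_elem(map_fd, &sock_fd, &stats)) {
		fprintf(stderr, "no sk storage for fd %d: %s\n",
			sock_fd, strerror(errno));
		return -1;
	}

	printf("fd %d: %llu pkts, %llu bytes\n", sock_fd,
	       (unsigned long long)stats.pkts,
	       (unsigned long long)stats.bytes);
	return 0;
}
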
diff --git a/net/core/filter.c b/net/core/filter.c
index 2f88baf39cc2..27b0dc01dc3f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -75,6 +75,7 @@ | |||
75 | #include <net/seg6_local.h> | 75 | #include <net/seg6_local.h> |
76 | #include <net/lwtunnel.h> | 76 | #include <net/lwtunnel.h> |
77 | #include <net/ipv6_stubs.h> | 77 | #include <net/ipv6_stubs.h> |
78 | #include <net/bpf_sk_storage.h> | ||
78 | 79 | ||
79 | /** | 80 | /** |
80 | * sk_filter_trim_cap - run a packet through a socket filter | 81 | * sk_filter_trim_cap - run a packet through a socket filter |
@@ -5903,6 +5904,9 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) | |||
5903 | } | 5904 | } |
5904 | } | 5905 | } |
5905 | 5906 | ||
5907 | const struct bpf_func_proto bpf_sk_storage_get_proto __weak; | ||
5908 | const struct bpf_func_proto bpf_sk_storage_delete_proto __weak; | ||
5909 | |||
5906 | static const struct bpf_func_proto * | 5910 | static const struct bpf_func_proto * |
5907 | cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) | 5911 | cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) |
5908 | { | 5912 | { |
@@ -5911,6 +5915,10 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) | |||
5911 | return &bpf_get_local_storage_proto; | 5915 | return &bpf_get_local_storage_proto; |
5912 | case BPF_FUNC_sk_fullsock: | 5916 | case BPF_FUNC_sk_fullsock: |
5913 | return &bpf_sk_fullsock_proto; | 5917 | return &bpf_sk_fullsock_proto; |
5918 | case BPF_FUNC_sk_storage_get: | ||
5919 | return &bpf_sk_storage_get_proto; | ||
5920 | case BPF_FUNC_sk_storage_delete: | ||
5921 | return &bpf_sk_storage_delete_proto; | ||
5914 | #ifdef CONFIG_INET | 5922 | #ifdef CONFIG_INET |
5915 | case BPF_FUNC_tcp_sock: | 5923 | case BPF_FUNC_tcp_sock: |
5916 | return &bpf_tcp_sock_proto; | 5924 | return &bpf_tcp_sock_proto; |
@@ -5992,6 +6000,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) | |||
5992 | return &bpf_skb_fib_lookup_proto; | 6000 | return &bpf_skb_fib_lookup_proto; |
5993 | case BPF_FUNC_sk_fullsock: | 6001 | case BPF_FUNC_sk_fullsock: |
5994 | return &bpf_sk_fullsock_proto; | 6002 | return &bpf_sk_fullsock_proto; |
6003 | case BPF_FUNC_sk_storage_get: | ||
6004 | return &bpf_sk_storage_get_proto; | ||
6005 | case BPF_FUNC_sk_storage_delete: | ||
6006 | return &bpf_sk_storage_delete_proto; | ||
5995 | #ifdef CONFIG_XFRM | 6007 | #ifdef CONFIG_XFRM |
5996 | case BPF_FUNC_skb_get_xfrm_state: | 6008 | case BPF_FUNC_skb_get_xfrm_state: |
5997 | return &bpf_skb_get_xfrm_state_proto; | 6009 | return &bpf_skb_get_xfrm_state_proto; |
diff --git a/net/core/sock.c b/net/core/sock.c
index 443b98d05f1e..9773be724aa9 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -137,6 +137,7 @@ | |||
137 | 137 | ||
138 | #include <linux/filter.h> | 138 | #include <linux/filter.h> |
139 | #include <net/sock_reuseport.h> | 139 | #include <net/sock_reuseport.h> |
140 | #include <net/bpf_sk_storage.h> | ||
140 | 141 | ||
141 | #include <trace/events/sock.h> | 142 | #include <trace/events/sock.h> |
142 | 143 | ||
@@ -1709,6 +1710,10 @@ static void __sk_destruct(struct rcu_head *head) | |||
1709 | 1710 | ||
1710 | sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); | 1711 | sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); |
1711 | 1712 | ||
1713 | #ifdef CONFIG_BPF_SYSCALL | ||
1714 | bpf_sk_storage_free(sk); | ||
1715 | #endif | ||
1716 | |||
1712 | if (atomic_read(&sk->sk_omem_alloc)) | 1717 | if (atomic_read(&sk->sk_omem_alloc)) |
1713 | pr_debug("%s: optmem leakage (%d bytes) detected\n", | 1718 | pr_debug("%s: optmem leakage (%d bytes) detected\n", |
1714 | __func__, atomic_read(&sk->sk_omem_alloc)); | 1719 | __func__, atomic_read(&sk->sk_omem_alloc)); |