aboutsummaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorMartin KaFai Lau <kafai@fb.com>2018-08-08 04:01:24 -0400
committerDaniel Borkmann <daniel@iogearbox.net>2018-08-10 19:58:46 -0400
commit5dc4c4b7d4e8115e7cde96a030f98cb3ab2e458c (patch)
tree3ae127970e7e14a70948c989f6a702695767a6a6 /include
parent736b46027eb4a4c602d3b8b93d2f48c9facbd915 (diff)
bpf: Introduce BPF_MAP_TYPE_REUSEPORT_SOCKARRAY
This patch introduces a new map type BPF_MAP_TYPE_REUSEPORT_SOCKARRAY. To unleash the full potential of a bpf prog, it is essential for the userspace to be capable of directly setting up a bpf map which can then be consumed by the bpf prog to make decisions. In this case, the decision is which SO_REUSEPORT sk should serve the incoming request. By adding BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, the userspace has total control and visibility on where a SO_REUSEPORT sk should be located in a bpf map. A later patch will introduce BPF_PROG_TYPE_SK_REUSEPORT such that the bpf prog can directly select a sk from the bpf map. That will raise the programmability of the bpf prog attached to a reuseport group (a group of sk serving the same IP:PORT). For example, in UDP, the bpf prog can peek into the payload (e.g. through the "data" pointer introduced in the later patch) to learn the application level's connection information and then decide which sk to pick from a bpf map. The userspace can tightly couple the sk's location in a bpf map with the application logic in generating the UDP payload's connection information. This connection info contract/API stays within the userspace. Also, when used with map-in-map, the userspace can switch the old-server-process's inner map to a new-server-process's inner map in one call "bpf_map_update_elem(outer_map, &index, &new_reuseport_array)". The bpf prog will then direct incoming requests to the new process instead of the old process. The old process can finish draining the pending requests (e.g. by "accept()") before closing the old-fds. [Note that deleting a fd from a bpf map does not necessarily mean the fd is closed] During map_update_elem(), only a SO_REUSEPORT sk (i.e. one which has already been added to a reuse->socks[]) can be used. That means a SO_REUSEPORT sk that has gone through "bind()" for UDP or "bind()+listen()" for TCP. These conditions are ensured in "reuseport_array_update_check()". A SO_REUSEPORT sk can only be added once to a map (i.e. 
the same sk cannot be added twice even to the same map). SO_REUSEPORT already allows another sk to be created for the same IP:PORT. There is no need to re-create a similar usage in the BPF side. When a SO_REUSEPORT sk is deleted from the "reuse->socks[]" (e.g. "close()"), it will notify the bpf map to remove it from the map also. It is done through "bpf_sk_reuseport_detach()" and it will only be called if >=1 of the "reuse->socks[]" has ever been added to a bpf map. The map_update()/map_delete() has to be in-sync with the "reuse->socks[]". Hence, the same "reuseport_lock" used by "reuse->socks[]" has to be used here also. Care has been taken to ensure the lock is only acquired when the adding sk passes some strict tests, and freeing the map does not require the reuseport_lock. The reuseport_array will also support lookup from the syscall side. It will return a sock_gen_cookie(). The sock_gen_cookie() is on-demand (i.e. a sk's cookie is not generated until the very first map_lookup_elem()). The lookup cookie is 64bits but it goes against the logical userspace expectation on 32bits sizeof(fd) (and as other fd based bpf maps do also). It may catch users by surprise if we enforce value_size=8 while userspace still passes a 32bits fd during update. Supporting different value_size between lookup and update seems unintuitive also. We also need to consider what if other existing fd based maps want to return 64bits value from syscall's lookup in the future. Hence, reuseport_array supports both value_size 4 and 8, assuming the user will usually use value_size=4. The syscall's lookup will return ENOSPC on value_size=4. It will only return a 64bits value from sock_gen_cookie() when the user consciously chooses value_size=8 (as a signal that lookup is desired) which then requires a 64bits value in both lookup and update. Signed-off-by: Martin KaFai Lau <kafai@fb.com> Acked-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Diffstat (limited to 'include')
-rw-r--r--include/linux/bpf.h28
-rw-r--r--include/linux/bpf_types.h3
-rw-r--r--include/uapi/linux/bpf.h1
3 files changed, 32 insertions, 0 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cd8790d2c6ed..db11662faea6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -524,6 +524,7 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
524} 524}
525 525
526struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); 526struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type);
527int array_map_alloc_check(union bpf_attr *attr);
527 528
528#else /* !CONFIG_BPF_SYSCALL */ 529#else /* !CONFIG_BPF_SYSCALL */
529static inline struct bpf_prog *bpf_prog_get(u32 ufd) 530static inline struct bpf_prog *bpf_prog_get(u32 ufd)
@@ -769,6 +770,33 @@ static inline void __xsk_map_flush(struct bpf_map *map)
769} 770}
770#endif 771#endif
771 772
773#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
774void bpf_sk_reuseport_detach(struct sock *sk);
775int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
776 void *value);
777int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
778 void *value, u64 map_flags);
779#else
780static inline void bpf_sk_reuseport_detach(struct sock *sk)
781{
782}
783
784#ifdef CONFIG_BPF_SYSCALL
785static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map,
786 void *key, void *value)
787{
788 return -EOPNOTSUPP;
789}
790
791static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map,
792 void *key, void *value,
793 u64 map_flags)
794{
795 return -EOPNOTSUPP;
796}
797#endif /* CONFIG_BPF_SYSCALL */
798#endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */
799
772/* verifier prototypes for helper functions called from eBPF programs */ 800/* verifier prototypes for helper functions called from eBPF programs */
773extern const struct bpf_func_proto bpf_map_lookup_elem_proto; 801extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
774extern const struct bpf_func_proto bpf_map_update_elem_proto; 802extern const struct bpf_func_proto bpf_map_update_elem_proto;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index add08be53b6f..14fd6c02d258 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -60,4 +60,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
60#if defined(CONFIG_XDP_SOCKETS) 60#if defined(CONFIG_XDP_SOCKETS)
61BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) 61BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
62#endif 62#endif
63#ifdef CONFIG_INET
64BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops)
65#endif
63#endif 66#endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index dd5758dc35d3..40f584bc7da0 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -126,6 +126,7 @@ enum bpf_map_type {
126 BPF_MAP_TYPE_XSKMAP, 126 BPF_MAP_TYPE_XSKMAP,
127 BPF_MAP_TYPE_SOCKHASH, 127 BPF_MAP_TYPE_SOCKHASH,
128 BPF_MAP_TYPE_CGROUP_STORAGE, 128 BPF_MAP_TYPE_CGROUP_STORAGE,
129 BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
129}; 130};
130 131
131enum bpf_prog_type { 132enum bpf_prog_type {