aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/bpf/syscall.c
diff options
context:
space:
mode:
authorDaniel Borkmann <daniel@iogearbox.net>2015-10-29 09:58:09 -0400
committerDavid S. Miller <davem@davemloft.net>2015-11-02 22:48:39 -0500
commitb2197755b2633e164a439682fb05a9b5ea48f706 (patch)
tree71d9694754b0e4511e7cec0c2f57c130e96e71fb /kernel/bpf/syscall.c
parente9d8afa90b789b07d414637ab557d169d6b2b84e (diff)
bpf: add support for persistent maps/progs
This work adds support for "persistent" eBPF maps/programs. The term "persistent" here means that maps/programs have a facility that lets them survive process termination. This is desired by various eBPF subsystem users. Just to name one example: tc classifier/action. Whenever tc parses the ELF object, extracts and loads maps/progs into the kernel, these file descriptors will be out of reach after the tc instance exits. So a subsequent tc invocation won't be able to access/relocate on this resource, and therefore maps cannot easily be shared, e.g. between the ingress and egress networking data path. The current workaround is that Unix domain sockets (UDS) need to be instrumented in order to pass the created eBPF map/program file descriptors to a third party management daemon through UDS' socket passing facility. This makes it a bit complicated to deploy shared eBPF maps or programs (programs e.g. for tail calls) among various processes. We've been brainstorming on how we could tackle this issue and various approaches have been tried out so far, which can be read about further in the reference below. The architecture we eventually ended up with is a minimal file system that can hold map/prog objects. The file system is a per mount namespace singleton, and the default mount point is /sys/fs/bpf/. Any subsequent mounts within a given namespace will point to the same instance. The file system allows for creating a user-defined directory structure. The objects for maps/progs are created/fetched through bpf(2) with two new commands (BPF_OBJ_PIN/BPF_OBJ_GET). I.e. a bpf file descriptor along with a pathname is being passed to bpf(2) that in turn creates (we call it eBPF object pinning) the file system nodes. Only the pathname is being passed to bpf(2) for getting a new BPF file descriptor to an existing node. The user can use that to access maps and progs later on, through bpf(2).
Removal of file system nodes is being managed through normal VFS functions such as unlink(2), etc. The file system code is kept to a bare minimum and can be further extended later on. The next step I'm working on is to add dump eBPF map/prog commands to bpf(2), so that a specification from a given file descriptor can be retrieved. This can be used by things like CRIU but also applications can inspect the meta data after calling BPF_OBJ_GET. Big thanks also to Alexei and Hannes who significantly contributed in the design discussion that eventually led us to this architecture. Reference: https://lkml.org/lkml/2015/10/15/925 Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Signed-off-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'kernel/bpf/syscall.c')
-rw-r--r--kernel/bpf/syscall.c30
1 files changed, 27 insertions, 3 deletions
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d7783cb04d86..0d3313d02a7e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -111,7 +111,7 @@ static const struct file_operations bpf_map_fops = {
111 .release = bpf_map_release, 111 .release = bpf_map_release,
112}; 112};
113 113
114static int bpf_map_new_fd(struct bpf_map *map) 114int bpf_map_new_fd(struct bpf_map *map)
115{ 115{
116 return anon_inode_getfd("bpf-map", &bpf_map_fops, map, 116 return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
117 O_RDWR | O_CLOEXEC); 117 O_RDWR | O_CLOEXEC);
@@ -174,7 +174,7 @@ struct bpf_map *__bpf_map_get(struct fd f)
174 return f.file->private_data; 174 return f.file->private_data;
175} 175}
176 176
177static struct bpf_map *bpf_map_get(u32 ufd) 177struct bpf_map *bpf_map_get(u32 ufd)
178{ 178{
179 struct fd f = fdget(ufd); 179 struct fd f = fdget(ufd);
180 struct bpf_map *map; 180 struct bpf_map *map;
@@ -548,7 +548,7 @@ static const struct file_operations bpf_prog_fops = {
548 .release = bpf_prog_release, 548 .release = bpf_prog_release,
549}; 549};
550 550
551static int bpf_prog_new_fd(struct bpf_prog *prog) 551int bpf_prog_new_fd(struct bpf_prog *prog)
552{ 552{
553 return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, 553 return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
554 O_RDWR | O_CLOEXEC); 554 O_RDWR | O_CLOEXEC);
@@ -674,6 +674,24 @@ free_prog_nouncharge:
674 return err; 674 return err;
675} 675}
676 676
677#define BPF_OBJ_LAST_FIELD bpf_fd
678
679static int bpf_obj_pin(const union bpf_attr *attr)
680{
681 if (CHECK_ATTR(BPF_OBJ))
682 return -EINVAL;
683
684 return bpf_obj_pin_user(attr->bpf_fd, u64_to_ptr(attr->pathname));
685}
686
687static int bpf_obj_get(const union bpf_attr *attr)
688{
689 if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0)
690 return -EINVAL;
691
692 return bpf_obj_get_user(u64_to_ptr(attr->pathname));
693}
694
677SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) 695SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
678{ 696{
679 union bpf_attr attr = {}; 697 union bpf_attr attr = {};
@@ -734,6 +752,12 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
734 case BPF_PROG_LOAD: 752 case BPF_PROG_LOAD:
735 err = bpf_prog_load(&attr); 753 err = bpf_prog_load(&attr);
736 break; 754 break;
755 case BPF_OBJ_PIN:
756 err = bpf_obj_pin(&attr);
757 break;
758 case BPF_OBJ_GET:
759 err = bpf_obj_get(&attr);
760 break;
737 default: 761 default:
738 err = -EINVAL; 762 err = -EINVAL;
739 break; 763 break;