diff options
| author | Alexei Starovoitov <ast@plumgrid.com> | 2014-09-26 03:16:57 -0400 |
|---|---|---|
| committer | David S. Miller <davem@davemloft.net> | 2014-09-26 15:05:14 -0400 |
| commit | 99c55f7d47c0dc6fc64729f37bf435abf43f4c60 (patch) | |
| tree | 12f09f26bee9813ae33cfc195582c41e94b2e4e9 /kernel/bpf | |
| parent | 4a8e320c929991c9480a7b936512c57ea02d87b2 (diff) | |
bpf: introduce BPF syscall and maps
The BPF syscall is a multiplexer for a range of different operations on eBPF.
This patch introduces syscall with single command to create a map.
Next patch adds commands to access maps.
'maps' is a generic storage of different types for sharing data between kernel
and userspace.
Userspace example:
/* this syscall wrapper creates a map with given type and attributes
* and returns map_fd on success.
* use close(map_fd) to delete the map
*/
int bpf_create_map(enum bpf_map_type map_type, int key_size,
int value_size, int max_entries)
{
union bpf_attr attr = {
.map_type = map_type,
.key_size = key_size,
.value_size = value_size,
.max_entries = max_entries
};
return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}
'union bpf_attr' is backwards compatible with future extensions.
More details in Documentation/networking/filter.txt and in the man page.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'kernel/bpf')
| -rw-r--r-- | kernel/bpf/Makefile | 2 | ||||
| -rw-r--r-- | kernel/bpf/syscall.c | 169 |
2 files changed, 170 insertions, 1 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 6a71145e2769..e9f7334ed07a 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile | |||
| @@ -1 +1 @@ | |||
| obj-y := core.o | obj-y := core.o syscall.o | ||
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c new file mode 100644 index 000000000000..428a0e23adc0 --- /dev/null +++ b/kernel/bpf/syscall.c | |||
| @@ -0,0 +1,169 @@ | |||
| 1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | ||
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | * | ||
| 7 | * This program is distributed in the hope that it will be useful, but | ||
| 8 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 10 | * General Public License for more details. | ||
| 11 | */ | ||
| 12 | #include <linux/bpf.h> | ||
| 13 | #include <linux/syscalls.h> | ||
| 14 | #include <linux/slab.h> | ||
| 15 | #include <linux/anon_inodes.h> | ||
| 16 | |||
/* global list of registered map implementations; searched linearly by
 * find_and_alloc_map() and populated via bpf_register_map_type().
 * NOTE(review): list is unlocked — assumes registration happens only at
 * boot, before the syscall can race with it (per comment below); confirm.
 */
static LIST_HEAD(bpf_map_types);
| 18 | |||
| 19 | static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) | ||
| 20 | { | ||
| 21 | struct bpf_map_type_list *tl; | ||
| 22 | struct bpf_map *map; | ||
| 23 | |||
| 24 | list_for_each_entry(tl, &bpf_map_types, list_node) { | ||
| 25 | if (tl->type == attr->map_type) { | ||
| 26 | map = tl->ops->map_alloc(attr); | ||
| 27 | if (IS_ERR(map)) | ||
| 28 | return map; | ||
| 29 | map->ops = tl->ops; | ||
| 30 | map->map_type = attr->map_type; | ||
| 31 | return map; | ||
| 32 | } | ||
| 33 | } | ||
| 34 | return ERR_PTR(-EINVAL); | ||
| 35 | } | ||
| 36 | |||
/* boot time registration of different map implementations
 *
 * Each map type calls this from its init path to hook itself into the
 * global bpf_map_types list that find_and_alloc_map() searches.
 */
void bpf_register_map_type(struct bpf_map_type_list *tl)
{
	list_add(&tl->list_node, &bpf_map_types);
}
| 42 | |||
/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
	/* recover the bpf_map that embeds this work item */
	struct bpf_map *map = container_of(work, struct bpf_map, work);

	/* implementation dependent freeing */
	map->ops->map_free(map);
}
| 51 | |||
/* decrement map refcnt and schedule it for freeing via workqueue
 * (underlying map implementation ops->map_free() might sleep)
 */
void bpf_map_put(struct bpf_map *map)
{
	/* last reference gone: hand the actual free off to a workqueue,
	 * because ->map_free() may sleep (see comment above)
	 */
	if (atomic_dec_and_test(&map->refcnt)) {
		INIT_WORK(&map->work, bpf_map_free_deferred);
		schedule_work(&map->work);
	}
}
| 62 | |||
| 63 | static int bpf_map_release(struct inode *inode, struct file *filp) | ||
| 64 | { | ||
| 65 | struct bpf_map *map = filp->private_data; | ||
| 66 | |||
| 67 | bpf_map_put(map); | ||
| 68 | return 0; | ||
| 69 | } | ||
| 70 | |||
/* file_operations for the anon inode returned by BPF_MAP_CREATE;
 * only ->release is needed — dropping the fd releases the map ref
 */
static const struct file_operations bpf_map_fops = {
	.release = bpf_map_release,
};
| 74 | |||
/* helper macro to check that unused fields 'union bpf_attr' are zero
 *
 * Evaluates to true (i.e. reject) when any byte of the union past
 * CMD##_LAST_FIELD is non-zero, meaning userspace set attributes this
 * command does not consume.  memchr_inv() returns NULL only when the
 * entire trailing region is zero.  This is what keeps bpf_attr
 * forward-extensible: new fields can be added as long as old commands
 * require them to be zero.
 */
#define CHECK_ATTR(CMD) \
	memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
		   sizeof(attr->CMD##_LAST_FIELD), 0, \
		   sizeof(*attr) - \
		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
		   sizeof(attr->CMD##_LAST_FIELD)) != NULL

/* last bpf_attr field consumed by the BPF_MAP_CREATE command */
#define BPF_MAP_CREATE_LAST_FIELD max_entries
| 84 | /* called via syscall */ | ||
| 85 | static int map_create(union bpf_attr *attr) | ||
| 86 | { | ||
| 87 | struct bpf_map *map; | ||
| 88 | int err; | ||
| 89 | |||
| 90 | err = CHECK_ATTR(BPF_MAP_CREATE); | ||
| 91 | if (err) | ||
| 92 | return -EINVAL; | ||
| 93 | |||
| 94 | /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ | ||
| 95 | map = find_and_alloc_map(attr); | ||
| 96 | if (IS_ERR(map)) | ||
| 97 | return PTR_ERR(map); | ||
| 98 | |||
| 99 | atomic_set(&map->refcnt, 1); | ||
| 100 | |||
| 101 | err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); | ||
| 102 | |||
| 103 | if (err < 0) | ||
| 104 | /* failed to allocate fd */ | ||
| 105 | goto free_map; | ||
| 106 | |||
| 107 | return err; | ||
| 108 | |||
| 109 | free_map: | ||
| 110 | map->ops->map_free(map); | ||
| 111 | return err; | ||
| 112 | } | ||
| 113 | |||
/* bpf(2) syscall entry point: validate the user-supplied attribute blob
 * and dispatch on @cmd.
 *
 * @cmd:   BPF_* command; only BPF_MAP_CREATE is handled so far
 * @uattr: user pointer to a union bpf_attr
 * @size:  userspace's idea of sizeof(union bpf_attr); may be smaller
 *         (older userspace) or larger (newer userspace) than ours
 */
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
	union bpf_attr attr = {};
	int err;

	/* the syscall is limited to root temporarily. This restriction will be
	 * lifted when security audit is clean. Note that eBPF+tracing must have
	 * this restriction, since it may pass kernel data to user space
	 */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* cheap sanity probe of the user pointer before reading through it */
	if (!access_ok(VERIFY_READ, uattr, 1))
		return -EFAULT;

	if (size > PAGE_SIZE)	/* silly large */
		return -E2BIG;

	/* If we're handed a bigger struct than we know of,
	 * ensure all the unknown bits are 0 - i.e. new
	 * user-space does not rely on any kernel feature
	 * extensions we don't know about yet.
	 */
	if (size > sizeof(attr)) {
		unsigned char __user *addr;
		unsigned char __user *end;
		unsigned char val;

		/* walk [sizeof(attr), size) byte by byte */
		addr = (void __user *)uattr + sizeof(attr);
		end = (void __user *)uattr + size;

		for (; addr < end; addr++) {
			err = get_user(val, addr);
			if (err)
				return err;
			if (val)
				return -E2BIG;
		}
		/* tail is all zero: safe to truncate to what we understand */
		size = sizeof(attr);
	}

	/* copy attributes from user space, may be less than sizeof(bpf_attr) */
	if (copy_from_user(&attr, uattr, size) != 0)
		return -EFAULT;

	switch (cmd) {
	case BPF_MAP_CREATE:
		err = map_create(&attr);
		break;
	default:
		err = -EINVAL;
		break;
	}

	return err;
}
