diff options
author | Alexei Starovoitov <ast@plumgrid.com> | 2014-09-26 03:16:57 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2014-09-26 15:05:14 -0400 |
commit | 99c55f7d47c0dc6fc64729f37bf435abf43f4c60 (patch) | |
tree | 12f09f26bee9813ae33cfc195582c41e94b2e4e9 /kernel/bpf | |
parent | 4a8e320c929991c9480a7b936512c57ea02d87b2 (diff) |
bpf: introduce BPF syscall and maps
BPF syscall is a multiplexor for a range of different operations on eBPF.
This patch introduces the syscall with a single command, which creates a map.
The next patch adds commands to access maps.
'maps' are generic storage of different types for sharing data between the
kernel and userspace.
Userspace example:
/* this syscall wrapper creates a map with given type and attributes
* and returns map_fd on success.
* use close(map_fd) to delete the map
*
* The designated initializer leaves every other byte of 'attr' zero; the
* kernel side of BPF_MAP_CREATE rejects the request with -EINVAL when any
* byte after max_entries is non-zero.
*
* NOTE(review): bpf() is assumed to be a thin userspace wrapper around the
* new syscall; it is not defined in this patch.
*/
int bpf_create_map(enum bpf_map_type map_type, int key_size,
int value_size, int max_entries)
{
/* only the fields BPF_MAP_CREATE consumes are set explicitly */
union bpf_attr attr = {
.map_type = map_type,
.key_size = key_size,
.value_size = value_size,
.max_entries = max_entries
};
return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}
'union bpf_attr' is backwards compatible with future extensions.
More details in Documentation/networking/filter.txt and in manpage
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'kernel/bpf')
-rw-r--r-- | kernel/bpf/Makefile | 2 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 169 |
2 files changed, 170 insertions, 1 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 6a71145e2769..e9f7334ed07a 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile | |||
@@ -1 +1 @@ | |||
obj-y := core.o | obj-y := core.o syscall.o | ||
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c new file mode 100644 index 000000000000..428a0e23adc0 --- /dev/null +++ b/kernel/bpf/syscall.c | |||
@@ -0,0 +1,169 @@ | |||
1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | * | ||
7 | * This program is distributed in the hope that it will be useful, but | ||
8 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
10 | * General Public License for more details. | ||
11 | */ | ||
12 | #include <linux/bpf.h> | ||
13 | #include <linux/syscalls.h> | ||
14 | #include <linux/slab.h> | ||
15 | #include <linux/anon_inodes.h> | ||
16 | |||
17 | static LIST_HEAD(bpf_map_types); | ||
18 | |||
19 | static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) | ||
20 | { | ||
21 | struct bpf_map_type_list *tl; | ||
22 | struct bpf_map *map; | ||
23 | |||
24 | list_for_each_entry(tl, &bpf_map_types, list_node) { | ||
25 | if (tl->type == attr->map_type) { | ||
26 | map = tl->ops->map_alloc(attr); | ||
27 | if (IS_ERR(map)) | ||
28 | return map; | ||
29 | map->ops = tl->ops; | ||
30 | map->map_type = attr->map_type; | ||
31 | return map; | ||
32 | } | ||
33 | } | ||
34 | return ERR_PTR(-EINVAL); | ||
35 | } | ||
36 | |||
37 | /* boot time registration of different map implementations */ | ||
38 | void bpf_register_map_type(struct bpf_map_type_list *tl) | ||
39 | { | ||
40 | list_add(&tl->list_node, &bpf_map_types); | ||
41 | } | ||
42 | |||
43 | /* called from workqueue */ | ||
44 | static void bpf_map_free_deferred(struct work_struct *work) | ||
45 | { | ||
46 | struct bpf_map *map = container_of(work, struct bpf_map, work); | ||
47 | |||
48 | /* implementation dependent freeing */ | ||
49 | map->ops->map_free(map); | ||
50 | } | ||
51 | |||
52 | /* decrement map refcnt and schedule it for freeing via workqueue | ||
53 | * (unrelying map implementation ops->map_free() might sleep) | ||
54 | */ | ||
55 | void bpf_map_put(struct bpf_map *map) | ||
56 | { | ||
57 | if (atomic_dec_and_test(&map->refcnt)) { | ||
58 | INIT_WORK(&map->work, bpf_map_free_deferred); | ||
59 | schedule_work(&map->work); | ||
60 | } | ||
61 | } | ||
62 | |||
63 | static int bpf_map_release(struct inode *inode, struct file *filp) | ||
64 | { | ||
65 | struct bpf_map *map = filp->private_data; | ||
66 | |||
67 | bpf_map_put(map); | ||
68 | return 0; | ||
69 | } | ||
70 | |||
71 | static const struct file_operations bpf_map_fops = { | ||
72 | .release = bpf_map_release, | ||
73 | }; | ||
74 | |||
/* Helper macro to check that the fields of 'union bpf_attr' not used by
 * command CMD are zero.
 *
 * Requirements at the point of use:
 *  - a 'union bpf_attr *attr' must be in scope;
 *  - CMD##_LAST_FIELD must be defined as the name of the last field that
 *    the command consumes.
 *
 * Scans every byte from the end of that field up to the end of the union
 * and evaluates to true when a non-zero byte is found, i.e. when the
 * attributes have to be rejected.
 *
 * The whole expansion is parenthesized so that it remains a single operand
 * when used inside a larger expression (e.g. !CHECK_ATTR(CMD)).
 */
#define CHECK_ATTR(CMD) \
	(memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
		    sizeof(attr->CMD##_LAST_FIELD), 0, \
		    sizeof(*attr) - \
		    offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
		    sizeof(attr->CMD##_LAST_FIELD)) != NULL)
82 | |||
83 | #define BPF_MAP_CREATE_LAST_FIELD max_entries | ||
84 | /* called via syscall */ | ||
85 | static int map_create(union bpf_attr *attr) | ||
86 | { | ||
87 | struct bpf_map *map; | ||
88 | int err; | ||
89 | |||
90 | err = CHECK_ATTR(BPF_MAP_CREATE); | ||
91 | if (err) | ||
92 | return -EINVAL; | ||
93 | |||
94 | /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ | ||
95 | map = find_and_alloc_map(attr); | ||
96 | if (IS_ERR(map)) | ||
97 | return PTR_ERR(map); | ||
98 | |||
99 | atomic_set(&map->refcnt, 1); | ||
100 | |||
101 | err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); | ||
102 | |||
103 | if (err < 0) | ||
104 | /* failed to allocate fd */ | ||
105 | goto free_map; | ||
106 | |||
107 | return err; | ||
108 | |||
109 | free_map: | ||
110 | map->ops->map_free(map); | ||
111 | return err; | ||
112 | } | ||
113 | |||
114 | SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) | ||
115 | { | ||
116 | union bpf_attr attr = {}; | ||
117 | int err; | ||
118 | |||
119 | /* the syscall is limited to root temporarily. This restriction will be | ||
120 | * lifted when security audit is clean. Note that eBPF+tracing must have | ||
121 | * this restriction, since it may pass kernel data to user space | ||
122 | */ | ||
123 | if (!capable(CAP_SYS_ADMIN)) | ||
124 | return -EPERM; | ||
125 | |||
126 | if (!access_ok(VERIFY_READ, uattr, 1)) | ||
127 | return -EFAULT; | ||
128 | |||
129 | if (size > PAGE_SIZE) /* silly large */ | ||
130 | return -E2BIG; | ||
131 | |||
132 | /* If we're handed a bigger struct than we know of, | ||
133 | * ensure all the unknown bits are 0 - i.e. new | ||
134 | * user-space does not rely on any kernel feature | ||
135 | * extensions we dont know about yet. | ||
136 | */ | ||
137 | if (size > sizeof(attr)) { | ||
138 | unsigned char __user *addr; | ||
139 | unsigned char __user *end; | ||
140 | unsigned char val; | ||
141 | |||
142 | addr = (void __user *)uattr + sizeof(attr); | ||
143 | end = (void __user *)uattr + size; | ||
144 | |||
145 | for (; addr < end; addr++) { | ||
146 | err = get_user(val, addr); | ||
147 | if (err) | ||
148 | return err; | ||
149 | if (val) | ||
150 | return -E2BIG; | ||
151 | } | ||
152 | size = sizeof(attr); | ||
153 | } | ||
154 | |||
155 | /* copy attributes from user space, may be less than sizeof(bpf_attr) */ | ||
156 | if (copy_from_user(&attr, uattr, size) != 0) | ||
157 | return -EFAULT; | ||
158 | |||
159 | switch (cmd) { | ||
160 | case BPF_MAP_CREATE: | ||
161 | err = map_create(&attr); | ||
162 | break; | ||
163 | default: | ||
164 | err = -EINVAL; | ||
165 | break; | ||
166 | } | ||
167 | |||
168 | return err; | ||
169 | } | ||