diff options
author | Alexei Starovoitov <ast@plumgrid.com> | 2014-09-26 03:16:57 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2014-09-26 15:05:14 -0400 |
commit | 99c55f7d47c0dc6fc64729f37bf435abf43f4c60 (patch) | |
tree | 12f09f26bee9813ae33cfc195582c41e94b2e4e9 | |
parent | 4a8e320c929991c9480a7b936512c57ea02d87b2 (diff) |
bpf: introduce BPF syscall and maps
BPF syscall is a multiplexor for a range of different operations on eBPF.
This patch introduces the syscall with a single command to create a map.
The next patch adds commands to access maps.
'maps' is a generic storage of different types for sharing data between kernel
and userspace.
Userspace example:
/* Userspace wrapper around the bpf() syscall: asks the kernel to create
 * a map with the given type and attributes.
 * Returns map_fd on success; use close(map_fd) to delete the map.
 */
int bpf_create_map(enum bpf_map_type map_type, int key_size,
		   int value_size, int max_entries)
{
	union bpf_attr attr = {
		.map_type    = map_type,
		.key_size    = key_size,
		.value_size  = value_size,
		.max_entries = max_entries,
	};
	int map_fd = bpf(BPF_MAP_CREATE, &attr, sizeof(attr));

	return map_fd;
}
'union bpf_attr' is backwards compatible with future extensions.
More details are in Documentation/networking/filter.txt and in the manpage.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | Documentation/networking/filter.txt | 39 | ||||
-rw-r--r-- | include/linux/bpf.h | 41 | ||||
-rw-r--r-- | include/uapi/linux/bpf.h | 23 | ||||
-rw-r--r-- | kernel/bpf/Makefile | 2 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 169 |
5 files changed, 273 insertions, 1 deletions
diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index 014e0319a5c4..4a01d71785e9 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt | |||
@@ -1001,6 +1001,45 @@ instruction that loads 64-bit immediate value into a dst_reg. | |||
1001 | Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads | 1001 | Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads |
1002 | 32-bit immediate value into a register. | 1002 | 32-bit immediate value into a register. |
1003 | 1003 | ||
1004 | eBPF maps | ||
1005 | --------- | ||
1006 | 'maps' is a generic storage of different types for sharing data between kernel | ||
1007 | and userspace. | ||
1008 | |||
1009 | The maps are accessed from user space via BPF syscall, which has commands: | ||
1010 | - create a map with given type and attributes | ||
1011 | map_fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size) | ||
1012 | using attr->map_type, attr->key_size, attr->value_size, attr->max_entries | ||
1013 | returns process-local file descriptor or negative error | ||
1014 | |||
1015 | - lookup key in a given map | ||
1016 | err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size) | ||
1017 | using attr->map_fd, attr->key, attr->value | ||
1018 | returns zero and stores found elem into value or negative error | ||
1019 | |||
1020 | - create or update key/value pair in a given map | ||
1021 | err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size) | ||
1022 | using attr->map_fd, attr->key, attr->value | ||
1023 | returns zero or negative error | ||
1024 | |||
1025 | - find and delete element by key in a given map | ||
1026 | err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size) | ||
1027 | using attr->map_fd, attr->key | ||
1028 | |||
1029 | - to delete map: close(fd) | ||
1030 | Exiting process will delete maps automatically | ||
1031 | |||
1032 | userspace programs use this syscall to create/access maps that eBPF programs | ||
1033 | are concurrently updating. | ||
1034 | |||
1035 | maps can have different types: hash, array, bloom filter, radix-tree, etc. | ||
1036 | |||
1037 | The map is defined by: | ||
1038 | . type | ||
1039 | . max number of elements | ||
1040 | . key size in bytes | ||
1041 | . value size in bytes | ||
1042 | |||
1004 | Testing | 1043 | Testing |
1005 | ------- | 1044 | ------- |
1006 | 1045 | ||
diff --git a/include/linux/bpf.h b/include/linux/bpf.h new file mode 100644 index 000000000000..48014a71f0fe --- /dev/null +++ b/include/linux/bpf.h | |||
@@ -0,0 +1,41 @@ | |||
1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #ifndef _LINUX_BPF_H | ||
8 | #define _LINUX_BPF_H 1 | ||
9 | |||
10 | #include <uapi/linux/bpf.h> | ||
11 | #include <linux/workqueue.h> | ||
12 | |||
13 | struct bpf_map; | ||
14 | |||
/* Per-map-type operations.
 * A map is generic key/value storage, optionally accessible by eBPF programs.
 */
struct bpf_map_ops {
	/* callbacks reachable from userspace (via syscall) */

	/* allocate a map described by 'attr'; callers check the result with IS_ERR() */
	struct bpf_map *(*map_alloc)(union bpf_attr *attr);

	/* implementation-specific release of a map */
	void (*map_free)(struct bpf_map *);
};
21 | |||
22 | struct bpf_map { | ||
23 | atomic_t refcnt; | ||
24 | enum bpf_map_type map_type; | ||
25 | u32 key_size; | ||
26 | u32 value_size; | ||
27 | u32 max_entries; | ||
28 | struct bpf_map_ops *ops; | ||
29 | struct work_struct work; | ||
30 | }; | ||
31 | |||
32 | struct bpf_map_type_list { | ||
33 | struct list_head list_node; | ||
34 | struct bpf_map_ops *ops; | ||
35 | enum bpf_map_type type; | ||
36 | }; | ||
37 | |||
/* add 'tl' to the list of known map implementations (boot time) */
void bpf_register_map_type(struct bpf_map_type_list *tl);

/* drop one reference; the map is freed once the last reference is gone */
void bpf_map_put(struct bpf_map *map);
40 | |||
41 | #endif /* _LINUX_BPF_H */ | ||
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 479ed0b6be16..f58a10f9670c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h | |||
@@ -62,4 +62,27 @@ struct bpf_insn { | |||
62 | __s32 imm; /* signed immediate constant */ | 62 | __s32 imm; /* signed immediate constant */ |
63 | }; | 63 | }; |
64 | 64 | ||
/* Commands accepted by the bpf() syscall */
enum bpf_cmd {
	/* Create a map with the given type and attributes:
	 *   fd = bpf(BPF_MAP_CREATE, union bpf_attr *, u32 size)
	 * Returns an fd or a negative error.
	 * The map is deleted when the fd is closed.
	 */
	BPF_MAP_CREATE,
};
74 | |||
/* Map types; the value is passed in attr->map_type */
enum bpf_map_type {
	BPF_MAP_TYPE_UNSPEC,
};
78 | |||
79 | union bpf_attr { | ||
80 | struct { /* anonymous struct used by BPF_MAP_CREATE command */ | ||
81 | __u32 map_type; /* one of enum bpf_map_type */ | ||
82 | __u32 key_size; /* size of key in bytes */ | ||
83 | __u32 value_size; /* size of value in bytes */ | ||
84 | __u32 max_entries; /* max number of entries in a map */ | ||
85 | }; | ||
86 | } __attribute__((aligned(8))); | ||
87 | |||
65 | #endif /* _UAPI__LINUX_BPF_H__ */ | 88 | #endif /* _UAPI__LINUX_BPF_H__ */ |
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 6a71145e2769..e9f7334ed07a 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile | |||
@@ -1 +1 @@ | |||
obj-y := core.o | obj-y := core.o syscall.o | ||
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c new file mode 100644 index 000000000000..428a0e23adc0 --- /dev/null +++ b/kernel/bpf/syscall.c | |||
@@ -0,0 +1,169 @@ | |||
1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | * | ||
7 | * This program is distributed in the hope that it will be useful, but | ||
8 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
10 | * General Public License for more details. | ||
11 | */ | ||
12 | #include <linux/bpf.h> | ||
13 | #include <linux/syscalls.h> | ||
14 | #include <linux/slab.h> | ||
15 | #include <linux/anon_inodes.h> | ||
16 | |||
/* all map implementations added through bpf_register_map_type() */
static LIST_HEAD(bpf_map_types);
18 | |||
19 | static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) | ||
20 | { | ||
21 | struct bpf_map_type_list *tl; | ||
22 | struct bpf_map *map; | ||
23 | |||
24 | list_for_each_entry(tl, &bpf_map_types, list_node) { | ||
25 | if (tl->type == attr->map_type) { | ||
26 | map = tl->ops->map_alloc(attr); | ||
27 | if (IS_ERR(map)) | ||
28 | return map; | ||
29 | map->ops = tl->ops; | ||
30 | map->map_type = attr->map_type; | ||
31 | return map; | ||
32 | } | ||
33 | } | ||
34 | return ERR_PTR(-EINVAL); | ||
35 | } | ||
36 | |||
37 | /* boot time registration of different map implementations */ | ||
38 | void bpf_register_map_type(struct bpf_map_type_list *tl) | ||
39 | { | ||
40 | list_add(&tl->list_node, &bpf_map_types); | ||
41 | } | ||
42 | |||
43 | /* called from workqueue */ | ||
44 | static void bpf_map_free_deferred(struct work_struct *work) | ||
45 | { | ||
46 | struct bpf_map *map = container_of(work, struct bpf_map, work); | ||
47 | |||
48 | /* implementation dependent freeing */ | ||
49 | map->ops->map_free(map); | ||
50 | } | ||
51 | |||
52 | /* decrement map refcnt and schedule it for freeing via workqueue | ||
53 | * (unrelying map implementation ops->map_free() might sleep) | ||
54 | */ | ||
55 | void bpf_map_put(struct bpf_map *map) | ||
56 | { | ||
57 | if (atomic_dec_and_test(&map->refcnt)) { | ||
58 | INIT_WORK(&map->work, bpf_map_free_deferred); | ||
59 | schedule_work(&map->work); | ||
60 | } | ||
61 | } | ||
62 | |||
63 | static int bpf_map_release(struct inode *inode, struct file *filp) | ||
64 | { | ||
65 | struct bpf_map *map = filp->private_data; | ||
66 | |||
67 | bpf_map_put(map); | ||
68 | return 0; | ||
69 | } | ||
70 | |||
71 | static const struct file_operations bpf_map_fops = { | ||
72 | .release = bpf_map_release, | ||
73 | }; | ||
74 | |||
/* Helper macro: evaluates to true when any byte of 'union bpf_attr' located
 * after CMD's last used field (CMD##_LAST_FIELD) is non-zero, i.e. when
 * fields the command does not use were filled in.
 *
 * Expects a 'union bpf_attr *attr' to be in scope at the point of use.
 * The expansion is wrapped in parentheses so the result keeps its meaning
 * when combined with other operators.
 */
#define CHECK_ATTR(CMD) \
	(memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
		    sizeof(attr->CMD##_LAST_FIELD), 0, \
		    sizeof(*attr) - \
		    offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
		    sizeof(attr->CMD##_LAST_FIELD)) != NULL)
82 | |||
83 | #define BPF_MAP_CREATE_LAST_FIELD max_entries | ||
84 | /* called via syscall */ | ||
85 | static int map_create(union bpf_attr *attr) | ||
86 | { | ||
87 | struct bpf_map *map; | ||
88 | int err; | ||
89 | |||
90 | err = CHECK_ATTR(BPF_MAP_CREATE); | ||
91 | if (err) | ||
92 | return -EINVAL; | ||
93 | |||
94 | /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ | ||
95 | map = find_and_alloc_map(attr); | ||
96 | if (IS_ERR(map)) | ||
97 | return PTR_ERR(map); | ||
98 | |||
99 | atomic_set(&map->refcnt, 1); | ||
100 | |||
101 | err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); | ||
102 | |||
103 | if (err < 0) | ||
104 | /* failed to allocate fd */ | ||
105 | goto free_map; | ||
106 | |||
107 | return err; | ||
108 | |||
109 | free_map: | ||
110 | map->ops->map_free(map); | ||
111 | return err; | ||
112 | } | ||
113 | |||
114 | SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) | ||
115 | { | ||
116 | union bpf_attr attr = {}; | ||
117 | int err; | ||
118 | |||
119 | /* the syscall is limited to root temporarily. This restriction will be | ||
120 | * lifted when security audit is clean. Note that eBPF+tracing must have | ||
121 | * this restriction, since it may pass kernel data to user space | ||
122 | */ | ||
123 | if (!capable(CAP_SYS_ADMIN)) | ||
124 | return -EPERM; | ||
125 | |||
126 | if (!access_ok(VERIFY_READ, uattr, 1)) | ||
127 | return -EFAULT; | ||
128 | |||
129 | if (size > PAGE_SIZE) /* silly large */ | ||
130 | return -E2BIG; | ||
131 | |||
132 | /* If we're handed a bigger struct than we know of, | ||
133 | * ensure all the unknown bits are 0 - i.e. new | ||
134 | * user-space does not rely on any kernel feature | ||
135 | * extensions we dont know about yet. | ||
136 | */ | ||
137 | if (size > sizeof(attr)) { | ||
138 | unsigned char __user *addr; | ||
139 | unsigned char __user *end; | ||
140 | unsigned char val; | ||
141 | |||
142 | addr = (void __user *)uattr + sizeof(attr); | ||
143 | end = (void __user *)uattr + size; | ||
144 | |||
145 | for (; addr < end; addr++) { | ||
146 | err = get_user(val, addr); | ||
147 | if (err) | ||
148 | return err; | ||
149 | if (val) | ||
150 | return -E2BIG; | ||
151 | } | ||
152 | size = sizeof(attr); | ||
153 | } | ||
154 | |||
155 | /* copy attributes from user space, may be less than sizeof(bpf_attr) */ | ||
156 | if (copy_from_user(&attr, uattr, size) != 0) | ||
157 | return -EFAULT; | ||
158 | |||
159 | switch (cmd) { | ||
160 | case BPF_MAP_CREATE: | ||
161 | err = map_create(&attr); | ||
162 | break; | ||
163 | default: | ||
164 | err = -EINVAL; | ||
165 | break; | ||
166 | } | ||
167 | |||
168 | return err; | ||
169 | } | ||