aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Alexei Starovoitov <ast@plumgrid.com> 2014-09-26 03:16:57 -0400
committer David S. Miller <davem@davemloft.net> 2014-09-26 15:05:14 -0400
commit 99c55f7d47c0dc6fc64729f37bf435abf43f4c60 (patch)
tree 12f09f26bee9813ae33cfc195582c41e94b2e4e9
parent 4a8e320c929991c9480a7b936512c57ea02d87b2 (diff)
bpf: introduce BPF syscall and maps
BPF syscall is a multiplexor for a range of different operations on eBPF.
This patch introduces syscall with single command to create a map.
Next patch adds commands to access maps.
'maps' is a generic storage of different types for sharing data between kernel
and userspace.

Userspace example:
/* this syscall wrapper creates a map with given type and attributes
 * and returns map_fd on success.
 * use close(map_fd) to delete the map
 */
int bpf_create_map(enum bpf_map_type map_type, int key_size,
		   int value_size, int max_entries)
{
	union bpf_attr attr = {
		.map_type = map_type,
		.key_size = key_size,
		.value_size = value_size,
		.max_entries = max_entries
	};

	return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}

'union bpf_attr' is backwards compatible with future extensions.

More details in Documentation/networking/filter.txt and in manpage

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- Documentation/networking/filter.txt 39
-rw-r--r-- include/linux/bpf.h 41
-rw-r--r-- include/uapi/linux/bpf.h 23
-rw-r--r-- kernel/bpf/Makefile 2
-rw-r--r-- kernel/bpf/syscall.c 169
5 files changed, 273 insertions, 1 deletions
diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index 014e0319a5c4..4a01d71785e9 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -1001,6 +1001,45 @@ instruction that loads 64-bit immediate value into a dst_reg.
1001Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads 1001Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads
100232-bit immediate value into a register. 100232-bit immediate value into a register.
1003 1003
1004eBPF maps
1005---------
1006'maps' is a generic storage of different types for sharing data between kernel
1007and userspace.
1008
1009The maps are accessed from user space via BPF syscall, which has commands:
1010- create a map with given type and attributes
1011 map_fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
1012 using attr->map_type, attr->key_size, attr->value_size, attr->max_entries
1013 returns process-local file descriptor or negative error
1014
1015- lookup key in a given map
1016 err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
1017 using attr->map_fd, attr->key, attr->value
1018 returns zero and stores found elem into value or negative error
1019
1020- create or update key/value pair in a given map
1021 err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
1022 using attr->map_fd, attr->key, attr->value
1023 returns zero or negative error
1024
1025- find and delete element by key in a given map
1026 err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
1027 using attr->map_fd, attr->key
1028
1029- to delete map: close(fd)
1030 Exiting process will delete maps automatically
1031
1032userspace programs use this syscall to create/access maps that eBPF programs
1033are concurrently updating.
1034
1035maps can have different types: hash, array, bloom filter, radix-tree, etc.
1036
1037The map is defined by:
1038 . type
1039 . max number of elements
1040 . key size in bytes
1041 . value size in bytes
1042
1004Testing 1043Testing
1005------- 1044-------
1006 1045
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
new file mode 100644
index 000000000000..48014a71f0fe
--- /dev/null
+++ b/include/linux/bpf.h
@@ -0,0 +1,41 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#ifndef _LINUX_BPF_H
8#define _LINUX_BPF_H 1
9
10#include <uapi/linux/bpf.h>
11#include <linux/workqueue.h>
12
13struct bpf_map;
14
/* map is generic key/value storage optionally accessible by eBPF programs
 *
 * struct bpf_map_ops is the table of callbacks a map implementation provides.
 */
struct bpf_map_ops {
	/* funcs callable from userspace (via syscall) */

	/* allocate a map described by @attr; the syscall code checks the
	 * returned pointer with IS_ERR()
	 */
	struct bpf_map *(*map_alloc)(union bpf_attr *attr);

	/* implementation dependent freeing of the given map */
	void (*map_free)(struct bpf_map *);
};
21
22struct bpf_map {
23 atomic_t refcnt;
24 enum bpf_map_type map_type;
25 u32 key_size;
26 u32 value_size;
27 u32 max_entries;
28 struct bpf_map_ops *ops;
29 struct work_struct work;
30};
31
32struct bpf_map_type_list {
33 struct list_head list_node;
34 struct bpf_map_ops *ops;
35 enum bpf_map_type type;
36};
37
/* add @tl to the list of known map implementations */
void bpf_register_map_type(struct bpf_map_type_list *tl);
/* drop one reference to @map; the final put schedules the map for freeing */
void bpf_map_put(struct bpf_map *map);
40
41#endif /* _LINUX_BPF_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 479ed0b6be16..f58a10f9670c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -62,4 +62,27 @@ struct bpf_insn {
62 __s32 imm; /* signed immediate constant */ 62 __s32 imm; /* signed immediate constant */
63}; 63};
64 64
/* BPF syscall commands: the 'cmd' argument of bpf() selects one of these */
enum bpf_cmd {
	/* create a map with given type and attributes
	 * fd = bpf(BPF_MAP_CREATE, union bpf_attr *, u32 size)
	 * returns fd or negative error
	 * map is deleted when fd is closed
	 */
	BPF_MAP_CREATE,
};
74
/* map type ids, passed in attr->map_type when creating a map */
enum bpf_map_type {
	BPF_MAP_TYPE_UNSPEC,
};
78
79union bpf_attr {
80 struct { /* anonymous struct used by BPF_MAP_CREATE command */
81 __u32 map_type; /* one of enum bpf_map_type */
82 __u32 key_size; /* size of key in bytes */
83 __u32 value_size; /* size of value in bytes */
84 __u32 max_entries; /* max number of entries in a map */
85 };
86} __attribute__((aligned(8)));
87
65#endif /* _UAPI__LINUX_BPF_H__ */ 88#endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 6a71145e2769..e9f7334ed07a 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1 +1 @@
obj-y := core.o obj-y := core.o syscall.o
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
new file mode 100644
index 000000000000..428a0e23adc0
--- /dev/null
+++ b/kernel/bpf/syscall.c
@@ -0,0 +1,169 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12#include <linux/bpf.h>
13#include <linux/syscalls.h>
14#include <linux/slab.h>
15#include <linux/anon_inodes.h>
16
/* list of registered map implementations (struct bpf_map_type_list) */
static LIST_HEAD(bpf_map_types);
18
19static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
20{
21 struct bpf_map_type_list *tl;
22 struct bpf_map *map;
23
24 list_for_each_entry(tl, &bpf_map_types, list_node) {
25 if (tl->type == attr->map_type) {
26 map = tl->ops->map_alloc(attr);
27 if (IS_ERR(map))
28 return map;
29 map->ops = tl->ops;
30 map->map_type = attr->map_type;
31 return map;
32 }
33 }
34 return ERR_PTR(-EINVAL);
35}
36
37/* boot time registration of different map implementations */
38void bpf_register_map_type(struct bpf_map_type_list *tl)
39{
40 list_add(&tl->list_node, &bpf_map_types);
41}
42
43/* called from workqueue */
44static void bpf_map_free_deferred(struct work_struct *work)
45{
46 struct bpf_map *map = container_of(work, struct bpf_map, work);
47
48 /* implementation dependent freeing */
49 map->ops->map_free(map);
50}
51
52/* decrement map refcnt and schedule it for freeing via workqueue
53 * (unrelying map implementation ops->map_free() might sleep)
54 */
55void bpf_map_put(struct bpf_map *map)
56{
57 if (atomic_dec_and_test(&map->refcnt)) {
58 INIT_WORK(&map->work, bpf_map_free_deferred);
59 schedule_work(&map->work);
60 }
61}
62
63static int bpf_map_release(struct inode *inode, struct file *filp)
64{
65 struct bpf_map *map = filp->private_data;
66
67 bpf_map_put(map);
68 return 0;
69}
70
71static const struct file_operations bpf_map_fops = {
72 .release = bpf_map_release,
73};
74
/* helper macro to check that unused fields 'union bpf_attr' are zero
 *
 * Expects a 'union bpf_attr *attr' in the calling scope and a
 * CMD##_LAST_FIELD macro naming the last field the command uses.
 * Evaluates to non-zero when any byte of *attr after that field is non-zero.
 * The expansion is parenthesized so it can be negated or combined safely.
 */
#define CHECK_ATTR(CMD) \
	(memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
		    sizeof(attr->CMD##_LAST_FIELD), 0, \
		    sizeof(*attr) - \
		    offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
		    sizeof(attr->CMD##_LAST_FIELD)) != NULL)
83#define BPF_MAP_CREATE_LAST_FIELD max_entries
84/* called via syscall */
85static int map_create(union bpf_attr *attr)
86{
87 struct bpf_map *map;
88 int err;
89
90 err = CHECK_ATTR(BPF_MAP_CREATE);
91 if (err)
92 return -EINVAL;
93
94 /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
95 map = find_and_alloc_map(attr);
96 if (IS_ERR(map))
97 return PTR_ERR(map);
98
99 atomic_set(&map->refcnt, 1);
100
101 err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
102
103 if (err < 0)
104 /* failed to allocate fd */
105 goto free_map;
106
107 return err;
108
109free_map:
110 map->ops->map_free(map);
111 return err;
112}
113
114SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
115{
116 union bpf_attr attr = {};
117 int err;
118
119 /* the syscall is limited to root temporarily. This restriction will be
120 * lifted when security audit is clean. Note that eBPF+tracing must have
121 * this restriction, since it may pass kernel data to user space
122 */
123 if (!capable(CAP_SYS_ADMIN))
124 return -EPERM;
125
126 if (!access_ok(VERIFY_READ, uattr, 1))
127 return -EFAULT;
128
129 if (size > PAGE_SIZE) /* silly large */
130 return -E2BIG;
131
132 /* If we're handed a bigger struct than we know of,
133 * ensure all the unknown bits are 0 - i.e. new
134 * user-space does not rely on any kernel feature
135 * extensions we dont know about yet.
136 */
137 if (size > sizeof(attr)) {
138 unsigned char __user *addr;
139 unsigned char __user *end;
140 unsigned char val;
141
142 addr = (void __user *)uattr + sizeof(attr);
143 end = (void __user *)uattr + size;
144
145 for (; addr < end; addr++) {
146 err = get_user(val, addr);
147 if (err)
148 return err;
149 if (val)
150 return -E2BIG;
151 }
152 size = sizeof(attr);
153 }
154
155 /* copy attributes from user space, may be less than sizeof(bpf_attr) */
156 if (copy_from_user(&attr, uattr, size) != 0)
157 return -EFAULT;
158
159 switch (cmd) {
160 case BPF_MAP_CREATE:
161 err = map_create(&attr);
162 break;
163 default:
164 err = -EINVAL;
165 break;
166 }
167
168 return err;
169}