diff options
| author | Alexei Starovoitov <ast@plumgrid.com> | 2014-09-26 03:16:59 -0400 |
|---|---|---|
| committer | David S. Miller <davem@davemloft.net> | 2014-09-26 15:05:14 -0400 |
| commit | db20fd2b01087bdfbe30bce314a198eefedcc42e (patch) | |
| tree | c43a4e3edb46deae542c8e79d74d24703ffbdcca | |
| parent | 749730ce42a2121e1c88350d69478bff3994b10a (diff) | |
bpf: add lookup/update/delete/iterate methods to BPF maps
'maps' is a generic storage of different types for sharing data between kernel
and userspace.
The maps are accessed from user space via BPF syscall, which has commands:
- create a map with given type and attributes
fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
returns fd or negative error
- lookup key in a given map referenced by fd
err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->value
returns zero and stores found elem into value or negative error
- create or update key/value pair in a given map
err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->value
returns zero or negative error
- find and delete element by key in a given map
err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key
- iterate map elements (based on input key return next_key)
err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->next_key
- close(fd) deletes the map
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
| -rw-r--r-- | include/linux/bpf.h | 8 | ||||
| -rw-r--r-- | include/uapi/linux/bpf.h | 38 | ||||
| -rw-r--r-- | kernel/bpf/syscall.c | 235 |
3 files changed, 281 insertions, 0 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 48014a71f0fe..2887f3f9da59 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h | |||
| @@ -9,6 +9,7 @@ | |||
| 9 | 9 | ||
| 10 | #include <uapi/linux/bpf.h> | 10 | #include <uapi/linux/bpf.h> |
| 11 | #include <linux/workqueue.h> | 11 | #include <linux/workqueue.h> |
| 12 | #include <linux/file.h> | ||
| 12 | 13 | ||
| 13 | struct bpf_map; | 14 | struct bpf_map; |
| 14 | 15 | ||
| @@ -17,6 +18,12 @@ struct bpf_map_ops { | |||
| 17 | /* funcs callable from userspace (via syscall) */ | 18 | /* funcs callable from userspace (via syscall) */ |
| 18 | struct bpf_map *(*map_alloc)(union bpf_attr *attr); | 19 | struct bpf_map *(*map_alloc)(union bpf_attr *attr); |
| 19 | void (*map_free)(struct bpf_map *); | 20 | void (*map_free)(struct bpf_map *); |
| 21 | int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key); | ||
| 22 | |||
| 23 | /* funcs callable from userspace and from eBPF programs */ | ||
| 24 | void *(*map_lookup_elem)(struct bpf_map *map, void *key); | ||
| 25 | int (*map_update_elem)(struct bpf_map *map, void *key, void *value); | ||
| 26 | int (*map_delete_elem)(struct bpf_map *map, void *key); | ||
| 20 | }; | 27 | }; |
| 21 | 28 | ||
| 22 | struct bpf_map { | 29 | struct bpf_map { |
| @@ -37,5 +44,6 @@ struct bpf_map_type_list { | |||
| 37 | 44 | ||
| 38 | void bpf_register_map_type(struct bpf_map_type_list *tl); | 45 | void bpf_register_map_type(struct bpf_map_type_list *tl); |
| 39 | void bpf_map_put(struct bpf_map *map); | 46 | void bpf_map_put(struct bpf_map *map); |
| 47 | struct bpf_map *bpf_map_get(struct fd f); | ||
| 40 | 48 | ||
| 41 | #endif /* _LINUX_BPF_H */ | 49 | #endif /* _LINUX_BPF_H */ |
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f58a10f9670c..395cabd2ca0a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h | |||
| @@ -70,6 +70,35 @@ enum bpf_cmd { | |||
| 70 | * map is deleted when fd is closed | 70 | * map is deleted when fd is closed |
| 71 | */ | 71 | */ |
| 72 | BPF_MAP_CREATE, | 72 | BPF_MAP_CREATE, |
| 73 | |||
| 74 | /* lookup key in a given map | ||
| 75 | * err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size) | ||
| 76 | * Using attr->map_fd, attr->key, attr->value | ||
| 77 | * returns zero and stores found elem into value | ||
| 78 | * or negative error | ||
| 79 | */ | ||
| 80 | BPF_MAP_LOOKUP_ELEM, | ||
| 81 | |||
| 82 | /* create or update key/value pair in a given map | ||
| 83 | * err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size) | ||
| 84 | * Using attr->map_fd, attr->key, attr->value | ||
| 85 | * returns zero or negative error | ||
| 86 | */ | ||
| 87 | BPF_MAP_UPDATE_ELEM, | ||
| 88 | |||
| 89 | /* find and delete elem by key in a given map | ||
| 90 | * err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size) | ||
| 91 | * Using attr->map_fd, attr->key | ||
| 92 | * returns zero or negative error | ||
| 93 | */ | ||
| 94 | BPF_MAP_DELETE_ELEM, | ||
| 95 | |||
| 96 | /* lookup key in a given map and return next key | ||
| 97 | * err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size) | ||
| 98 | * Using attr->map_fd, attr->key, attr->next_key | ||
| 99 | * returns zero and stores next key or negative error | ||
| 100 | */ | ||
| 101 | BPF_MAP_GET_NEXT_KEY, | ||
| 73 | }; | 102 | }; |
| 74 | 103 | ||
| 75 | enum bpf_map_type { | 104 | enum bpf_map_type { |
| @@ -83,6 +112,15 @@ union bpf_attr { | |||
| 83 | __u32 value_size; /* size of value in bytes */ | 112 | __u32 value_size; /* size of value in bytes */ |
| 84 | __u32 max_entries; /* max number of entries in a map */ | 113 | __u32 max_entries; /* max number of entries in a map */ |
| 85 | }; | 114 | }; |
| 115 | |||
| 116 | struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ | ||
| 117 | __u32 map_fd; | ||
| 118 | __aligned_u64 key; | ||
| 119 | union { | ||
| 120 | __aligned_u64 value; | ||
| 121 | __aligned_u64 next_key; | ||
| 122 | }; | ||
| 123 | }; | ||
| 86 | } __attribute__((aligned(8))); | 124 | } __attribute__((aligned(8))); |
| 87 | 125 | ||
| 88 | #endif /* _UAPI__LINUX_BPF_H__ */ | 126 | #endif /* _UAPI__LINUX_BPF_H__ */ |
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 428a0e23adc0..f94349ecaf61 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c | |||
| @@ -13,6 +13,7 @@ | |||
| 13 | #include <linux/syscalls.h> | 13 | #include <linux/syscalls.h> |
| 14 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
| 15 | #include <linux/anon_inodes.h> | 15 | #include <linux/anon_inodes.h> |
| 16 | #include <linux/file.h> | ||
| 16 | 17 | ||
| 17 | static LIST_HEAD(bpf_map_types); | 18 | static LIST_HEAD(bpf_map_types); |
| 18 | 19 | ||
| @@ -111,6 +112,228 @@ free_map: | |||
| 111 | return err; | 112 | return err; |
| 112 | } | 113 | } |
| 113 | 114 | ||
| 115 | /* if error is returned, fd is released. | ||
| 116 | * On success caller should complete fd access with matching fdput() | ||
| 117 | */ | ||
| 118 | struct bpf_map *bpf_map_get(struct fd f) | ||
| 119 | { | ||
| 120 | struct bpf_map *map; | ||
| 121 | |||
| 122 | if (!f.file) | ||
| 123 | return ERR_PTR(-EBADF); | ||
| 124 | |||
| 125 | if (f.file->f_op != &bpf_map_fops) { | ||
| 126 | fdput(f); | ||
| 127 | return ERR_PTR(-EINVAL); | ||
| 128 | } | ||
| 129 | |||
| 130 | map = f.file->private_data; | ||
| 131 | |||
| 132 | return map; | ||
| 133 | } | ||
| 134 | |||
| 135 | /* helper to convert user pointers passed inside __aligned_u64 fields */ | ||
| 136 | static void __user *u64_to_ptr(__u64 val) | ||
| 137 | { | ||
| 138 | return (void __user *) (unsigned long) val; | ||
| 139 | } | ||
| 140 | |||
| 141 | /* last field in 'union bpf_attr' used by this command */ | ||
| 142 | #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value | ||
| 143 | |||
| 144 | static int map_lookup_elem(union bpf_attr *attr) | ||
| 145 | { | ||
| 146 | void __user *ukey = u64_to_ptr(attr->key); | ||
| 147 | void __user *uvalue = u64_to_ptr(attr->value); | ||
| 148 | int ufd = attr->map_fd; | ||
| 149 | struct fd f = fdget(ufd); | ||
| 150 | struct bpf_map *map; | ||
| 151 | void *key, *value; | ||
| 152 | int err; | ||
| 153 | |||
| 154 | if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) | ||
| 155 | return -EINVAL; | ||
| 156 | |||
| 157 | map = bpf_map_get(f); | ||
| 158 | if (IS_ERR(map)) | ||
| 159 | return PTR_ERR(map); | ||
| 160 | |||
| 161 | err = -ENOMEM; | ||
| 162 | key = kmalloc(map->key_size, GFP_USER); | ||
| 163 | if (!key) | ||
| 164 | goto err_put; | ||
| 165 | |||
| 166 | err = -EFAULT; | ||
| 167 | if (copy_from_user(key, ukey, map->key_size) != 0) | ||
| 168 | goto free_key; | ||
| 169 | |||
| 170 | err = -ESRCH; | ||
| 171 | rcu_read_lock(); | ||
| 172 | value = map->ops->map_lookup_elem(map, key); | ||
| 173 | if (!value) | ||
| 174 | goto err_unlock; | ||
| 175 | |||
| 176 | err = -EFAULT; | ||
| 177 | if (copy_to_user(uvalue, value, map->value_size) != 0) | ||
| 178 | goto err_unlock; | ||
| 179 | |||
| 180 | err = 0; | ||
| 181 | |||
| 182 | err_unlock: | ||
| 183 | rcu_read_unlock(); | ||
| 184 | free_key: | ||
| 185 | kfree(key); | ||
| 186 | err_put: | ||
| 187 | fdput(f); | ||
| 188 | return err; | ||
| 189 | } | ||
| 190 | |||
| 191 | #define BPF_MAP_UPDATE_ELEM_LAST_FIELD value | ||
| 192 | |||
| 193 | static int map_update_elem(union bpf_attr *attr) | ||
| 194 | { | ||
| 195 | void __user *ukey = u64_to_ptr(attr->key); | ||
| 196 | void __user *uvalue = u64_to_ptr(attr->value); | ||
| 197 | int ufd = attr->map_fd; | ||
| 198 | struct fd f = fdget(ufd); | ||
| 199 | struct bpf_map *map; | ||
| 200 | void *key, *value; | ||
| 201 | int err; | ||
| 202 | |||
| 203 | if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) | ||
| 204 | return -EINVAL; | ||
| 205 | |||
| 206 | map = bpf_map_get(f); | ||
| 207 | if (IS_ERR(map)) | ||
| 208 | return PTR_ERR(map); | ||
| 209 | |||
| 210 | err = -ENOMEM; | ||
| 211 | key = kmalloc(map->key_size, GFP_USER); | ||
| 212 | if (!key) | ||
| 213 | goto err_put; | ||
| 214 | |||
| 215 | err = -EFAULT; | ||
| 216 | if (copy_from_user(key, ukey, map->key_size) != 0) | ||
| 217 | goto free_key; | ||
| 218 | |||
| 219 | err = -ENOMEM; | ||
| 220 | value = kmalloc(map->value_size, GFP_USER); | ||
| 221 | if (!value) | ||
| 222 | goto free_key; | ||
| 223 | |||
| 224 | err = -EFAULT; | ||
| 225 | if (copy_from_user(value, uvalue, map->value_size) != 0) | ||
| 226 | goto free_value; | ||
| 227 | |||
| 228 | /* eBPF program that use maps are running under rcu_read_lock(), | ||
| 229 | * therefore all map accessors rely on this fact, so do the same here | ||
| 230 | */ | ||
| 231 | rcu_read_lock(); | ||
| 232 | err = map->ops->map_update_elem(map, key, value); | ||
| 233 | rcu_read_unlock(); | ||
| 234 | |||
| 235 | free_value: | ||
| 236 | kfree(value); | ||
| 237 | free_key: | ||
| 238 | kfree(key); | ||
| 239 | err_put: | ||
| 240 | fdput(f); | ||
| 241 | return err; | ||
| 242 | } | ||
| 243 | |||
| 244 | #define BPF_MAP_DELETE_ELEM_LAST_FIELD key | ||
| 245 | |||
| 246 | static int map_delete_elem(union bpf_attr *attr) | ||
| 247 | { | ||
| 248 | void __user *ukey = u64_to_ptr(attr->key); | ||
| 249 | int ufd = attr->map_fd; | ||
| 250 | struct fd f = fdget(ufd); | ||
| 251 | struct bpf_map *map; | ||
| 252 | void *key; | ||
| 253 | int err; | ||
| 254 | |||
| 255 | if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) | ||
| 256 | return -EINVAL; | ||
| 257 | |||
| 258 | map = bpf_map_get(f); | ||
| 259 | if (IS_ERR(map)) | ||
| 260 | return PTR_ERR(map); | ||
| 261 | |||
| 262 | err = -ENOMEM; | ||
| 263 | key = kmalloc(map->key_size, GFP_USER); | ||
| 264 | if (!key) | ||
| 265 | goto err_put; | ||
| 266 | |||
| 267 | err = -EFAULT; | ||
| 268 | if (copy_from_user(key, ukey, map->key_size) != 0) | ||
| 269 | goto free_key; | ||
| 270 | |||
| 271 | rcu_read_lock(); | ||
| 272 | err = map->ops->map_delete_elem(map, key); | ||
| 273 | rcu_read_unlock(); | ||
| 274 | |||
| 275 | free_key: | ||
| 276 | kfree(key); | ||
| 277 | err_put: | ||
| 278 | fdput(f); | ||
| 279 | return err; | ||
| 280 | } | ||
| 281 | |||
| 282 | /* last field in 'union bpf_attr' used by this command */ | ||
| 283 | #define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key | ||
| 284 | |||
| 285 | static int map_get_next_key(union bpf_attr *attr) | ||
| 286 | { | ||
| 287 | void __user *ukey = u64_to_ptr(attr->key); | ||
| 288 | void __user *unext_key = u64_to_ptr(attr->next_key); | ||
| 289 | int ufd = attr->map_fd; | ||
| 290 | struct fd f = fdget(ufd); | ||
| 291 | struct bpf_map *map; | ||
| 292 | void *key, *next_key; | ||
| 293 | int err; | ||
| 294 | |||
| 295 | if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) | ||
| 296 | return -EINVAL; | ||
| 297 | |||
| 298 | map = bpf_map_get(f); | ||
| 299 | if (IS_ERR(map)) | ||
| 300 | return PTR_ERR(map); | ||
| 301 | |||
| 302 | err = -ENOMEM; | ||
| 303 | key = kmalloc(map->key_size, GFP_USER); | ||
| 304 | if (!key) | ||
| 305 | goto err_put; | ||
| 306 | |||
| 307 | err = -EFAULT; | ||
| 308 | if (copy_from_user(key, ukey, map->key_size) != 0) | ||
| 309 | goto free_key; | ||
| 310 | |||
| 311 | err = -ENOMEM; | ||
| 312 | next_key = kmalloc(map->key_size, GFP_USER); | ||
| 313 | if (!next_key) | ||
| 314 | goto free_key; | ||
| 315 | |||
| 316 | rcu_read_lock(); | ||
| 317 | err = map->ops->map_get_next_key(map, key, next_key); | ||
| 318 | rcu_read_unlock(); | ||
| 319 | if (err) | ||
| 320 | goto free_next_key; | ||
| 321 | |||
| 322 | err = -EFAULT; | ||
| 323 | if (copy_to_user(unext_key, next_key, map->key_size) != 0) | ||
| 324 | goto free_next_key; | ||
| 325 | |||
| 326 | err = 0; | ||
| 327 | |||
| 328 | free_next_key: | ||
| 329 | kfree(next_key); | ||
| 330 | free_key: | ||
| 331 | kfree(key); | ||
| 332 | err_put: | ||
| 333 | fdput(f); | ||
| 334 | return err; | ||
| 335 | } | ||
| 336 | |||
| 114 | SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) | 337 | SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) |
| 115 | { | 338 | { |
| 116 | union bpf_attr attr = {}; | 339 | union bpf_attr attr = {}; |
| @@ -160,6 +383,18 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz | |||
| 160 | case BPF_MAP_CREATE: | 383 | case BPF_MAP_CREATE: |
| 161 | err = map_create(&attr); | 384 | err = map_create(&attr); |
| 162 | break; | 385 | break; |
| 386 | case BPF_MAP_LOOKUP_ELEM: | ||
| 387 | err = map_lookup_elem(&attr); | ||
| 388 | break; | ||
| 389 | case BPF_MAP_UPDATE_ELEM: | ||
| 390 | err = map_update_elem(&attr); | ||
| 391 | break; | ||
| 392 | case BPF_MAP_DELETE_ELEM: | ||
| 393 | err = map_delete_elem(&attr); | ||
| 394 | break; | ||
| 395 | case BPF_MAP_GET_NEXT_KEY: | ||
| 396 | err = map_get_next_key(&attr); | ||
| 397 | break; | ||
| 163 | default: | 398 | default: |
| 164 | err = -EINVAL; | 399 | err = -EINVAL; |
| 165 | break; | 400 | break; |
