diff options
| author | Martin KaFai Lau <kafai@fb.com> | 2017-03-22 13:00:34 -0400 |
|---|---|---|
| committer | David S. Miller <davem@davemloft.net> | 2017-03-22 18:45:45 -0400 |
| commit | bcc6b1b7ebf857a9fe56202e2be3361131588c15 (patch) | |
| tree | bc23c28ffe4866226ca8d768f67e74ddd9721076 | |
| parent | 56f668dfe00dcf086734f1c42ea999398fad6572 (diff) | |
bpf: Add hash of maps support
This patch adds hash of maps support (hashmap->bpf_map).
BPF_MAP_TYPE_HASH_OF_MAPS is added.
A map-in-map contains a pointer to another map; let's call
this pointer 'inner_map_ptr'.
Notes on deleting inner_map_ptr from a hash map:
1. For BPF_F_NO_PREALLOC map-in-map, when deleting
an inner_map_ptr, the htab_elem itself will go through
a rcu grace period and the inner_map_ptr resides
in the htab_elem.
2. For pre-allocated htab_elem (!BPF_F_NO_PREALLOC),
when deleting an inner_map_ptr, the htab_elem may
get reused immediately. This situation is similar
to the existing preallocated use cases.
However, the bpf_map_fd_put_ptr() calls bpf_map_put() which calls
inner_map->ops->map_free(inner_map) which will go
through a rcu grace period (i.e. all bpf_map's map_free
currently goes through a rcu grace period). Hence,
the inner_map_ptr is still safe for the rcu reader side.
This patch also adds BPF_MAP_TYPE_HASH_OF_MAPS to the
check_map_prealloc() in the verifier. Preallocation is a
must for BPF_PROG_TYPE_PERF_EVENT. Hence, even though we don't
expect heavy updates to map-in-map, enforcing BPF_F_NO_PREALLOC
for map-in-map is impossible without disallowing
BPF_PROG_TYPE_PERF_EVENT from using map-in-map first.
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
| -rw-r--r-- | include/linux/bpf.h | 2 | ||||
| -rw-r--r-- | include/uapi/linux/bpf.h | 1 | ||||
| -rw-r--r-- | kernel/bpf/hashtab.c | 121 | ||||
| -rw-r--r-- | kernel/bpf/syscall.c | 8 | ||||
| -rw-r--r-- | kernel/bpf/verifier.c | 4 |
5 files changed, 134 insertions, 2 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3f3cdf9b15e8..2ae39a3e9ead 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h | |||
| @@ -277,6 +277,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value); | |||
| 277 | int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, | 277 | int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, |
| 278 | void *key, void *value, u64 map_flags); | 278 | void *key, void *value, u64 map_flags); |
| 279 | void bpf_fd_array_map_clear(struct bpf_map *map); | 279 | void bpf_fd_array_map_clear(struct bpf_map *map); |
| 280 | int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, | ||
| 281 | void *key, void *value, u64 map_flags); | ||
| 280 | 282 | ||
| 281 | /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and | 283 | /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and |
| 282 | * forced to use 'long' read/writes to try to atomically copy long counters. | 284 | * forced to use 'long' read/writes to try to atomically copy long counters. |
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1701ec1e7de3..ce6f029ac368 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h | |||
| @@ -97,6 +97,7 @@ enum bpf_map_type { | |||
| 97 | BPF_MAP_TYPE_LRU_PERCPU_HASH, | 97 | BPF_MAP_TYPE_LRU_PERCPU_HASH, |
| 98 | BPF_MAP_TYPE_LPM_TRIE, | 98 | BPF_MAP_TYPE_LPM_TRIE, |
| 99 | BPF_MAP_TYPE_ARRAY_OF_MAPS, | 99 | BPF_MAP_TYPE_ARRAY_OF_MAPS, |
| 100 | BPF_MAP_TYPE_HASH_OF_MAPS, | ||
| 100 | }; | 101 | }; |
| 101 | 102 | ||
| 102 | enum bpf_prog_type { | 103 | enum bpf_prog_type { |
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 000153acb6d5..343fb5394c95 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c | |||
| @@ -16,6 +16,7 @@ | |||
| 16 | #include <linux/rculist_nulls.h> | 16 | #include <linux/rculist_nulls.h> |
| 17 | #include "percpu_freelist.h" | 17 | #include "percpu_freelist.h" |
| 18 | #include "bpf_lru_list.h" | 18 | #include "bpf_lru_list.h" |
| 19 | #include "map_in_map.h" | ||
| 19 | 20 | ||
| 20 | struct bucket { | 21 | struct bucket { |
| 21 | struct hlist_nulls_head head; | 22 | struct hlist_nulls_head head; |
| @@ -88,6 +89,11 @@ static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size | |||
| 88 | return *(void __percpu **)(l->key + key_size); | 89 | return *(void __percpu **)(l->key + key_size); |
| 89 | } | 90 | } |
| 90 | 91 | ||
| 92 | static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) | ||
| 93 | { | ||
| 94 | return *(void **)(l->key + roundup(map->key_size, 8)); | ||
| 95 | } | ||
| 96 | |||
| 91 | static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) | 97 | static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) |
| 92 | { | 98 | { |
| 93 | return (struct htab_elem *) (htab->elems + i * htab->elem_size); | 99 | return (struct htab_elem *) (htab->elems + i * htab->elem_size); |
| @@ -603,6 +609,14 @@ static void htab_elem_free_rcu(struct rcu_head *head) | |||
| 603 | 609 | ||
| 604 | static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) | 610 | static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) |
| 605 | { | 611 | { |
| 612 | struct bpf_map *map = &htab->map; | ||
| 613 | |||
| 614 | if (map->ops->map_fd_put_ptr) { | ||
| 615 | void *ptr = fd_htab_map_get_ptr(map, l); | ||
| 616 | |||
| 617 | map->ops->map_fd_put_ptr(ptr); | ||
| 618 | } | ||
| 619 | |||
| 606 | if (l->state == HTAB_EXTRA_ELEM_USED) { | 620 | if (l->state == HTAB_EXTRA_ELEM_USED) { |
| 607 | l->state = HTAB_EXTRA_ELEM_FREE; | 621 | l->state = HTAB_EXTRA_ELEM_FREE; |
| 608 | return; | 622 | return; |
| @@ -1057,6 +1071,7 @@ static void delete_all_elements(struct bpf_htab *htab) | |||
| 1057 | } | 1071 | } |
| 1058 | } | 1072 | } |
| 1059 | } | 1073 | } |
| 1074 | |||
| 1060 | /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ | 1075 | /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ |
| 1061 | static void htab_map_free(struct bpf_map *map) | 1076 | static void htab_map_free(struct bpf_map *map) |
| 1062 | { | 1077 | { |
| @@ -1213,12 +1228,118 @@ static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = { | |||
| 1213 | .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, | 1228 | .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, |
| 1214 | }; | 1229 | }; |
| 1215 | 1230 | ||
| 1231 | static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr) | ||
| 1232 | { | ||
| 1233 | struct bpf_map *map; | ||
| 1234 | |||
| 1235 | if (attr->value_size != sizeof(u32)) | ||
| 1236 | return ERR_PTR(-EINVAL); | ||
| 1237 | |||
| 1238 | /* pointer is stored internally */ | ||
| 1239 | attr->value_size = sizeof(void *); | ||
| 1240 | map = htab_map_alloc(attr); | ||
| 1241 | attr->value_size = sizeof(u32); | ||
| 1242 | |||
| 1243 | return map; | ||
| 1244 | } | ||
| 1245 | |||
| 1246 | static void fd_htab_map_free(struct bpf_map *map) | ||
| 1247 | { | ||
| 1248 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); | ||
| 1249 | struct hlist_nulls_node *n; | ||
| 1250 | struct hlist_nulls_head *head; | ||
| 1251 | struct htab_elem *l; | ||
| 1252 | int i; | ||
| 1253 | |||
| 1254 | for (i = 0; i < htab->n_buckets; i++) { | ||
| 1255 | head = select_bucket(htab, i); | ||
| 1256 | |||
| 1257 | hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { | ||
| 1258 | void *ptr = fd_htab_map_get_ptr(map, l); | ||
| 1259 | |||
| 1260 | map->ops->map_fd_put_ptr(ptr); | ||
| 1261 | } | ||
| 1262 | } | ||
| 1263 | |||
| 1264 | htab_map_free(map); | ||
| 1265 | } | ||
| 1266 | |||
| 1267 | /* only called from syscall */ | ||
| 1268 | int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, | ||
| 1269 | void *key, void *value, u64 map_flags) | ||
| 1270 | { | ||
| 1271 | void *ptr; | ||
| 1272 | int ret; | ||
| 1273 | u32 ufd = *(u32 *)value; | ||
| 1274 | |||
| 1275 | ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); | ||
| 1276 | if (IS_ERR(ptr)) | ||
| 1277 | return PTR_ERR(ptr); | ||
| 1278 | |||
| 1279 | ret = htab_map_update_elem(map, key, &ptr, map_flags); | ||
| 1280 | if (ret) | ||
| 1281 | map->ops->map_fd_put_ptr(ptr); | ||
| 1282 | |||
| 1283 | return ret; | ||
| 1284 | } | ||
| 1285 | |||
| 1286 | static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) | ||
| 1287 | { | ||
| 1288 | struct bpf_map *map, *inner_map_meta; | ||
| 1289 | |||
| 1290 | inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); | ||
| 1291 | if (IS_ERR(inner_map_meta)) | ||
| 1292 | return inner_map_meta; | ||
| 1293 | |||
| 1294 | map = fd_htab_map_alloc(attr); | ||
| 1295 | if (IS_ERR(map)) { | ||
| 1296 | bpf_map_meta_free(inner_map_meta); | ||
| 1297 | return map; | ||
| 1298 | } | ||
| 1299 | |||
| 1300 | map->inner_map_meta = inner_map_meta; | ||
| 1301 | |||
| 1302 | return map; | ||
| 1303 | } | ||
| 1304 | |||
| 1305 | static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) | ||
| 1306 | { | ||
| 1307 | struct bpf_map **inner_map = htab_map_lookup_elem(map, key); | ||
| 1308 | |||
| 1309 | if (!inner_map) | ||
| 1310 | return NULL; | ||
| 1311 | |||
| 1312 | return READ_ONCE(*inner_map); | ||
| 1313 | } | ||
| 1314 | |||
| 1315 | static void htab_of_map_free(struct bpf_map *map) | ||
| 1316 | { | ||
| 1317 | bpf_map_meta_free(map->inner_map_meta); | ||
| 1318 | fd_htab_map_free(map); | ||
| 1319 | } | ||
| 1320 | |||
| 1321 | static const struct bpf_map_ops htab_of_map_ops = { | ||
| 1322 | .map_alloc = htab_of_map_alloc, | ||
| 1323 | .map_free = htab_of_map_free, | ||
| 1324 | .map_get_next_key = htab_map_get_next_key, | ||
| 1325 | .map_lookup_elem = htab_of_map_lookup_elem, | ||
| 1326 | .map_delete_elem = htab_map_delete_elem, | ||
| 1327 | .map_fd_get_ptr = bpf_map_fd_get_ptr, | ||
| 1328 | .map_fd_put_ptr = bpf_map_fd_put_ptr, | ||
| 1329 | }; | ||
| 1330 | |||
| 1331 | static struct bpf_map_type_list htab_of_map_type __ro_after_init = { | ||
| 1332 | .ops = &htab_of_map_ops, | ||
| 1333 | .type = BPF_MAP_TYPE_HASH_OF_MAPS, | ||
| 1334 | }; | ||
| 1335 | |||
| 1216 | static int __init register_htab_map(void) | 1336 | static int __init register_htab_map(void) |
| 1217 | { | 1337 | { |
| 1218 | bpf_register_map_type(&htab_type); | 1338 | bpf_register_map_type(&htab_type); |
| 1219 | bpf_register_map_type(&htab_percpu_type); | 1339 | bpf_register_map_type(&htab_percpu_type); |
| 1220 | bpf_register_map_type(&htab_lru_type); | 1340 | bpf_register_map_type(&htab_lru_type); |
| 1221 | bpf_register_map_type(&htab_lru_percpu_type); | 1341 | bpf_register_map_type(&htab_lru_percpu_type); |
| 1342 | bpf_register_map_type(&htab_of_map_type); | ||
| 1222 | return 0; | 1343 | return 0; |
| 1223 | } | 1344 | } |
| 1224 | late_initcall(register_htab_map); | 1345 | late_initcall(register_htab_map); |
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6e24fdf1f373..c35ebfe6d84d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c | |||
| @@ -352,7 +352,8 @@ static int map_lookup_elem(union bpf_attr *attr) | |||
| 352 | err = bpf_percpu_array_copy(map, key, value); | 352 | err = bpf_percpu_array_copy(map, key, value); |
| 353 | } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { | 353 | } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { |
| 354 | err = bpf_stackmap_copy(map, key, value); | 354 | err = bpf_stackmap_copy(map, key, value); |
| 355 | } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) { | 355 | } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS || |
| 356 | map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { | ||
| 356 | err = -ENOTSUPP; | 357 | err = -ENOTSUPP; |
| 357 | } else { | 358 | } else { |
| 358 | rcu_read_lock(); | 359 | rcu_read_lock(); |
| @@ -446,6 +447,11 @@ static int map_update_elem(union bpf_attr *attr) | |||
| 446 | err = bpf_fd_array_map_update_elem(map, f.file, key, value, | 447 | err = bpf_fd_array_map_update_elem(map, f.file, key, value, |
| 447 | attr->flags); | 448 | attr->flags); |
| 448 | rcu_read_unlock(); | 449 | rcu_read_unlock(); |
| 450 | } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { | ||
| 451 | rcu_read_lock(); | ||
| 452 | err = bpf_fd_htab_map_update_elem(map, f.file, key, value, | ||
| 453 | attr->flags); | ||
| 454 | rcu_read_unlock(); | ||
| 449 | } else { | 455 | } else { |
| 450 | rcu_read_lock(); | 456 | rcu_read_lock(); |
| 451 | err = map->ops->map_update_elem(map, key, value, attr->flags); | 457 | err = map->ops->map_update_elem(map, key, value, attr->flags); |
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3b8f528c5473..09923cc5c7c7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c | |||
| @@ -1200,6 +1200,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) | |||
| 1200 | goto error; | 1200 | goto error; |
| 1201 | break; | 1201 | break; |
| 1202 | case BPF_MAP_TYPE_ARRAY_OF_MAPS: | 1202 | case BPF_MAP_TYPE_ARRAY_OF_MAPS: |
| 1203 | case BPF_MAP_TYPE_HASH_OF_MAPS: | ||
| 1203 | if (func_id != BPF_FUNC_map_lookup_elem) | 1204 | if (func_id != BPF_FUNC_map_lookup_elem) |
| 1204 | goto error; | 1205 | goto error; |
| 1205 | default: | 1206 | default: |
| @@ -3044,7 +3045,8 @@ process_bpf_exit: | |||
| 3044 | static int check_map_prealloc(struct bpf_map *map) | 3045 | static int check_map_prealloc(struct bpf_map *map) |
| 3045 | { | 3046 | { |
| 3046 | return (map->map_type != BPF_MAP_TYPE_HASH && | 3047 | return (map->map_type != BPF_MAP_TYPE_HASH && |
| 3047 | map->map_type != BPF_MAP_TYPE_PERCPU_HASH) || | 3048 | map->map_type != BPF_MAP_TYPE_PERCPU_HASH && |
| 3049 | map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) || | ||
| 3048 | !(map->map_flags & BPF_F_NO_PREALLOC); | 3050 | !(map->map_flags & BPF_F_NO_PREALLOC); |
| 3049 | } | 3051 | } |
| 3050 | 3052 | ||
