aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/bpf
diff options
context:
space:
mode:
authorDaniel Borkmann <daniel@iogearbox.net>2016-06-15 16:47:14 -0400
committerDavid S. Miller <davem@davemloft.net>2016-06-16 02:42:57 -0400
commit3b1efb196eee45b2f0c4994e0c43edb5e367f620 (patch)
treeb4f7d122f21e841f0057c624e064f8ca30622e48 /kernel/bpf
parentd056a788765e67773124f520159185bc89f5d1ad (diff)
bpf, maps: flush own entries on perf map release
The behavior of perf event arrays are quite different from all others as they are tightly coupled to perf event fds, f.e. shown recently by commit e03e7ee34fdd ("perf/bpf: Convert perf_event_array to use struct file") to make refcounting on perf event more robust. A remaining issue that the current code still has is that since additions to the perf event array take a reference on the struct file via perf_event_get() and are only released via fput() (that cleans up the perf event eventually via perf_event_release_kernel()) when the element is either manually removed from the map from user space or automatically when the last reference on the perf event map is dropped. However, this leads us to dangling struct file's when the map gets pinned after the application owning the perf event descriptor exits, and since the struct file reference will in such case only be manually dropped or via pinned file removal, it leads to the perf event living longer than necessary, consuming needlessly resources for that time. Relations between perf event fds and bpf perf event map fds can be rather complex. F.e. maps can act as demuxers among different perf event fds that can possibly be owned by different threads and based on the index selection from the program, events get dispatched to one of the per-cpu fd endpoints. One perf event fd (or, rather a per-cpu set of them) can also live in multiple perf event maps at the same time, listening for events. Also, another requirement is that perf event fds can get closed from application side after they have been attached to the perf event map, so that on exit perf event map will take care of dropping their references eventually. Likewise, when such maps are pinned, the intended behavior is that a user application does bpf_obj_get(), puts its fds in there and on exit when fd is released, they are dropped from the map again, so the map acts rather as connector endpoint. 
This also makes perf event maps inherently different from program arrays as described in more detail in commit c9da161c6517 ("bpf: fix clearing on persistent program array maps"). To tackle this, map entries are marked by the map struct file that added the element to the map. And when the last reference to that map struct file is released from user space, then the tracked entries are purged from the map. This is okay, because new map struct files instances resp. frontends to the anon inode are provided via bpf_map_new_fd() that is called when we invoke bpf_obj_get_user() for retrieving a pinned map, but also when an initial instance is created via map_create(). The rest is resolved by the vfs layer automatically for us by keeping reference count on the map's struct file. Any concurrent updates on the map slot are fine as well, it just means that perf_event_fd_array_release() needs to delete less of its own entries. Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'kernel/bpf')
-rw-r--r--kernel/bpf/arraymap.c102
1 file changed, 74 insertions, 28 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index bfedcbdb4d84..5af30732697b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -427,59 +427,105 @@ static int __init register_prog_array_map(void)
427} 427}
428late_initcall(register_prog_array_map); 428late_initcall(register_prog_array_map);
429 429
430static void perf_event_array_map_free(struct bpf_map *map) 430static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
431 struct file *map_file)
431{ 432{
432 bpf_fd_array_map_clear(map); 433 struct bpf_event_entry *ee;
433 fd_array_map_free(map); 434
435 ee = kzalloc(sizeof(*ee), GFP_KERNEL);
436 if (ee) {
437 ee->event = perf_file->private_data;
438 ee->perf_file = perf_file;
439 ee->map_file = map_file;
440 }
441
442 return ee;
443}
444
445static void __bpf_event_entry_free(struct rcu_head *rcu)
446{
447 struct bpf_event_entry *ee;
448
449 ee = container_of(rcu, struct bpf_event_entry, rcu);
450 fput(ee->perf_file);
451 kfree(ee);
452}
453
454static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
455{
456 call_rcu(&ee->rcu, __bpf_event_entry_free);
434} 457}
435 458
436static void *perf_event_fd_array_get_ptr(struct bpf_map *map, 459static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
437 struct file *map_file, int fd) 460 struct file *map_file, int fd)
438{ 461{
439 struct perf_event *event;
440 const struct perf_event_attr *attr; 462 const struct perf_event_attr *attr;
441 struct file *file; 463 struct bpf_event_entry *ee;
464 struct perf_event *event;
465 struct file *perf_file;
442 466
443 file = perf_event_get(fd); 467 perf_file = perf_event_get(fd);
444 if (IS_ERR(file)) 468 if (IS_ERR(perf_file))
445 return file; 469 return perf_file;
446 470
447 event = file->private_data; 471 event = perf_file->private_data;
472 ee = ERR_PTR(-EINVAL);
448 473
449 attr = perf_event_attrs(event); 474 attr = perf_event_attrs(event);
450 if (IS_ERR(attr)) 475 if (IS_ERR(attr) || attr->inherit)
451 goto err; 476 goto err_out;
452 477
453 if (attr->inherit) 478 switch (attr->type) {
454 goto err; 479 case PERF_TYPE_SOFTWARE:
455 480 if (attr->config != PERF_COUNT_SW_BPF_OUTPUT)
456 if (attr->type == PERF_TYPE_RAW) 481 goto err_out;
457 return file; 482 /* fall-through */
458 483 case PERF_TYPE_RAW:
459 if (attr->type == PERF_TYPE_HARDWARE) 484 case PERF_TYPE_HARDWARE:
460 return file; 485 ee = bpf_event_entry_gen(perf_file, map_file);
486 if (ee)
487 return ee;
488 ee = ERR_PTR(-ENOMEM);
489 /* fall-through */
490 default:
491 break;
492 }
461 493
462 if (attr->type == PERF_TYPE_SOFTWARE && 494err_out:
463 attr->config == PERF_COUNT_SW_BPF_OUTPUT) 495 fput(perf_file);
464 return file; 496 return ee;
465err:
466 fput(file);
467 return ERR_PTR(-EINVAL);
468} 497}
469 498
470static void perf_event_fd_array_put_ptr(void *ptr) 499static void perf_event_fd_array_put_ptr(void *ptr)
471{ 500{
472 fput((struct file *)ptr); 501 bpf_event_entry_free_rcu(ptr);
502}
503
504static void perf_event_fd_array_release(struct bpf_map *map,
505 struct file *map_file)
506{
507 struct bpf_array *array = container_of(map, struct bpf_array, map);
508 struct bpf_event_entry *ee;
509 int i;
510
511 rcu_read_lock();
512 for (i = 0; i < array->map.max_entries; i++) {
513 ee = READ_ONCE(array->ptrs[i]);
514 if (ee && ee->map_file == map_file)
515 fd_array_map_delete_elem(map, &i);
516 }
517 rcu_read_unlock();
473} 518}
474 519
475static const struct bpf_map_ops perf_event_array_ops = { 520static const struct bpf_map_ops perf_event_array_ops = {
476 .map_alloc = fd_array_map_alloc, 521 .map_alloc = fd_array_map_alloc,
477 .map_free = perf_event_array_map_free, 522 .map_free = fd_array_map_free,
478 .map_get_next_key = array_map_get_next_key, 523 .map_get_next_key = array_map_get_next_key,
479 .map_lookup_elem = fd_array_map_lookup_elem, 524 .map_lookup_elem = fd_array_map_lookup_elem,
480 .map_delete_elem = fd_array_map_delete_elem, 525 .map_delete_elem = fd_array_map_delete_elem,
481 .map_fd_get_ptr = perf_event_fd_array_get_ptr, 526 .map_fd_get_ptr = perf_event_fd_array_get_ptr,
482 .map_fd_put_ptr = perf_event_fd_array_put_ptr, 527 .map_fd_put_ptr = perf_event_fd_array_put_ptr,
528 .map_release = perf_event_fd_array_release,
483}; 529};
484 530
485static struct bpf_map_type_list perf_event_array_type __read_mostly = { 531static struct bpf_map_type_list perf_event_array_type __read_mostly = {