author    Peter Zijlstra <peterz@infradead.org>    2015-01-14 07:18:11 -0500
committer Ingo Molnar <mingo@kernel.org>           2015-04-02 11:13:46 -0400
commit    45bfb2e50471abbbfd83d40d28c986078b0d24ff (patch)
tree      f06f2176a2ef51315387f492a3f1b85efe91f2bb /kernel/events
parent    e8c6deac69629c0cb97c3d3272f8631ef17f8f0f (diff)
perf: Add AUX area to ring buffer for raw data streams
This patch introduces "AUX space" in the perf mmap buffer, intended for exporting high-bandwidth data streams to userspace, such as instruction flow traces.

AUX space is a ring buffer, defined by the aux_{offset,size} fields in the user_page structure, with read/write pointers aux_{head,tail} that abide by the same rules as the data_* counterparts of the main perf buffer.

In order to allocate/mmap AUX, userspace needs to set aux_offset to an offset greater than data_offset + data_size, and aux_size to the desired buffer size. Both need to be page aligned. The same aux_offset and aux_size should then be passed to the mmap() call, and if everything adds up, you should have an AUX buffer as a result.

Pages that are mapped into this buffer also come out of the user's mlock rlimit plus the perf_event_mlock_kb allowance.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-3-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
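Usage sketch (illustrative only, not part of this patch): a minimal userspace sequence for the mmap setup described above. The PMU type and buffer sizes here are placeholder assumptions; a real consumer must use an AUX-capable PMU (e.g. a dynamic type read from sysfs), and error handling is elided.

    #include <linux/perf_event.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            struct perf_event_attr attr = {
                    .size = sizeof(attr),
                    .type = PERF_TYPE_RAW, /* assumption: needs an AUX-capable PMU */
            };
            size_t page = sysconf(_SC_PAGESIZE);
            size_t data_pages = 4, aux_pages = 16; /* both powers of two */
            struct perf_event_mmap_page *up;
            void *base, *aux;
            int fd;

            fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);

            /* Map the user page plus the regular data buffer first. */
            base = mmap(NULL, (1 + data_pages) * page, PROT_READ | PROT_WRITE,
                        MAP_SHARED, fd, 0);
            up = base;

            /* AUX must start above data_offset + data_size, page aligned. */
            up->aux_offset = (1 + data_pages) * page;
            up->aux_size   = aux_pages * page;

            /* The same offset and size must then be passed to mmap(). */
            aux = mmap(NULL, up->aux_size, PROT_READ | PROT_WRITE,
                       MAP_SHARED, fd, (off_t)up->aux_offset);

            /* ... consume records via up->aux_head / up->aux_tail ... */

            munmap(aux, aux_pages * page);
            munmap(base, (1 + data_pages) * page);
            close(fd);
            return 0;
    }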
Diffstat (limited to 'kernel/events')
-rw-r--r--   kernel/events/core.c          141
-rw-r--r--   kernel/events/internal.h       23
-rw-r--r--   kernel/events/ring_buffer.c    97
3 files changed, 232 insertions(+), 29 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6efa516f1ab8..da51128c337a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4306,6 +4306,9 @@ static void perf_mmap_open(struct vm_area_struct *vma)
 	atomic_inc(&event->mmap_count);
 	atomic_inc(&event->rb->mmap_count);
 
+	if (vma->vm_pgoff)
+		atomic_inc(&event->rb->aux_mmap_count);
+
 	if (event->pmu->event_mapped)
 		event->pmu->event_mapped(event);
 }
@@ -4330,6 +4333,20 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 	if (event->pmu->event_unmapped)
 		event->pmu->event_unmapped(event);
 
+	/*
+	 * rb->aux_mmap_count will always drop before rb->mmap_count and
+	 * event->mmap_count, so it is ok to use event->mmap_mutex to
+	 * serialize with perf_mmap here.
+	 */
+	if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
+	    atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
+		atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
+		vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
+
+		rb_free_aux(rb);
+		mutex_unlock(&event->mmap_mutex);
+	}
+
 	atomic_dec(&rb->mmap_count);
 
 	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
@@ -4403,7 +4420,7 @@ out_put:
 
 static const struct vm_operations_struct perf_mmap_vmops = {
 	.open		= perf_mmap_open,
-	.close		= perf_mmap_close,
+	.close		= perf_mmap_close, /* non mergable */
 	.fault		= perf_mmap_fault,
 	.page_mkwrite	= perf_mmap_fault,
 };
@@ -4414,10 +4431,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	unsigned long user_locked, user_lock_limit;
 	struct user_struct *user = current_user();
 	unsigned long locked, lock_limit;
-	struct ring_buffer *rb;
+	struct ring_buffer *rb = NULL;
 	unsigned long vma_size;
 	unsigned long nr_pages;
-	long user_extra, extra;
+	long user_extra = 0, extra = 0;
 	int ret = 0, flags = 0;
 
 	/*
@@ -4432,7 +4449,66 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		return -EINVAL;
 
 	vma_size = vma->vm_end - vma->vm_start;
-	nr_pages = (vma_size / PAGE_SIZE) - 1;
+
+	if (vma->vm_pgoff == 0) {
+		nr_pages = (vma_size / PAGE_SIZE) - 1;
+	} else {
+		/*
+		 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
+		 * mapped, all subsequent mappings should have the same size
+		 * and offset. Must be above the normal perf buffer.
+		 */
+		u64 aux_offset, aux_size;
+
+		if (!event->rb)
+			return -EINVAL;
+
+		nr_pages = vma_size / PAGE_SIZE;
+
+		mutex_lock(&event->mmap_mutex);
+		ret = -EINVAL;
+
+		rb = event->rb;
+		if (!rb)
+			goto aux_unlock;
+
+		aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
+		aux_size = ACCESS_ONCE(rb->user_page->aux_size);
+
+		if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
+			goto aux_unlock;
+
+		if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
+			goto aux_unlock;
+
+		/* already mapped with a different offset */
+		if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
+			goto aux_unlock;
+
+		if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
+			goto aux_unlock;
+
+		/* already mapped with a different size */
+		if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
+			goto aux_unlock;
+
+		if (!is_power_of_2(nr_pages))
+			goto aux_unlock;
+
+		if (!atomic_inc_not_zero(&rb->mmap_count))
+			goto aux_unlock;
+
+		if (rb_has_aux(rb)) {
+			atomic_inc(&rb->aux_mmap_count);
+			ret = 0;
+			goto unlock;
+		}
+
+		atomic_set(&rb->aux_mmap_count, 1);
+		user_extra = nr_pages;
+
+		goto accounting;
+	}
 
 	/*
 	 * If we have rb pages ensure they're a power-of-two number, so we
@@ -4444,9 +4520,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	if (vma_size != PAGE_SIZE * (1 + nr_pages))
 		return -EINVAL;
 
-	if (vma->vm_pgoff != 0)
-		return -EINVAL;
-
 	WARN_ON_ONCE(event->ctx->parent_ctx);
 again:
 	mutex_lock(&event->mmap_mutex);
@@ -4470,6 +4543,8 @@ again:
 	}
 
 	user_extra = nr_pages + 1;
+
+accounting:
 	user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
 
 	/*
@@ -4479,7 +4554,6 @@ again:
 
 	user_locked = atomic_long_read(&user->locked_vm) + user_extra;
 
-	extra = 0;
 	if (user_locked > user_lock_limit)
 		extra = user_locked - user_lock_limit;
 
@@ -4493,35 +4567,45 @@ again:
 		goto unlock;
 	}
 
-	WARN_ON(event->rb);
+	WARN_ON(!rb && event->rb);
 
 	if (vma->vm_flags & VM_WRITE)
 		flags |= RING_BUFFER_WRITABLE;
 
-	rb = rb_alloc(nr_pages,
-		event->attr.watermark ? event->attr.wakeup_watermark : 0,
-		event->cpu, flags);
-
 	if (!rb) {
-		ret = -ENOMEM;
-		goto unlock;
-	}
+		rb = rb_alloc(nr_pages,
+			      event->attr.watermark ? event->attr.wakeup_watermark : 0,
+			      event->cpu, flags);
 
-	atomic_set(&rb->mmap_count, 1);
-	rb->mmap_locked = extra;
-	rb->mmap_user = get_current_user();
+		if (!rb) {
+			ret = -ENOMEM;
+			goto unlock;
+		}
 
-	atomic_long_add(user_extra, &user->locked_vm);
-	vma->vm_mm->pinned_vm += extra;
+		atomic_set(&rb->mmap_count, 1);
+		rb->mmap_user = get_current_user();
+		rb->mmap_locked = extra;
 
-	ring_buffer_attach(event, rb);
+		ring_buffer_attach(event, rb);
 
-	perf_event_init_userpage(event);
-	perf_event_update_userpage(event);
+		perf_event_init_userpage(event);
+		perf_event_update_userpage(event);
+	} else {
+		ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, flags);
+		if (!ret)
+			rb->aux_mmap_locked = extra;
+	}
 
 unlock:
-	if (!ret)
+	if (!ret) {
+		atomic_long_add(user_extra, &user->locked_vm);
+		vma->vm_mm->pinned_vm += extra;
+
 		atomic_inc(&event->mmap_count);
+	} else if (rb) {
+		atomic_dec(&rb->mmap_count);
+	}
+aux_unlock:
 	mutex_unlock(&event->mmap_mutex);
 
 	/*
@@ -7506,6 +7590,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 	if (output_event->clock != event->clock)
 		goto out;
 
+	/*
+	 * If both events generate aux data, they must be on the same PMU
+	 */
+	if (has_aux(event) && has_aux(output_event) &&
+	    event->pmu != output_event->pmu)
+		goto out;
+
 set:
 	mutex_lock(&event->mmap_mutex);
 	/* Can't redirect output if we've got an active mmap() */
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 569b218782ad..0f6d08015927 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -35,6 +35,16 @@ struct ring_buffer {
 	unsigned long			mmap_locked;
 	struct user_struct		*mmap_user;
 
+	/* AUX area */
+	unsigned long			aux_pgoff;
+	int				aux_nr_pages;
+	atomic_t			aux_mmap_count;
+	unsigned long			aux_mmap_locked;
+	void				(*free_aux)(void *);
+	atomic_t			aux_refcount;
+	void				**aux_pages;
+	void				*aux_priv;
+
 	struct perf_event_mmap_page	*user_page;
 	void				*data_pages[0];
 };
@@ -43,6 +53,14 @@ extern void rb_free(struct ring_buffer *rb);
 extern struct ring_buffer *
 rb_alloc(int nr_pages, long watermark, int cpu, int flags);
 extern void perf_event_wakeup(struct perf_event *event);
+extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
+			pgoff_t pgoff, int nr_pages, int flags);
+extern void rb_free_aux(struct ring_buffer *rb);
+
+static inline bool rb_has_aux(struct ring_buffer *rb)
+{
+	return !!rb->aux_nr_pages;
+}
 
 extern void
 perf_event_header__init_id(struct perf_event_header *header,
@@ -81,6 +99,11 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
 	return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
 }
 
+static inline unsigned long perf_aux_size(struct ring_buffer *rb)
+{
+	return rb->aux_nr_pages << PAGE_SHIFT;
+}
+
 #define DEFINE_OUTPUT_COPY(func_name, memcpy_func)			\
 static inline unsigned long						\
 func_name(struct perf_output_handle *handle,				\
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index eadb95ce7aac..3de9c4e9ea9f 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -243,14 +243,87 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
 	spin_lock_init(&rb->event_lock);
 }
 
+int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
+		 pgoff_t pgoff, int nr_pages, int flags)
+{
+	bool overwrite = !(flags & RING_BUFFER_WRITABLE);
+	int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
+	int ret = -ENOMEM;
+
+	if (!has_aux(event))
+		return -ENOTSUPP;
+
+	rb->aux_pages = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node);
+	if (!rb->aux_pages)
+		return -ENOMEM;
+
+	rb->free_aux = event->pmu->free_aux;
+	for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;
+	     rb->aux_nr_pages++) {
+		struct page *page;
+
+		page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+		if (!page)
+			goto out;
+
+		rb->aux_pages[rb->aux_nr_pages] = page_address(page);
+	}
+
+	rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages,
+					     overwrite);
+	if (!rb->aux_priv)
+		goto out;
+
+	ret = 0;
+
+	/*
+	 * aux_pages (and pmu driver's private data, aux_priv) will be
+	 * referenced in both producer's and consumer's contexts, thus
+	 * we keep a refcount here to make sure either of the two can
+	 * reference them safely.
+	 */
+	atomic_set(&rb->aux_refcount, 1);
+
+out:
+	if (!ret)
+		rb->aux_pgoff = pgoff;
+	else
+		rb_free_aux(rb);
+
+	return ret;
+}
+
+static void __rb_free_aux(struct ring_buffer *rb)
+{
+	int pg;
+
+	if (rb->aux_priv) {
+		rb->free_aux(rb->aux_priv);
+		rb->free_aux = NULL;
+		rb->aux_priv = NULL;
+	}
+
+	for (pg = 0; pg < rb->aux_nr_pages; pg++)
+		free_page((unsigned long)rb->aux_pages[pg]);
+
+	kfree(rb->aux_pages);
+	rb->aux_nr_pages = 0;
+}
+
+void rb_free_aux(struct ring_buffer *rb)
+{
+	if (atomic_dec_and_test(&rb->aux_refcount))
+		__rb_free_aux(rb);
+}
+
 #ifndef CONFIG_PERF_USE_VMALLOC
 
 /*
  * Back perf_mmap() with regular GFP_KERNEL-0 pages.
  */
 
-struct page *
-perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+static struct page *
+__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
 {
 	if (pgoff > rb->nr_pages)
 		return NULL;
@@ -340,8 +413,8 @@ static int data_page_nr(struct ring_buffer *rb)
 	return rb->nr_pages << page_order(rb);
 }
 
-struct page *
-perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+static struct page *
+__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
 {
 	/* The '>' counts in the user page. */
 	if (pgoff > data_page_nr(rb))
@@ -416,3 +489,19 @@ fail:
 }
 
 #endif
+
+struct page *
+perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+{
+	if (rb->aux_nr_pages) {
+		/* above AUX space */
+		if (pgoff > rb->aux_pgoff + rb->aux_nr_pages)
+			return NULL;
+
+		/* AUX space */
+		if (pgoff >= rb->aux_pgoff)
+			return virt_to_page(rb->aux_pages[pgoff - rb->aux_pgoff]);
+	}
+
+	return __perf_mmap_to_page(rb, pgoff);
+}