path: root/kernel/trace
author	Steven Rostedt (Red Hat) <rostedt@goodmis.org>	2015-02-10 22:14:53 -0500
committer	Steven Rostedt <rostedt@goodmis.org>	2015-02-11 07:41:42 -0500
commit	1e0d6714aceb770b04161fbedd7765d0e1fc27bd (patch)
tree	c48fa3807450ccb9790b71692eb5cce7c34f61d0 /kernel/trace
parent	7215853e985a4bef1a6c14e00e89dfec84f1e457 (diff)
ring-buffer: Do not wake up a splice waiter when page is not full
When an application connects to the ring buffer via splice, it can only read full pages. Splice does not work with partial pages. If there is not enough data to fill a page, the splice command will either block or return -EAGAIN (if set to nonblock).

Code was added so that if the page is not full, the waiter simply sleeps again. The problem is that it will get woken up again on the next event. That is, when something is written into the ring buffer and there is a waiter, the waiter is woken up. The waiter then checks the buffer, sees that it still does not have enough data to fill a page, and goes back to sleep. To make matters worse, when the waiter goes back to sleep, it can cause another event, which wakes it up again only to find it does not have enough data, and it sleeps again. This produces tremendous overhead and fills the ring buffer with noise. For example, recording sched_switch on an idle system for 10 seconds produces 25,350,475 events!!!

Create another wait queue for those waiters wanting full pages. When an event is written, it only wakes up waiters if there is a full page of data. It does not wake up the waiter if the page is not yet full.

After this change, recording sched_switch on an idle system for 10 seconds produces only 800 events. Getting rid of 25,349,675 useless events (99.9969% of events!!) is something to take seriously.

Cc: stable@vger.kernel.org # 3.16+
Cc: Rabin Vincent <rabin@rab.in>
Fixes: e30f53aad220 "tracing: Do not busy wait in buffer splice"
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
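For reference, the splice path described above is what a raw per-CPU reader (for example, trace-cmd) uses: splice can only move whole pages out of the ring buffer, so the reader blocks until a full page exists. Below is a minimal userspace sketch of such a reader; the tracefs path, output file name, and page-sized chunking are illustrative assumptions and are not part of this patch.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Illustrative paths: per-CPU raw buffer under tracefs, local output file */
	int trace_fd = open("/sys/kernel/debug/tracing/per_cpu/cpu0/trace_pipe_raw",
			    O_RDONLY);
	int out_fd = open("cpu0.raw", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	int pfd[2];
	long page = sysconf(_SC_PAGESIZE);

	if (trace_fd < 0 || out_fd < 0 || pipe(pfd) < 0) {
		perror("setup");
		return 1;
	}

	for (;;) {
		/* Blocks until a full page of events is available -- the wait this patch makes cheap */
		ssize_t n = splice(trace_fd, NULL, pfd[1], NULL, page, 0);
		if (n <= 0)
			break;
		/* Drain the page from the pipe into the output file */
		splice(pfd[0], NULL, out_fd, NULL, n, 0);
	}
	return 0;
}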
Diffstat (limited to 'kernel/trace')
-rw-r--r--	kernel/trace/ring_buffer.c	40
1 file changed, 35 insertions(+), 5 deletions(-)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 96079180de3d..5040d44fe5a3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -445,7 +445,10 @@ int ring_buffer_print_page_header(struct trace_seq *s)
 struct rb_irq_work {
 	struct irq_work			work;
 	wait_queue_head_t		waiters;
+	wait_queue_head_t		full_waiters;
 	bool				waiters_pending;
+	bool				full_waiters_pending;
+	bool				wakeup_full;
 };
 
 /*
@@ -527,6 +530,10 @@ static void rb_wake_up_waiters(struct irq_work *work)
 	struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
 
 	wake_up_all(&rbwork->waiters);
+	if (rbwork->wakeup_full) {
+		rbwork->wakeup_full = false;
+		wake_up_all(&rbwork->full_waiters);
+	}
 }
 
 /**
@@ -551,9 +558,11 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
 	 * data in any cpu buffer, or a specific buffer, put the
 	 * caller on the appropriate wait queue.
 	 */
-	if (cpu == RING_BUFFER_ALL_CPUS)
+	if (cpu == RING_BUFFER_ALL_CPUS) {
 		work = &buffer->irq_work;
-	else {
+		/* Full only makes sense on per cpu reads */
+		full = false;
+	} else {
 		if (!cpumask_test_cpu(cpu, buffer->cpumask))
 			return -ENODEV;
 		cpu_buffer = buffer->buffers[cpu];
@@ -562,7 +571,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
 
 
 	while (true) {
-		prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
+		if (full)
+			prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE);
+		else
+			prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
 
 		/*
 		 * The events can happen in critical sections where
@@ -584,7 +596,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
 		 * that is necessary is that the wake up happens after
 		 * a task has been queued. It's OK for spurious wake ups.
 		 */
-		work->waiters_pending = true;
+		if (full)
+			work->full_waiters_pending = true;
+		else
+			work->waiters_pending = true;
 
 		if (signal_pending(current)) {
 			ret = -EINTR;
@@ -613,7 +628,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
 		schedule();
 	}
 
-	finish_wait(&work->waiters, &wait);
+	if (full)
+		finish_wait(&work->full_waiters, &wait);
+	else
+		finish_wait(&work->waiters, &wait);
 
 	return ret;
 }
@@ -1228,6 +1246,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
 	init_completion(&cpu_buffer->update_done);
 	init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
 	init_waitqueue_head(&cpu_buffer->irq_work.waiters);
+	init_waitqueue_head(&cpu_buffer->irq_work.full_waiters);
 
 	bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
 			    GFP_KERNEL, cpu_to_node(cpu));
@@ -2799,6 +2818,8 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
 static __always_inline void
 rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
 {
+	bool pagebusy;
+
 	if (buffer->irq_work.waiters_pending) {
 		buffer->irq_work.waiters_pending = false;
 		/* irq_work_queue() supplies it's own memory barriers */
@@ -2810,6 +2831,15 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
 		/* irq_work_queue() supplies it's own memory barriers */
 		irq_work_queue(&cpu_buffer->irq_work.work);
 	}
+
+	pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
+
+	if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) {
+		cpu_buffer->irq_work.wakeup_full = true;
+		cpu_buffer->irq_work.full_waiters_pending = false;
+		/* irq_work_queue() supplies it's own memory barriers */
+		irq_work_queue(&cpu_buffer->irq_work.work);
+	}
 }
 
 /**
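For context (not part of this diff): a kernel-side reader that only wants full pages would pass full = true for a specific CPU, matching the ring_buffer_wait() signature shown in the hunks above. A minimal hedged sketch of such a call site, with error handling and surrounding locking omitted:

	/* Sketch of a caller: sleep until this CPU's buffer has a full page of data */
	ret = ring_buffer_wait(buffer, cpu, true);
	if (ret == -EINTR)
		return ret;	/* a signal interrupted the wait */
	/* Note: full is forced to false when cpu == RING_BUFFER_ALL_CPUS */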