author     Peter Zijlstra <a.p.zijlstra@chello.nl>    2011-11-25 20:47:31 -0500
committer  Ingo Molnar <mingo@elte.hu>                2011-12-05 03:33:03 -0500
commit     10c6db110d0eb4466b59812c49088ab56218fc2e (patch)
tree       d1d4e8debcf7415df49ce691b4c3da7443919f11 /kernel
parent     16e5294e5f8303756a179cf218e37dfb9ed34417 (diff)
perf: Fix loss of notification with multi-event
When you do:

        $ perf record -e cycles,cycles,cycles noploop 10

you expect about 10,000 samples for each event, i.e., 10s at
1000 samples/sec. However, this is not what's happening. You get
far fewer samples, maybe 3700 samples/event:

$ perf report -D | tail -15
Aggregated stats:
           TOTAL events:      10998
            MMAP events:         66
            COMM events:          2
          SAMPLE events:      10930
cycles stats:
           TOTAL events:       3644
          SAMPLE events:       3644
cycles stats:
           TOTAL events:       3642
          SAMPLE events:       3642
cycles stats:
           TOTAL events:       3644
          SAMPLE events:       3644

On an Intel Nehalem or even AMD64, there are 4 counters capable
of measuring cycles, so there is plenty of space to measure those
events without multiplexing (even with the NMI watchdog active).
And even with multiplexing, we'd expect roughly the same number
of samples per event.

The root of the problem was that when the event that caused the
buffer to become full was not the first event passed on the cmdline,
the user notification would get lost. The notification was sent to
the file descriptor of the overflowed event but the perf tool was
not polling on it. The perf tool aggregates all samples into a
single buffer, i.e., the buffer of the first event. Consequently,
it assumes notifications for any event will come via that descriptor.

The seemingly straightforward solution of moving the waitq into the
ringbuffer object doesn't work because of lifetime issues. One could
perf_event_set_output() on a fd that you're also blocking on and
cause the old rb object to be freed while its waitq would still be
referenced by the blocked thread -> FAIL.

Therefore link all events to the ringbuffer and broadcast the wakeup
from the ringbuffer object to all possible events that could be
waited upon. This is rather ugly, and we're open to better solutions,
but it works for now.

Reported-by: Stephane Eranian <eranian@google.com>
Finished-by: Stephane Eranian <eranian@google.com>
Reviewed-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20111126014731.GA7030@quad
Signed-off-by: Ingo Molnar <mingo@elte.hu>
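[Editor's illustration] To make the failure mode concrete, here is a minimal
user-space sketch of the polling pattern the commit message describes. It is
a hypothetical reproduction, not code from tools/perf, and the event setup
(sample period, buffer size, absence of error handling) is assumed for
brevity. Two cycles counters are opened, the second is redirected into the
first one's mmap buffer with PERF_EVENT_IOC_SET_OUTPUT, and the process then
blocks on the first fd only. Before this patch, an overflow wakeup raised by
the second event reached only that event's own waitqueue, so the poll()
below could sleep forever even though the shared buffer had filled up.

/*
 * Illustration only: hypothetical reproduction of the scenario, not
 * taken from tools/perf.  Error handling is omitted.
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <poll.h>
#include <string.h>
#include <unistd.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	struct pollfd pfd;
	int fd1, fd2;

	memset(&attr, 0, sizeof(attr));
	attr.size          = sizeof(attr);
	attr.type          = PERF_TYPE_HARDWARE;
	attr.config        = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;		/* assumed value */
	attr.sample_type   = PERF_SAMPLE_IP;
	attr.wakeup_events = 1;

	fd1 = perf_event_open(&attr, 0, -1, -1, 0);	/* 1st "cycles" event */
	fd2 = perf_event_open(&attr, 0, -1, -1, 0);	/* 2nd "cycles" event */

	/* mmap a sample buffer for the first event only (1 + 2^n pages) ... */
	mmap(NULL, 9 * 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
	/* ... and redirect the second event's samples into that buffer */
	ioctl(fd2, PERF_EVENT_IOC_SET_OUTPUT, fd1);

	/*
	 * Block on fd1 only.  Overflow wakeups raised on behalf of fd2 used
	 * to go to fd2's waitqueue, so this poll() could miss them.
	 */
	pfd.fd = fd1;
	pfd.events = POLLIN;
	poll(&pfd, 1, -1);

	return 0;
}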
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/events/core.c         86
-rw-r--r--  kernel/events/internal.h      3
-rw-r--r--  kernel/events/ring_buffer.c   3
3 files changed, 90 insertions, 2 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b0c1186fd97b..600c1629b64d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -185,6 +185,9 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 static void update_context_time(struct perf_event_context *ctx);
 static u64 perf_event_time(struct perf_event *event);
 
+static void ring_buffer_attach(struct perf_event *event,
+			       struct ring_buffer *rb);
+
 void __weak perf_event_print_debug(void) { }
 
 extern __weak const char *perf_pmu_name(void)
@@ -3191,12 +3194,33 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 	struct ring_buffer *rb;
 	unsigned int events = POLL_HUP;
 
+	/*
+	 * Race between perf_event_set_output() and perf_poll(): perf_poll()
+	 * grabs the rb reference but perf_event_set_output() overrides it.
+	 * Here is the timeline for two threads T1, T2:
+	 * t0: T1, rb = rcu_dereference(event->rb)
+	 * t1: T2, old_rb = event->rb
+	 * t2: T2, event->rb = new rb
+	 * t3: T2, ring_buffer_detach(old_rb)
+	 * t4: T1, ring_buffer_attach(rb1)
+	 * t5: T1, poll_wait(event->waitq)
+	 *
+	 * To avoid this problem, we grab mmap_mutex in perf_poll()
+	 * thereby ensuring that the assignment of the new ring buffer
+	 * and the detachment of the old buffer appear atomic to perf_poll()
+	 */
+	mutex_lock(&event->mmap_mutex);
+
 	rcu_read_lock();
 	rb = rcu_dereference(event->rb);
-	if (rb)
+	if (rb) {
+		ring_buffer_attach(event, rb);
 		events = atomic_xchg(&rb->poll, 0);
+	}
 	rcu_read_unlock();
 
+	mutex_unlock(&event->mmap_mutex);
+
 	poll_wait(file, &event->waitq, wait);
 
 	return events;
@@ -3497,6 +3521,49 @@ unlock:
 	return ret;
 }
 
+static void ring_buffer_attach(struct perf_event *event,
+			       struct ring_buffer *rb)
+{
+	unsigned long flags;
+
+	if (!list_empty(&event->rb_entry))
+		return;
+
+	spin_lock_irqsave(&rb->event_lock, flags);
+	if (!list_empty(&event->rb_entry))
+		goto unlock;
+
+	list_add(&event->rb_entry, &rb->event_list);
+unlock:
+	spin_unlock_irqrestore(&rb->event_lock, flags);
+}
+
+static void ring_buffer_detach(struct perf_event *event,
+			       struct ring_buffer *rb)
+{
+	unsigned long flags;
+
+	if (list_empty(&event->rb_entry))
+		return;
+
+	spin_lock_irqsave(&rb->event_lock, flags);
+	list_del_init(&event->rb_entry);
+	wake_up_all(&event->waitq);
+	spin_unlock_irqrestore(&rb->event_lock, flags);
+}
+
+static void ring_buffer_wakeup(struct perf_event *event)
+{
+	struct ring_buffer *rb;
+
+	rcu_read_lock();
+	rb = rcu_dereference(event->rb);
+	list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
+		wake_up_all(&event->waitq);
+	}
+	rcu_read_unlock();
+}
+
 static void rb_free_rcu(struct rcu_head *rcu_head)
 {
 	struct ring_buffer *rb;
@@ -3522,9 +3589,19 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
 
 static void ring_buffer_put(struct ring_buffer *rb)
 {
+	struct perf_event *event, *n;
+	unsigned long flags;
+
 	if (!atomic_dec_and_test(&rb->refcount))
 		return;
 
+	spin_lock_irqsave(&rb->event_lock, flags);
+	list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
+		list_del_init(&event->rb_entry);
+		wake_up_all(&event->waitq);
+	}
+	spin_unlock_irqrestore(&rb->event_lock, flags);
+
 	call_rcu(&rb->rcu_head, rb_free_rcu);
 }
 
@@ -3547,6 +3624,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 		atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
 		vma->vm_mm->pinned_vm -= event->mmap_locked;
 		rcu_assign_pointer(event->rb, NULL);
+		ring_buffer_detach(event, rb);
 		mutex_unlock(&event->mmap_mutex);
 
 		ring_buffer_put(rb);
@@ -3701,7 +3779,7 @@ static const struct file_operations perf_fops = {
 
 void perf_event_wakeup(struct perf_event *event)
 {
-	wake_up_all(&event->waitq);
+	ring_buffer_wakeup(event);
 
 	if (event->pending_kill) {
 		kill_fasync(&event->fasync, SIGIO, event->pending_kill);
@@ -5823,6 +5901,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	INIT_LIST_HEAD(&event->group_entry);
 	INIT_LIST_HEAD(&event->event_entry);
 	INIT_LIST_HEAD(&event->sibling_list);
+	INIT_LIST_HEAD(&event->rb_entry);
+
 	init_waitqueue_head(&event->waitq);
 	init_irq_work(&event->pending, perf_pending_event);
 
@@ -6029,6 +6109,8 @@ set:
 
 	old_rb = event->rb;
 	rcu_assign_pointer(event->rb, rb);
+	if (old_rb)
+		ring_buffer_detach(event, old_rb);
 	ret = 0;
 unlock:
 	mutex_unlock(&event->mmap_mutex);
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 09097dd8116c..64568a699375 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -22,6 +22,9 @@ struct ring_buffer {
 	local_t				lost;		/* nr records lost   */
 
 	long				watermark;	/* wakeup watermark  */
+	/* poll crap */
+	spinlock_t			event_lock;
+	struct list_head		event_list;
 
 	struct perf_event_mmap_page	*user_page;
 	void				*data_pages[0];
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index a2a29205cc0f..7f3011c6b57f 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -209,6 +209,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
 	rb->writable = 1;
 
 	atomic_set(&rb->refcount, 1);
+
+	INIT_LIST_HEAD(&rb->event_list);
+	spin_lock_init(&rb->event_lock);
 }
 
 #ifndef CONFIG_PERF_USE_VMALLOC