author	Linus Torvalds <torvalds@linux-foundation.org>	2013-11-01 15:54:51 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-11-01 15:54:51 -0400
commit	9581b7d2689ad9a0a20ddaec0427b46ad774585b (patch)
tree	f2eb9c9e4f2066c2ff1a7a3853ee6231ef879c11
parent	9119e33e507d3b8fe234f2c2bafd3199ea2e883b (diff)
parent	e8a923cc1fff6e627f906655ad52ee694ef2f6d7 (diff)
Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull perf fixes from Ingo Molnar:
 "Two fixes:

   - Fix 'NMI handler took too long to run' false positives

     [ Genuine NMI overhead speedups will come for v3.13, this commit
       only fixes a measurement bug ]

   - Fix perf ring-buffer missed barrier causing (rare) ring-buffer
     data corruption on ppc64"

* 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  perf/x86: Fix NMI measurements
  perf: Fix perf ring buffer memory ordering
-rw-r--r--	arch/x86/kernel/cpu/perf_event.c	6
-rw-r--r--	arch/x86/kernel/nmi.c	4
-rw-r--r--	include/uapi/linux/perf_event.h	12
-rw-r--r--	kernel/events/ring_buffer.c	31
4 files changed, 39 insertions(+), 14 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 9d8449158cf9..8a87a3224121 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1276,16 +1276,16 @@ void perf_events_lapic_init(void)
 static int __kprobes
 perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
 {
-	int ret;
 	u64 start_clock;
 	u64 finish_clock;
+	int ret;
 
 	if (!atomic_read(&active_events))
 		return NMI_DONE;
 
-	start_clock = local_clock();
+	start_clock = sched_clock();
 	ret = x86_pmu.handle_irq(regs);
-	finish_clock = local_clock();
+	finish_clock = sched_clock();
 
 	perf_sample_event_took(finish_clock - start_clock);
 
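The hunk above keeps the wrap-and-measure pattern intact and only swaps the clock source used to time the handler. A minimal userspace analogue of that pattern, assuming clock_gettime(CLOCK_MONOTONIC) as a stand-in for sched_clock() and a dummy handler in place of x86_pmu.handle_irq(), might look like this:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* Stand-in for sched_clock(): a monotonic nanosecond timestamp. */
static uint64_t now_ns(void)
{
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* Hypothetical handler standing in for x86_pmu.handle_irq(). */
static int handle_irq(void)
{
	return 1;
}

int main(void)
{
	uint64_t start_clock, finish_clock;
	int ret;

	start_clock = now_ns();
	ret = handle_irq();
	finish_clock = now_ns();

	/* The kernel feeds this delta to perf_sample_event_took(). */
	printf("handler returned %d after %llu ns\n",
	       ret, (unsigned long long)(finish_clock - start_clock));
	return 0;
}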
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index ba77ebc2c353..6fcb49ce50a1 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -113,10 +113,10 @@ static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
 		u64 before, delta, whole_msecs;
 		int remainder_ns, decimal_msecs, thishandled;
 
-		before = local_clock();
+		before = sched_clock();
 		thishandled = a->handler(type, regs);
 		handled += thishandled;
-		delta = local_clock() - before;
+		delta = sched_clock() - before;
 		trace_nmi_handler(a->handler, (int)delta, thishandled);
 
 		if (delta < nmi_longest_ns)
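For context, the delta measured above is compared against nmi_longest_ns and, when it exceeds the threshold, split into whole milliseconds and a fractional part for the "NMI handler took too long to run" message; that is what the whole_msecs, remainder_ns and decimal_msecs declarations in this hunk are for. A hedged sketch of that conversion (the exact message text and the kernel's do_div() mechanics are not shown in this hunk, so this is an approximation):

#include <inttypes.h>
#include <stdio.h>

/* Split a nanosecond delta the way the surrounding nmi_handle() code
 * appears to: whole milliseconds plus three decimal places. */
static void report_long_handler(uint64_t delta_ns)
{
	uint64_t whole_msecs = delta_ns / (1000 * 1000);
	int remainder_ns = (int)(delta_ns % (1000 * 1000));
	int decimal_msecs = remainder_ns / 1000;

	printf("INFO: NMI handler took too long to run: %" PRIu64 ".%03d msecs\n",
	       whole_msecs, decimal_msecs);
}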
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 009a655a5d35..2fc1602e23bb 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -456,13 +456,15 @@ struct perf_event_mmap_page {
 	/*
 	 * Control data for the mmap() data buffer.
 	 *
-	 * User-space reading the @data_head value should issue an rmb(), on
-	 * SMP capable platforms, after reading this value -- see
-	 * perf_event_wakeup().
+	 * User-space reading the @data_head value should issue an smp_rmb(),
+	 * after reading this value.
 	 *
 	 * When the mapping is PROT_WRITE the @data_tail value should be
-	 * written by userspace to reflect the last read data. In this case
-	 * the kernel will not over-write unread data.
+	 * written by userspace to reflect the last read data, after issuing
+	 * an smp_mb() to separate the data read from the ->data_tail store.
+	 * In this case the kernel will not over-write unread data.
+	 *
+	 * See perf_output_put_handle() for the data ordering.
 	 */
 	__u64	data_head;		/* head in the data section */
 	__u64	data_tail;		/* user-space written tail */
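The updated comment fully specifies the userspace half of the protocol: read data_head, fence, read the data, fence, then publish data_tail. A sketch of a consumer following that ordering, assuming C11 fences as portable stand-ins for smp_rmb()/smp_mb() and a simplified byte-stream view of the buffer (real perf records carry headers and sizes, omitted here):

#include <stdatomic.h>
#include <stddef.h>
#include <stdint.h>

/* Minimal stand-in for the head/tail words of perf_event_mmap_page. */
struct ring_ctrl {
	volatile uint64_t data_head;	/* written by the kernel */
	volatile uint64_t data_tail;	/* written by us (PROT_WRITE mapping) */
};

/* Copy out whatever is available, honouring the documented ordering. */
static size_t ring_consume(struct ring_ctrl *pg, const uint8_t *data,
			   uint64_t size_mask, uint8_t *out, size_t out_len)
{
	uint64_t head = pg->data_head;			/* READ ->data_head */
	atomic_thread_fence(memory_order_acquire);	/* the smp_rmb()    */

	uint64_t tail = pg->data_tail;
	size_t avail = (size_t)(head - tail);
	if (avail > out_len)
		avail = out_len;

	for (size_t i = 0; i < avail; i++)		/* READ $data */
		out[i] = data[(tail + i) & size_mask];

	atomic_thread_fence(memory_order_seq_cst);	/* the smp_mb()      */
	pg->data_tail = tail + avail;			/* WRITE ->data_tail */
	return avail;
}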
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index cd55144270b5..9c2ddfbf4525 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -87,10 +87,31 @@ again:
 		goto out;
 
 	/*
-	 * Publish the known good head. Rely on the full barrier implied
-	 * by atomic_dec_and_test() order the rb->head read and this
-	 * write.
+	 * Since the mmap() consumer (userspace) can run on a different CPU:
+	 *
+	 *   kernel				user
+	 *
+	 *   READ ->data_tail			READ ->data_head
+	 *   smp_mb()	(A)			smp_rmb()	(C)
+	 *   WRITE $data			READ $data
+	 *   smp_wmb()	(B)			smp_mb()	(D)
+	 *   STORE ->data_head			WRITE ->data_tail
+	 *
+	 * Where A pairs with D, and B pairs with C.
+	 *
+	 * I don't think A needs to be a full barrier because we won't in fact
+	 * write data until we see the store from userspace. So we simply don't
+	 * issue the data WRITE until we observe it. Be conservative for now.
+	 *
+	 * OTOH, D needs to be a full barrier since it separates the data READ
+	 * from the tail WRITE.
+	 *
+	 * For B a WMB is sufficient since it separates two WRITEs, and for C
+	 * an RMB is sufficient since it separates two READs.
+	 *
+	 * See perf_output_begin().
 	 */
+	smp_wmb();
 	rb->user_page->data_head = head;
 
 	/*
@@ -154,9 +175,11 @@ int perf_output_begin(struct perf_output_handle *handle,
 		 * Userspace could choose to issue a mb() before updating the
 		 * tail pointer. So that all reads will be completed before the
 		 * write is issued.
+		 *
+		 * See perf_output_put_handle().
 		 */
 		tail = ACCESS_ONCE(rb->user_page->data_tail);
-		smp_rmb();
+		smp_mb();
 		offset = head = local_read(&rb->head);
 		head += size;
 		if (unlikely(!perf_output_space(rb, tail, offset, head)))
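Taken together, the two hunks implement the kernel side of the A/B/C/D pairing from the comment: a full smp_mb() at the read of data_tail (A, the smp_rmb() upgraded in the second hunk) and an smp_wmb() before publishing data_head (B), with userspace expected to provide (C) and (D). A toy single-slot model of the pairing in C11, where seq_cst fences stand in for smp_mb() and release/acquire fences for smp_wmb()/smp_rmb(); the slot and the empty/full convention are invented for illustration, not the real perf layout:

#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t data_head, data_tail;
static uint64_t slot;			/* the $data payload */

static void producer(uint64_t value)	/* kernel side */
{
	uint64_t tail = atomic_load_explicit(&data_tail, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* (A) full barrier */
	if (atomic_load_explicit(&data_head, memory_order_relaxed) != tail)
		return;				/* slot still unconsumed */
	slot = value;					/* WRITE $data */
	atomic_thread_fence(memory_order_release);	/* (B) smp_wmb() */
	atomic_store_explicit(&data_head, tail + 1, memory_order_relaxed);
}

static int consumer(uint64_t *out)	/* user side */
{
	uint64_t head = atomic_load_explicit(&data_head, memory_order_relaxed);
	atomic_thread_fence(memory_order_acquire);	/* (C) smp_rmb() */
	uint64_t tail = atomic_load_explicit(&data_tail, memory_order_relaxed);
	if (head == tail)
		return 0;				/* empty */
	*out = slot;					/* READ $data */
	atomic_thread_fence(memory_order_seq_cst);	/* (D) full barrier */
	atomic_store_explicit(&data_tail, head, memory_order_relaxed);
	return 1;
}

Without (D) being a full barrier, the tail store could become visible before the data read completes, letting the producer overwrite a slot the consumer is still reading; that is the (rare) ppc64 corruption the pull message refers to.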