diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-11-01 15:54:51 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-11-01 15:54:51 -0400 |
commit | 9581b7d2689ad9a0a20ddaec0427b46ad774585b (patch) | |
tree | f2eb9c9e4f2066c2ff1a7a3853ee6231ef879c11 | |
parent | 9119e33e507d3b8fe234f2c2bafd3199ea2e883b (diff) | |
parent | e8a923cc1fff6e627f906655ad52ee694ef2f6d7 (diff) |
Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull perf fixes from Ingo Molnar:
"Two fixes:
- Fix 'NMI handler took too long to run' false positives
[ Genuine NMI overhead speedups will come for v3.13, this commit
only fixes a measurement bug ]
- Fix perf ring-buffer missed barrier causing (rare) ring-buffer data
corruption on ppc64"
* 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
perf/x86: Fix NMI measurements
perf: Fix perf ring buffer memory ordering
-rw-r--r-- | arch/x86/kernel/cpu/perf_event.c | 6 | ||||
-rw-r--r-- | arch/x86/kernel/nmi.c | 4 | ||||
-rw-r--r-- | include/uapi/linux/perf_event.h | 12 | ||||
-rw-r--r-- | kernel/events/ring_buffer.c | 31 |
4 files changed, 39 insertions, 14 deletions
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9d8449158cf9..8a87a3224121 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c | |||
@@ -1276,16 +1276,16 @@ void perf_events_lapic_init(void) | |||
1276 | static int __kprobes | 1276 | static int __kprobes |
1277 | perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) | 1277 | perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) |
1278 | { | 1278 | { |
1279 | int ret; | ||
1280 | u64 start_clock; | 1279 | u64 start_clock; |
1281 | u64 finish_clock; | 1280 | u64 finish_clock; |
1281 | int ret; | ||
1282 | 1282 | ||
1283 | if (!atomic_read(&active_events)) | 1283 | if (!atomic_read(&active_events)) |
1284 | return NMI_DONE; | 1284 | return NMI_DONE; |
1285 | 1285 | ||
1286 | start_clock = local_clock(); | 1286 | start_clock = sched_clock(); |
1287 | ret = x86_pmu.handle_irq(regs); | 1287 | ret = x86_pmu.handle_irq(regs); |
1288 | finish_clock = local_clock(); | 1288 | finish_clock = sched_clock(); |
1289 | 1289 | ||
1290 | perf_sample_event_took(finish_clock - start_clock); | 1290 | perf_sample_event_took(finish_clock - start_clock); |
1291 | 1291 | ||
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index ba77ebc2c353..6fcb49ce50a1 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c | |||
@@ -113,10 +113,10 @@ static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2 | |||
113 | u64 before, delta, whole_msecs; | 113 | u64 before, delta, whole_msecs; |
114 | int remainder_ns, decimal_msecs, thishandled; | 114 | int remainder_ns, decimal_msecs, thishandled; |
115 | 115 | ||
116 | before = local_clock(); | 116 | before = sched_clock(); |
117 | thishandled = a->handler(type, regs); | 117 | thishandled = a->handler(type, regs); |
118 | handled += thishandled; | 118 | handled += thishandled; |
119 | delta = local_clock() - before; | 119 | delta = sched_clock() - before; |
120 | trace_nmi_handler(a->handler, (int)delta, thishandled); | 120 | trace_nmi_handler(a->handler, (int)delta, thishandled); |
121 | 121 | ||
122 | if (delta < nmi_longest_ns) | 122 | if (delta < nmi_longest_ns) |
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 009a655a5d35..2fc1602e23bb 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h | |||
@@ -456,13 +456,15 @@ struct perf_event_mmap_page { | |||
456 | /* | 456 | /* |
457 | * Control data for the mmap() data buffer. | 457 | * Control data for the mmap() data buffer. |
458 | * | 458 | * |
459 | * User-space reading the @data_head value should issue an rmb(), on | 459 | * User-space reading the @data_head value should issue an smp_rmb(), |
460 | * SMP capable platforms, after reading this value -- see | 460 | * after reading this value. |
461 | * perf_event_wakeup(). | ||
462 | * | 461 | * |
463 | * When the mapping is PROT_WRITE the @data_tail value should be | 462 | * When the mapping is PROT_WRITE the @data_tail value should be |
464 | * written by userspace to reflect the last read data. In this case | 463 | * written by userspace to reflect the last read data, after issueing |
465 | * the kernel will not over-write unread data. | 464 | * an smp_mb() to separate the data read from the ->data_tail store. |
465 | * In this case the kernel will not over-write unread data. | ||
466 | * | ||
467 | * See perf_output_put_handle() for the data ordering. | ||
466 | */ | 468 | */ |
467 | __u64 data_head; /* head in the data section */ | 469 | __u64 data_head; /* head in the data section */ |
468 | __u64 data_tail; /* user-space written tail */ | 470 | __u64 data_tail; /* user-space written tail */ |
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index cd55144270b5..9c2ddfbf4525 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c | |||
@@ -87,10 +87,31 @@ again: | |||
87 | goto out; | 87 | goto out; |
88 | 88 | ||
89 | /* | 89 | /* |
90 | * Publish the known good head. Rely on the full barrier implied | 90 | * Since the mmap() consumer (userspace) can run on a different CPU: |
91 | * by atomic_dec_and_test() order the rb->head read and this | 91 | * |
92 | * write. | 92 | * kernel user |
93 | * | ||
94 | * READ ->data_tail READ ->data_head | ||
95 | * smp_mb() (A) smp_rmb() (C) | ||
96 | * WRITE $data READ $data | ||
97 | * smp_wmb() (B) smp_mb() (D) | ||
98 | * STORE ->data_head WRITE ->data_tail | ||
99 | * | ||
100 | * Where A pairs with D, and B pairs with C. | ||
101 | * | ||
102 | * I don't think A needs to be a full barrier because we won't in fact | ||
103 | * write data until we see the store from userspace. So we simply don't | ||
104 | * issue the data WRITE until we observe it. Be conservative for now. | ||
105 | * | ||
106 | * OTOH, D needs to be a full barrier since it separates the data READ | ||
107 | * from the tail WRITE. | ||
108 | * | ||
109 | * For B a WMB is sufficient since it separates two WRITEs, and for C | ||
110 | * an RMB is sufficient since it separates two READs. | ||
111 | * | ||
112 | * See perf_output_begin(). | ||
93 | */ | 113 | */ |
114 | smp_wmb(); | ||
94 | rb->user_page->data_head = head; | 115 | rb->user_page->data_head = head; |
95 | 116 | ||
96 | /* | 117 | /* |
@@ -154,9 +175,11 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
154 | * Userspace could choose to issue a mb() before updating the | 175 | * Userspace could choose to issue a mb() before updating the |
155 | * tail pointer. So that all reads will be completed before the | 176 | * tail pointer. So that all reads will be completed before the |
156 | * write is issued. | 177 | * write is issued. |
178 | * | ||
179 | * See perf_output_put_handle(). | ||
157 | */ | 180 | */ |
158 | tail = ACCESS_ONCE(rb->user_page->data_tail); | 181 | tail = ACCESS_ONCE(rb->user_page->data_tail); |
159 | smp_rmb(); | 182 | smp_mb(); |
160 | offset = head = local_read(&rb->head); | 183 | offset = head = local_read(&rb->head); |
161 | head += size; | 184 | head += size; |
162 | if (unlikely(!perf_output_space(rb, tail, offset, head))) | 185 | if (unlikely(!perf_output_space(rb, tail, offset, head))) |