author     Andi Kleen <ak@linux.intel.com>    2015-10-22 18:07:21 -0400
committer  Ingo Molnar <mingo@kernel.org>     2015-11-23 03:58:25 -0500
commit     75925e1ad7f5a4e867bd14ff8e7f114ea1596434
tree       a526761994dbe3c9db832b7e61defd95755669e7
parent     10013ebb5d7856c243541870f4e62fed68253e88
perf/x86: Optimize stack walk user accesses
Change the perf user stack walking to use the new __copy_from_user_nmi(), and split each access into word-sized transfers. This allows the complete access to be inlined and optimized down to a single load.

The main advantage is that this avoids the overhead of double page faults. When a normal copy_from_user() fails, it re-executes the copy to compute an accurate count of the bytes that were not copied, which means taking the expensive page fault twice. Faulting at some point while walking a stack is relatively common (typically when some part of the program isn't compiled with frame pointers), so this is a significant overhead.

The optimized copies avoid this problem because they perform each access only once. They are also much faster when the access does not fault, since they are single instructions rather than complex function calls.

While profiling a kernel build with -g, the patch brings the average time of the PMI handler down from 966ns to 552ns (-43%).

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1445551641-13379-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c  22
1 file changed, 19 insertions(+), 3 deletions(-)
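For orientation, here is a condensed, free-standing sketch of the pattern the patch adopts for the 64-bit walker; the diff that follows is the authoritative change. The whole loop runs with page faults disabled, and each field of the stack frame is fetched with its own word-sized __copy_from_user_nmi(), so a bad pointer makes exactly one access fail instead of going through the double-fault retry path of copy_from_user(). The function name walk_user_stack() and the simplified struct are illustrative only and do not appear in the patch.

/* Illustrative sketch only -- see perf_callchain_user() in the diff below. */
struct stack_frame {
	const void __user	*next_frame;
	unsigned long		return_address;
};

static void walk_user_stack(struct perf_callchain_entry *entry,
			    const void __user *fp)
{
	struct stack_frame frame;
	unsigned long bytes;

	pagefault_disable();	/* a faulting access simply fails, no retry */
	while (entry->nr < PERF_MAX_STACK_DEPTH) {
		if (!access_ok(VERIFY_READ, fp, 16))
			break;

		/* word-sized copies inline down to single loads */
		bytes = __copy_from_user_nmi(&frame.next_frame, fp, 8);
		if (bytes != 0)
			break;
		bytes = __copy_from_user_nmi(&frame.return_address, fp + 8, 8);
		if (bytes != 0)
			break;

		perf_callchain_store(entry, frame.return_address);
		fp = frame.next_frame;
	}
	pagefault_enable();
}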
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2bf79d7c97df..9dfbba5ce6e8 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2250,12 +2250,19 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 	ss_base = get_segment_base(regs->ss);
 
 	fp = compat_ptr(ss_base + regs->bp);
+	pagefault_disable();
 	while (entry->nr < PERF_MAX_STACK_DEPTH) {
 		unsigned long bytes;
 		frame.next_frame     = 0;
 		frame.return_address = 0;
 
-		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
+		if (!access_ok(VERIFY_READ, fp, 8))
+			break;
+
+		bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4);
+		if (bytes != 0)
+			break;
+		bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4);
 		if (bytes != 0)
 			break;
 
@@ -2265,6 +2272,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 		perf_callchain_store(entry, cs_base + frame.return_address);
 		fp = compat_ptr(ss_base + frame.next_frame);
 	}
+	pagefault_enable();
 	return 1;
 }
 #else
@@ -2302,12 +2310,19 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 	if (perf_callchain_user32(regs, entry))
 		return;
 
+	pagefault_disable();
 	while (entry->nr < PERF_MAX_STACK_DEPTH) {
 		unsigned long bytes;
 		frame.next_frame     = NULL;
 		frame.return_address = 0;
 
-		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
+		if (!access_ok(VERIFY_READ, fp, 16))
+			break;
+
+		bytes = __copy_from_user_nmi(&frame.next_frame, fp, 8);
+		if (bytes != 0)
+			break;
+		bytes = __copy_from_user_nmi(&frame.return_address, fp+8, 8);
 		if (bytes != 0)
 			break;
 
@@ -2315,8 +2330,9 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 			break;
 
 		perf_callchain_store(entry, frame.return_address);
-		fp = frame.next_frame;
+		fp = (void __user *)frame.next_frame;
 	}
+	pagefault_enable();
 }
 
 /*