aboutsummaryrefslogtreecommitdiffstats
path: root/tools/perf/builtin-stat.c
diff options
context:
space:
mode:
authorPaul Mackerras <paulus@samba.org>2009-06-29 07:13:21 -0400
committerIngo Molnar <mingo@elte.hu>2009-06-29 16:38:09 -0400
commit051ae7f7344f453616b6b10332d4d8e1d40ed823 (patch)
tree7d03f94c0a219b52fa604358c2ed835c1c861b35 /tools/perf/builtin-stat.c
parent210ad39fb7ef0bc0494483f517f42524f16bb2a7 (diff)
perf_counter tools: Reduce perf stat measurement overhead/skew
Vince Weaver reported a 'perf stat' measurement overhead in the count of retired instructions, which can amount to a +6000 instructions inflated count in the reported count. At present, perf stat creates its counters on the perf process. Thus the counters count the fork and various other activity in both the parent and child, such as the resolver overhead for resolving PLT entries for any libc functions that haven't been called before, such as execvp. This reduces the overhead by creating the counters on the child process after the fork, using a couple of pipes to synchronize so that the child process waits until the parent has created the counters before doing the exec. To eliminate the PLT resolution overhead on calling execvp, this does a dummy execvp first which will always fail. With this, the overhead of executing a program goes down from over 4800 instructions to about 90 instructions on powerpc (32-bit). This was measured with a statically-linked program written in assembler which only does the 3 instructions needed to call _exit(0). Before: $ perf stat -e 0:1:u ./three Performance counter stats for './three': 4858 instructions 0.001274523 seconds time elapsed After: $ perf stat -e 0:1:u ./three Performance counter stats for './three': 92 instructions 0.000468153 seconds time elapsed Reported-by: Vince Weaver <vince@deater.net> Signed-off-by: Paul Mackerras <paulus@samba.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <19016.41425.814043.870352@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'tools/perf/builtin-stat.c')
-rw-r--r--tools/perf/builtin-stat.c64
1 files changed, 50 insertions, 14 deletions
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index c5a290727a92..201ef2367dcb 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -99,7 +99,7 @@ static u64 runtime_cycles_noise;
99#define ERR_PERF_OPEN \ 99#define ERR_PERF_OPEN \
100"Error: counter %d, sys_perf_counter_open() syscall returned with %d (%s)\n" 100"Error: counter %d, sys_perf_counter_open() syscall returned with %d (%s)\n"
101 101
102static void create_perf_stat_counter(int counter) 102static void create_perf_stat_counter(int counter, int pid)
103{ 103{
104 struct perf_counter_attr *attr = attrs + counter; 104 struct perf_counter_attr *attr = attrs + counter;
105 105
@@ -119,7 +119,7 @@ static void create_perf_stat_counter(int counter)
119 attr->inherit = inherit; 119 attr->inherit = inherit;
120 attr->disabled = 1; 120 attr->disabled = 1;
121 121
122 fd[0][counter] = sys_perf_counter_open(attr, 0, -1, -1, 0); 122 fd[0][counter] = sys_perf_counter_open(attr, pid, -1, -1, 0);
123 if (fd[0][counter] < 0 && verbose) 123 if (fd[0][counter] < 0 && verbose)
124 fprintf(stderr, ERR_PERF_OPEN, counter, 124 fprintf(stderr, ERR_PERF_OPEN, counter,
125 fd[0][counter], strerror(errno)); 125 fd[0][counter], strerror(errno));
@@ -205,12 +205,58 @@ static int run_perf_stat(int argc, const char **argv)
205 int status = 0; 205 int status = 0;
206 int counter; 206 int counter;
207 int pid; 207 int pid;
208 int child_ready_pipe[2], go_pipe[2];
209 char buf;
208 210
209 if (!system_wide) 211 if (!system_wide)
210 nr_cpus = 1; 212 nr_cpus = 1;
211 213
214 if (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0) {
215 perror("failed to create pipes");
216 exit(1);
217 }
218
219 if ((pid = fork()) < 0)
220 perror("failed to fork");
221
222 if (!pid) {
223 close(child_ready_pipe[0]);
224 close(go_pipe[1]);
225 fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);
226
227 /*
228 * Do a dummy execvp to get the PLT entry resolved,
229 * so we avoid the resolver overhead on the real
230 * execvp call.
231 */
232 execvp("", (char **)argv);
233
234 /*
235 * Tell the parent we're ready to go
236 */
237 close(child_ready_pipe[1]);
238
239 /*
240 * Wait until the parent tells us to go.
241 */
242 read(go_pipe[0], &buf, 1);
243
244 execvp(argv[0], (char **)argv);
245
246 perror(argv[0]);
247 exit(-1);
248 }
249
250 /*
251 * Wait for the child to be ready to exec.
252 */
253 close(child_ready_pipe[1]);
254 close(go_pipe[0]);
255 read(child_ready_pipe[0], &buf, 1);
256 close(child_ready_pipe[0]);
257
212 for (counter = 0; counter < nr_counters; counter++) 258 for (counter = 0; counter < nr_counters; counter++)
213 create_perf_stat_counter(counter); 259 create_perf_stat_counter(counter, pid);
214 260
215 /* 261 /*
216 * Enable counters and exec the command: 262 * Enable counters and exec the command:
@@ -218,19 +264,9 @@ static int run_perf_stat(int argc, const char **argv)
218 t0 = rdclock(); 264 t0 = rdclock();
219 prctl(PR_TASK_PERF_COUNTERS_ENABLE); 265 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
220 266
221 if ((pid = fork()) < 0) 267 close(go_pipe[1]);
222 perror("failed to fork");
223
224 if (!pid) {
225 if (execvp(argv[0], (char **)argv)) {
226 perror(argv[0]);
227 exit(-1);
228 }
229 }
230
231 wait(&status); 268 wait(&status);
232 269
233 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
234 t1 = rdclock(); 270 t1 = rdclock();
235 271
236 walltime_nsecs[run_idx] = t1 - t0; 272 walltime_nsecs[run_idx] = t1 - t0;