author		Peter Zijlstra <peterz@infradead.org>	2015-02-20 08:05:38 -0500
committer	Ingo Molnar <mingo@kernel.org>	2015-03-27 05:13:22 -0400
commit		34f439278cef7b1177f8ce24f9fc81dfc6221d3b (patch)
tree		8bd86bf3d73aff36e8bee13c0102c7ae7e44e40c
parent		b381e63b48a0b6befc7b4e55408c39012a0dcf8c (diff)
perf: Add per event clockid support
While thinking on the whole clock discussion it occurred to me we have
two distinct uses of time:

 1) the tracking of event/ctx/cgroup enabled/running/stopped times
    which includes the self-monitoring support in struct
    perf_event_mmap_page.

 2) the actual timestamps visible in the data records.

And we've been conflating them.

The first is all about tracking time deltas; nobody should really care
in what time base that happens, it's all relative information, and as
long as it's internally consistent it works.

The second however is what people are worried about when having to
merge their data with external sources. And here we have the
discussion on MONOTONIC vs MONOTONIC_RAW etc.

Where MONOTONIC is good for correlating between machines (static
offset), MONOTONIC_RAW is required for correlating against a fixed
rate hardware clock.

This means configurability; now 1) makes that hard because it needs to
be internally consistent across groups of unrelated events; which is
why we had to have a global perf_clock().

However, for 2) it doesn't really matter, perf itself doesn't care
what it writes into the buffer.

The below patch makes the distinction between these two cases by
adding perf_event_clock(), which is used for the second case. It
further makes this configurable on a per-event basis, but adds a few
sanity checks such that we cannot combine events with different clocks
in confusing ways.

And since we then have per-event configurability we might as well
retain the 'legacy' behaviour as a default.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
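[Editor's illustration, not part of the patch.] A minimal userspace sketch of the interface added below: it opens a hardware event whose PERF_SAMPLE_TIME values come from CLOCK_MONOTONIC_RAW via the new use_clockid/clockid attr fields. It assumes uapi headers and a kernel carrying this change.

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TIME;

	/* New in this patch: select the clock for record timestamps. */
	attr.use_clockid = 1;
	attr.clockid = CLOCK_MONOTONIC_RAW;	/* NMI-safe, valid for any PMU */

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	close(fd);
	return 0;
}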
 arch/x86/kernel/cpu/perf_event.c | 14
 include/linux/perf_event.h       |  2
 include/uapi/linux/perf_event.h  |  6
 kernel/events/core.c             | 77
 4 files changed, 91 insertions(+), 8 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index ac41b3ad1fc9..0420ebcac116 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1978,13 +1978,23 @@ void arch_perf_update_userpage(struct perf_event *event,
 
 	data = cyc2ns_read_begin();
 
+	/*
+	 * Internal timekeeping for enabled/running/stopped times
+	 * is always in the local_clock domain.
+	 */
 	userpg->cap_user_time = 1;
 	userpg->time_mult = data->cyc2ns_mul;
 	userpg->time_shift = data->cyc2ns_shift;
 	userpg->time_offset = data->cyc2ns_offset - now;
 
-	userpg->cap_user_time_zero = 1;
-	userpg->time_zero = data->cyc2ns_offset;
+	/*
+	 * cap_user_time_zero doesn't make sense when we're using a different
+	 * time base for the records.
+	 */
+	if (event->clock == &local_clock) {
+		userpg->cap_user_time_zero = 1;
+		userpg->time_zero = data->cyc2ns_offset;
+	}
 
 	cyc2ns_read_end(data);
 }
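[Editor's illustration.] For context on why cap_user_time_zero is now gated on event->clock == &local_clock: time_zero/time_mult/time_shift let userspace convert raw TSC readings into record timestamps, which only lines up when the records themselves are in the local_clock domain. A sketch of that conversion, following the scheme documented in include/uapi/linux/perf_event.h:

#include <stdint.h>

/* Convert a raw TSC value to a perf record timestamp using the
 * perf_event_mmap_page fields; valid only when the kernel has
 * advertised cap_user_time_zero for this event. */
static uint64_t tsc_to_perf_time(uint64_t cyc, uint16_t time_shift,
				 uint32_t time_mult, uint64_t time_zero)
{
	uint64_t quot = cyc >> time_shift;
	uint64_t rem  = cyc & (((uint64_t)1 << time_shift) - 1);

	return time_zero + quot * time_mult +
	       ((rem * time_mult) >> time_shift);
}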
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b16eac5f54ce..401554074de9 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -173,6 +173,7 @@ struct perf_event;
  * pmu::capabilities flags
  */
 #define PERF_PMU_CAP_NO_INTERRUPT		0x01
+#define PERF_PMU_CAP_NO_NMI			0x02
 
 /**
  * struct pmu - generic performance monitoring unit
@@ -457,6 +458,7 @@ struct perf_event {
 	struct pid_namespace		*ns;
 	u64				id;
 
+	u64				(*clock)(void);
 	perf_overflow_handler_t		overflow_handler;
 	void				*overflow_handler_context;
 
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 1e3cd07cf76e..3bb40ddadbe5 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -326,7 +326,8 @@ struct perf_event_attr {
 				exclude_callchain_user : 1, /* exclude user callchains */
 				mmap2          :  1, /* include mmap with inode data */
 				comm_exec      :  1, /* flag comm events that are due to an exec */
-				__reserved_1   : 39;
+				use_clockid    :  1, /* use @clockid for time fields */
+				__reserved_1   : 38;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -355,8 +356,7 @@ struct perf_event_attr {
 	 */
 	__u32	sample_stack_user;
 
-	/* Align to u64. */
-	__u32	__reserved_2;
+	__s32	clockid;
 	/*
 	 * Defines set of regs to dump for each sample
 	 * state captured on:
diff --git a/kernel/events/core.c b/kernel/events/core.c
index bb1a7c36e794..c40c2cac2d8e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -327,6 +327,11 @@ static inline u64 perf_clock(void)
 	return local_clock();
 }
 
+static inline u64 perf_event_clock(struct perf_event *event)
+{
+	return event->clock();
+}
+
 static inline struct perf_cpu_context *
 __get_cpu_context(struct perf_event_context *ctx)
 {
@@ -4762,7 +4767,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
 	}
 
 	if (sample_type & PERF_SAMPLE_TIME)
-		data->time = perf_clock();
+		data->time = perf_event_clock(event);
 
 	if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
 		data->id = primary_event_id(event);
@@ -5340,6 +5345,8 @@ static void perf_event_task_output(struct perf_event *event,
 	task_event->event_id.tid = perf_event_tid(event, task);
 	task_event->event_id.ptid = perf_event_tid(event, current);
 
+	task_event->event_id.time = perf_event_clock(event);
+
 	perf_output_put(&handle, task_event->event_id);
 
 	perf_event__output_id_sample(event, &handle, &sample);
@@ -5373,7 +5380,7 @@ static void perf_event_task(struct task_struct *task,
 			/* .ppid */
 			/* .tid  */
 			/* .ptid */
-			.time = perf_clock(),
+			/* .time */
 		},
 	};
 
@@ -5749,7 +5756,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
 			.misc = 0,
 			.size = sizeof(throttle_event),
 		},
-		.time = perf_clock(),
+		.time = perf_event_clock(event),
 		.id = primary_event_id(event),
 		.stream_id = event->id,
 	};
@@ -6293,6 +6300,8 @@ static int perf_swevent_init(struct perf_event *event)
 static struct pmu perf_swevent = {
 	.task_ctx_nr	= perf_sw_context,
 
+	.capabilities	= PERF_PMU_CAP_NO_NMI,
+
 	.event_init	= perf_swevent_init,
 	.add		= perf_swevent_add,
 	.del		= perf_swevent_del,
@@ -6636,6 +6645,8 @@ static int cpu_clock_event_init(struct perf_event *event)
 static struct pmu perf_cpu_clock = {
 	.task_ctx_nr	= perf_sw_context,
 
+	.capabilities	= PERF_PMU_CAP_NO_NMI,
+
 	.event_init	= cpu_clock_event_init,
 	.add		= cpu_clock_event_add,
 	.del		= cpu_clock_event_del,
@@ -6715,6 +6726,8 @@ static int task_clock_event_init(struct perf_event *event)
 static struct pmu perf_task_clock = {
 	.task_ctx_nr	= perf_sw_context,
 
+	.capabilities	= PERF_PMU_CAP_NO_NMI,
+
 	.event_init	= task_clock_event_init,
 	.add		= task_clock_event_add,
 	.del		= task_clock_event_del,
@@ -7200,6 +7213,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 		event->hw.target = task;
 	}
 
+	event->clock = &local_clock;
+	if (parent_event)
+		event->clock = parent_event->clock;
+
 	if (!overflow_handler && parent_event) {
 		overflow_handler = parent_event->overflow_handler;
 		context = parent_event->overflow_handler_context;
@@ -7422,6 +7439,12 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 	if (output_event->cpu == -1 && output_event->ctx != event->ctx)
 		goto out;
 
+	/*
+	 * Mixing clocks in the same buffer is trouble you don't need.
+	 */
+	if (output_event->clock != event->clock)
+		goto out;
+
 set:
 	mutex_lock(&event->mmap_mutex);
 	/* Can't redirect output if we've got an active mmap() */
@@ -7454,6 +7477,43 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b)
 	mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
 }
 
+static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
+{
+	bool nmi_safe = false;
+
+	switch (clk_id) {
+	case CLOCK_MONOTONIC:
+		event->clock = &ktime_get_mono_fast_ns;
+		nmi_safe = true;
+		break;
+
+	case CLOCK_MONOTONIC_RAW:
+		event->clock = &ktime_get_raw_fast_ns;
+		nmi_safe = true;
+		break;
+
+	case CLOCK_REALTIME:
+		event->clock = &ktime_get_real_ns;
+		break;
+
+	case CLOCK_BOOTTIME:
+		event->clock = &ktime_get_boot_ns;
+		break;
+
+	case CLOCK_TAI:
+		event->clock = &ktime_get_tai_ns;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
+		return -EINVAL;
+
+	return 0;
+}
+
 /**
  * sys_perf_event_open - open a performance event, associate it to a task/cpu
  *
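[Editor's illustration.] The nmi_safe distinction above is user-visible: CLOCK_MONOTONIC and CLOCK_MONOTONIC_RAW use the NMI-safe fast accessors and are accepted for any PMU, while the remaining clocks are only accepted for PMUs flagged PERF_PMU_CAP_NO_NMI. A sketch probing that from userspace (try_clockid is a hypothetical helper, assuming a kernel with this patch):

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

/* Returns 0 if perf_event_open accepts the clockid for this event type. */
static int try_clockid(__u32 type, __u64 config, int clockid)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = type;
	attr.config = config;
	attr.use_clockid = 1;
	attr.clockid = clockid;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd >= 0)
		close(fd);
	return fd >= 0 ? 0 : -1;
}

int main(void)
{
	/* Expected on a patched kernel: software events take CLOCK_REALTIME
	 * (their PMU has PERF_PMU_CAP_NO_NMI); hardware events reject it
	 * with EINVAL, since their samples are taken from NMI context. */
	printf("sw + REALTIME: %s\n",
	       try_clockid(PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_CLOCK,
			   CLOCK_REALTIME) ? "rejected" : "ok");
	printf("hw + REALTIME: %s\n",
	       try_clockid(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES,
			   CLOCK_REALTIME) ? "rejected" : "ok");
	return 0;
}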
@@ -7569,6 +7629,12 @@ SYSCALL_DEFINE5(perf_event_open,
 	 */
 	pmu = event->pmu;
 
+	if (attr.use_clockid) {
+		err = perf_event_set_clock(event, attr.clockid);
+		if (err)
+			goto err_alloc;
+	}
+
 	if (group_leader &&
 	    (is_software_event(event) != is_software_event(group_leader))) {
 		if (is_software_event(event)) {
@@ -7618,6 +7684,11 @@ SYSCALL_DEFINE5(perf_event_open,
 		 */
 		if (group_leader->group_leader != group_leader)
 			goto err_context;
+
+		/* All events in a group should have the same clock */
+		if (group_leader->clock != event->clock)
+			goto err_context;
+
 		/*
 		 * Do not allow to attach to a group in a different
 		 * task or CPU context:
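[Editor's illustration.] A short sketch of the group sanity check above: a sibling opened with a clock different from its group leader's should be rejected with EINVAL (assuming a kernel with this patch).

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	int leader, sibling;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;
	attr.use_clockid = 1;
	attr.clockid = CLOCK_MONOTONIC;

	leader = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);

	/* Same attr but the default clock: group_leader->clock now differs
	 * from event->clock, so this open should fail with EINVAL. */
	attr.use_clockid = 0;
	attr.clockid = 0;
	sibling = syscall(__NR_perf_event_open, &attr, 0, -1, leader, 0);
	if (sibling < 0)
		perror("sibling open (EINVAL expected)");
	else
		close(sibling);

	if (leader >= 0)
		close(leader);
	return 0;
}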