author     Linus Torvalds <torvalds@linux-foundation.org>   2017-01-15 14:37:43 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2017-01-15 14:37:43 -0500
commit     79078c53baabee12dfefb0cfe00ca94cb2c35570 (patch)
tree       c8586ca3e125d757756b1b9a020615dcdbb09d0c
parent     255e6140fa76ec9d0e24f201427e7e9a9573f681 (diff)
parent     18e7a45af91acdde99d3aa1372cc40e1f8142f7b (diff)
Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull perf fixes from Ingo Molnar:
 "Misc race fixes uncovered by fuzzing efforts, a Sparse fix, two PMU
  driver fixes, plus miscellaneous tooling fixes"

* 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  perf/x86: Reject non sampling events with precise_ip
  perf/x86/intel: Account interrupts for PEBS errors
  perf/core: Fix concurrent sys_perf_event_open() vs. 'move_group' race
  perf/core: Fix sys_perf_event_open() vs. hotplug
  perf/x86/intel: Use ULL constant to prevent undefined shift behaviour
  perf/x86/intel/uncore: Fix hardcoded socket 0 assumption in the Haswell init code
  perf/x86: Set pmu->module in Intel PMU modules
  perf probe: Fix to probe on gcc generated symbols for offline kernel
  perf probe: Fix --funcs to show correct symbols for offline module
  perf symbols: Robustify reading of build-id from sysfs
  perf tools: Install tools/lib/traceevent plugins with install-bin
  tools lib traceevent: Fix prev/next_prio for deadline tasks
  perf record: Fix --switch-output documentation and comment
  perf record: Make __record_options static
  tools lib subcmd: Add OPT_STRING_OPTARG_SET option
  perf probe: Fix to get correct modname from elf header
  samples/bpf trace_output_user: Remove duplicate sys/ioctl.h include
  samples/bpf sock_example: Avoid getting ethhdr from two includes
  perf sched timehist: Show total scheduling time
-rw-r--r--  arch/x86/events/core.c                         4
-rw-r--r--  arch/x86/events/intel/core.c                   2
-rw-r--r--  arch/x86/events/intel/cstate.c                 2
-rw-r--r--  arch/x86/events/intel/ds.c                     6
-rw-r--r--  arch/x86/events/intel/rapl.c                   1
-rw-r--r--  arch/x86/events/intel/uncore.c                 1
-rw-r--r--  arch/x86/events/intel/uncore_snbep.c           2
-rw-r--r--  include/linux/perf_event.h                     1
-rw-r--r--  kernel/events/core.c                         175
-rw-r--r--  samples/bpf/sock_example.h                     2
-rw-r--r--  samples/bpf/trace_output_user.c                1
-rw-r--r--  tools/lib/subcmd/parse-options.c               3
-rw-r--r--  tools/lib/subcmd/parse-options.h               5
-rw-r--r--  tools/lib/traceevent/plugin_sched_switch.c     4
-rw-r--r--  tools/perf/Documentation/perf-record.txt       4
-rw-r--r--  tools/perf/Makefile.perf                       4
-rw-r--r--  tools/perf/builtin-record.c                    4
-rw-r--r--  tools/perf/builtin-sched.c                    17
-rw-r--r--  tools/perf/util/probe-event.c                105
-rw-r--r--  tools/perf/util/symbol-elf.c                   6
20 files changed, 257 insertions(+), 92 deletions(-)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 019c5887b698..1635c0c8df23 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -505,6 +505,10 @@ int x86_pmu_hw_config(struct perf_event *event)
 
 		if (event->attr.precise_ip > precise)
 			return -EOPNOTSUPP;
+
+		/* There's no sense in having PEBS for non sampling events: */
+		if (!is_sampling_event(event))
+			return -EINVAL;
 	}
 	/*
 	 * check that PEBS LBR correction does not conflict with
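Illustration (not part of the merged diff): with this check, perf_event_open() rejects a counting event that asks for PEBS via precise_ip instead of silently accepting it. A minimal userspace sketch, assuming is_sampling_event() keys off a non-zero sample_period:

#include <string.h>
#include <linux/perf_event.h>

/* A counting (non-sampling) event that still requests PEBS precision.
 * After this fix, opening such an event returns -EINVAL. */
static void fill_rejected_attr(struct perf_event_attr *attr)
{
	memset(attr, 0, sizeof(*attr));
	attr->size = sizeof(*attr);
	attr->type = PERF_TYPE_HARDWARE;
	attr->config = PERF_COUNT_HW_CPU_CYCLES;
	attr->precise_ip = 2;		/* ask for PEBS */
	attr->sample_period = 0;	/* counting mode -> !is_sampling_event() */
	/* a non-zero sample_period would keep the event valid */
}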
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 86138267b68a..d611cab214a6 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3987,7 +3987,7 @@ __init int intel_pmu_init(void)
 		       x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
 		x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC;
 	}
-	x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
+	x86_pmu.intel_ctrl = (1ULL << x86_pmu.num_counters) - 1;
 
 	if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) {
 		WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
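Illustration (not part of the merged diff): why the ULL suffix matters. With 32 general-purpose counters, the old expression shifts a 32-bit int by its full width, which is undefined behaviour in C; promoting the constant makes the shift a well-defined 64-bit operation:

#include <stdio.h>

int main(void)
{
	unsigned int num_counters = 32;

	/* (1 << num_counters) - 1 would be undefined behaviour here:
	 * the shift count equals the width of the 32-bit int operand. */
	unsigned long long mask = (1ULL << num_counters) - 1;

	printf("%#llx\n", mask);	/* prints 0xffffffff */
	return 0;
}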
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index fec8a461bdef..1076c9a77292 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -434,6 +434,7 @@ static struct pmu cstate_core_pmu = {
 	.stop		= cstate_pmu_event_stop,
 	.read		= cstate_pmu_event_update,
 	.capabilities	= PERF_PMU_CAP_NO_INTERRUPT,
+	.module		= THIS_MODULE,
 };
 
 static struct pmu cstate_pkg_pmu = {
@@ -447,6 +448,7 @@ static struct pmu cstate_pkg_pmu = {
 	.stop		= cstate_pmu_event_stop,
 	.read		= cstate_pmu_event_update,
 	.capabilities	= PERF_PMU_CAP_NO_INTERRUPT,
+	.module		= THIS_MODULE,
 };
 
 static const struct cstate_model nhm_cstates __initconst = {
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index be202390bbd3..9dfeeeca0ea8 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1389,9 +1389,13 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 			continue;
 
 		/* log dropped samples number */
-		if (error[bit])
+		if (error[bit]) {
 			perf_log_lost_samples(event, error[bit]);
 
+			if (perf_event_account_interrupt(event))
+				x86_pmu_stop(event, 0);
+		}
+
 		if (counts[bit]) {
 			__intel_pmu_pebs_event(event, iregs, base,
 					       top, bit, counts[bit]);
diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
index bd34124449b0..17c3564d087a 100644
--- a/arch/x86/events/intel/rapl.c
+++ b/arch/x86/events/intel/rapl.c
@@ -697,6 +697,7 @@ static int __init init_rapl_pmus(void)
 	rapl_pmus->pmu.start	= rapl_pmu_event_start;
 	rapl_pmus->pmu.stop	= rapl_pmu_event_stop;
 	rapl_pmus->pmu.read	= rapl_pmu_event_read;
+	rapl_pmus->pmu.module	= THIS_MODULE;
 	return 0;
 }
 
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 97c246f84dea..8c4ccdc3a3f3 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -733,6 +733,7 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
 			.start		= uncore_pmu_event_start,
 			.stop		= uncore_pmu_event_stop,
 			.read		= uncore_pmu_event_read,
+			.module		= THIS_MODULE,
 		};
 	} else {
 		pmu->pmu = *pmu->type->pmu;
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index e6832be714bc..dae2fedc1601 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -2686,7 +2686,7 @@ static struct intel_uncore_type *hswep_msr_uncores[] = {
 
 void hswep_uncore_cpu_init(void)
 {
-	int pkg = topology_phys_to_logical_pkg(0);
+	int pkg = boot_cpu_data.logical_proc_id;
 
 	if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
 		hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4741ecdb9817..78ed8105e64d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1259,6 +1259,7 @@ extern void perf_event_disable(struct perf_event *event);
 extern void perf_event_disable_local(struct perf_event *event);
 extern void perf_event_disable_inatomic(struct perf_event *event);
 extern void perf_event_task_tick(void);
+extern int perf_event_account_interrupt(struct perf_event *event);
 #else /* !CONFIG_PERF_EVENTS: */
 static inline void *
 perf_aux_output_begin(struct perf_output_handle *handle,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ab15509fab8c..110b38a58493 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2249,7 +2249,7 @@ static int __perf_install_in_context(void *info)
 	struct perf_event_context *ctx = event->ctx;
 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 	struct perf_event_context *task_ctx = cpuctx->task_ctx;
-	bool activate = true;
+	bool reprogram = true;
 	int ret = 0;
 
 	raw_spin_lock(&cpuctx->ctx.lock);
@@ -2257,27 +2257,26 @@ static int __perf_install_in_context(void *info)
 		raw_spin_lock(&ctx->lock);
 		task_ctx = ctx;
 
-		/* If we're on the wrong CPU, try again */
-		if (task_cpu(ctx->task) != smp_processor_id()) {
-			ret = -ESRCH;
-			goto unlock;
-		}
+		reprogram = (ctx->task == current);
 
 		/*
-		 * If we're on the right CPU, see if the task we target is
-		 * current, if not we don't have to activate the ctx, a future
-		 * context switch will do that for us.
+		 * If the task is running, it must be running on this CPU,
+		 * otherwise we cannot reprogram things.
+		 *
+		 * If its not running, we don't care, ctx->lock will
+		 * serialize against it becoming runnable.
 		 */
-		if (ctx->task != current)
-			activate = false;
-		else
-			WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx);
+		if (task_curr(ctx->task) && !reprogram) {
+			ret = -ESRCH;
+			goto unlock;
+		}
 
+		WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
 	} else if (task_ctx) {
 		raw_spin_lock(&task_ctx->lock);
 	}
 
-	if (activate) {
+	if (reprogram) {
 		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
 		add_event_to_ctx(event, ctx);
 		ctx_resched(cpuctx, task_ctx);
@@ -2328,13 +2327,36 @@ perf_install_in_context(struct perf_event_context *ctx,
 	/*
 	 * Installing events is tricky because we cannot rely on ctx->is_active
 	 * to be set in case this is the nr_events 0 -> 1 transition.
+	 *
+	 * Instead we use task_curr(), which tells us if the task is running.
+	 * However, since we use task_curr() outside of rq::lock, we can race
+	 * against the actual state. This means the result can be wrong.
+	 *
+	 * If we get a false positive, we retry, this is harmless.
+	 *
+	 * If we get a false negative, things are complicated. If we are after
+	 * perf_event_context_sched_in() ctx::lock will serialize us, and the
+	 * value must be correct. If we're before, it doesn't matter since
+	 * perf_event_context_sched_in() will program the counter.
+	 *
+	 * However, this hinges on the remote context switch having observed
+	 * our task->perf_event_ctxp[] store, such that it will in fact take
+	 * ctx::lock in perf_event_context_sched_in().
+	 *
+	 * We do this by task_function_call(), if the IPI fails to hit the task
+	 * we know any future context switch of task must see the
+	 * perf_event_ctpx[] store.
 	 */
-again:
+
 	/*
-	 * Cannot use task_function_call() because we need to run on the task's
-	 * CPU regardless of whether its current or not.
+	 * This smp_mb() orders the task->perf_event_ctxp[] store with the
+	 * task_cpu() load, such that if the IPI then does not find the task
+	 * running, a future context switch of that task must observe the
+	 * store.
 	 */
-	if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event))
+	smp_mb();
+again:
+	if (!task_function_call(task, __perf_install_in_context, event))
 		return;
 
 	raw_spin_lock_irq(&ctx->lock);
@@ -2348,12 +2370,16 @@ again:
 		raw_spin_unlock_irq(&ctx->lock);
 		return;
 	}
-	raw_spin_unlock_irq(&ctx->lock);
 	/*
-	 * Since !ctx->is_active doesn't mean anything, we must IPI
-	 * unconditionally.
+	 * If the task is not running, ctx->lock will avoid it becoming so,
+	 * thus we can safely install the event.
 	 */
-	goto again;
+	if (task_curr(task)) {
+		raw_spin_unlock_irq(&ctx->lock);
+		goto again;
+	}
+	add_event_to_ctx(event, ctx);
+	raw_spin_unlock_irq(&ctx->lock);
 }
 
 /*
@@ -7034,25 +7060,12 @@ static void perf_log_itrace_start(struct perf_event *event)
 	perf_output_end(&handle);
 }
 
-/*
- * Generic event overflow handling, sampling.
- */
-
-static int __perf_event_overflow(struct perf_event *event,
-				   int throttle, struct perf_sample_data *data,
-				   struct pt_regs *regs)
+static int
+__perf_event_account_interrupt(struct perf_event *event, int throttle)
 {
-	int events = atomic_read(&event->event_limit);
 	struct hw_perf_event *hwc = &event->hw;
-	u64 seq;
 	int ret = 0;
+	u64 seq;
 
-	/*
-	 * Non-sampling counters might still use the PMI to fold short
-	 * hardware counters, ignore those.
-	 */
-	if (unlikely(!is_sampling_event(event)))
-		return 0;
-
 	seq = __this_cpu_read(perf_throttled_seq);
 	if (seq != hwc->interrupts_seq) {
@@ -7080,6 +7093,34 @@ static int __perf_event_overflow(struct perf_event *event,
 		perf_adjust_period(event, delta, hwc->last_period, true);
 	}
 
+	return ret;
+}
+
+int perf_event_account_interrupt(struct perf_event *event)
+{
+	return __perf_event_account_interrupt(event, 1);
+}
+
+/*
+ * Generic event overflow handling, sampling.
+ */
+
+static int __perf_event_overflow(struct perf_event *event,
+				   int throttle, struct perf_sample_data *data,
+				   struct pt_regs *regs)
+{
+	int events = atomic_read(&event->event_limit);
+	int ret = 0;
+
+	/*
+	 * Non-sampling counters might still use the PMI to fold short
+	 * hardware counters, ignore those.
+	 */
+	if (unlikely(!is_sampling_event(event)))
+		return 0;
+
+	ret = __perf_event_account_interrupt(event, throttle);
+
 	/*
 	 * XXX event_limit might not quite work as expected on inherited
 	 * events
@@ -9503,6 +9544,37 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
 	return 0;
 }
 
+/*
+ * Variation on perf_event_ctx_lock_nested(), except we take two context
+ * mutexes.
+ */
+static struct perf_event_context *
+__perf_event_ctx_lock_double(struct perf_event *group_leader,
+			     struct perf_event_context *ctx)
+{
+	struct perf_event_context *gctx;
+
+again:
+	rcu_read_lock();
+	gctx = READ_ONCE(group_leader->ctx);
+	if (!atomic_inc_not_zero(&gctx->refcount)) {
+		rcu_read_unlock();
+		goto again;
+	}
+	rcu_read_unlock();
+
+	mutex_lock_double(&gctx->mutex, &ctx->mutex);
+
+	if (group_leader->ctx != gctx) {
+		mutex_unlock(&ctx->mutex);
+		mutex_unlock(&gctx->mutex);
+		put_ctx(gctx);
+		goto again;
+	}
+
+	return gctx;
+}
+
 /**
  * sys_perf_event_open - open a performance event, associate it to a task/cpu
  *
@@ -9746,12 +9818,31 @@ SYSCALL_DEFINE5(perf_event_open,
 	}
 
 	if (move_group) {
-		gctx = group_leader->ctx;
-		mutex_lock_double(&gctx->mutex, &ctx->mutex);
+		gctx = __perf_event_ctx_lock_double(group_leader, ctx);
+
 		if (gctx->task == TASK_TOMBSTONE) {
 			err = -ESRCH;
 			goto err_locked;
 		}
+
+		/*
+		 * Check if we raced against another sys_perf_event_open() call
+		 * moving the software group underneath us.
+		 */
+		if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+			/*
+			 * If someone moved the group out from under us, check
+			 * if this new event wound up on the same ctx, if so
+			 * its the regular !move_group case, otherwise fail.
+			 */
+			if (gctx != ctx) {
+				err = -EINVAL;
+				goto err_locked;
+			} else {
+				perf_event_ctx_unlock(group_leader, gctx);
+				move_group = 0;
+			}
+		}
 	} else {
 		mutex_lock(&ctx->mutex);
 	}
@@ -9853,7 +9944,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	perf_unpin_context(ctx);
 
 	if (move_group)
-		mutex_unlock(&gctx->mutex);
+		perf_event_ctx_unlock(group_leader, gctx);
 	mutex_unlock(&ctx->mutex);
 
 	if (task) {
@@ -9879,7 +9970,7 @@ SYSCALL_DEFINE5(perf_event_open,
 
 err_locked:
 	if (move_group)
-		mutex_unlock(&gctx->mutex);
+		perf_event_ctx_unlock(group_leader, gctx);
 	mutex_unlock(&ctx->mutex);
 /* err_file: */
 	fput(event_file);
diff --git a/samples/bpf/sock_example.h b/samples/bpf/sock_example.h
index 09f7fe7e5fd7..d8014065d479 100644
--- a/samples/bpf/sock_example.h
+++ b/samples/bpf/sock_example.h
@@ -4,7 +4,7 @@
 #include <unistd.h>
 #include <string.h>
 #include <errno.h>
-#include <net/ethernet.h>
+#include <linux/if_ether.h>
 #include <net/if.h>
 #include <linux/if_packet.h>
 #include <arpa/inet.h>
diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c
index f4fa6af22def..ccca1e348017 100644
--- a/samples/bpf/trace_output_user.c
+++ b/samples/bpf/trace_output_user.c
@@ -9,7 +9,6 @@
 #include <string.h>
 #include <fcntl.h>
 #include <poll.h>
-#include <sys/ioctl.h>
 #include <linux/perf_event.h>
 #include <linux/bpf.h>
 #include <errno.h>
diff --git a/tools/lib/subcmd/parse-options.c b/tools/lib/subcmd/parse-options.c
index 3284bb14ae78..8aad81151d50 100644
--- a/tools/lib/subcmd/parse-options.c
+++ b/tools/lib/subcmd/parse-options.c
@@ -213,6 +213,9 @@ static int get_value(struct parse_opt_ctx_t *p,
 		else
 			err = get_arg(p, opt, flags, (const char **)opt->value);
 
+		if (opt->set)
+			*(bool *)opt->set = true;
+
 		/* PARSE_OPT_NOEMPTY: Allow NULL but disallow empty string. */
 		if (opt->flags & PARSE_OPT_NOEMPTY) {
 			const char *val = *(const char **)opt->value;
diff --git a/tools/lib/subcmd/parse-options.h b/tools/lib/subcmd/parse-options.h
index 8866ac438b34..11c3be3bcce7 100644
--- a/tools/lib/subcmd/parse-options.h
+++ b/tools/lib/subcmd/parse-options.h
@@ -137,6 +137,11 @@ struct option {
 	{ .type = OPTION_STRING,  .short_name = (s), .long_name = (l), \
 	  .value = check_vtype(v, const char **), (a), .help = (h), \
 	  .flags = PARSE_OPT_OPTARG, .defval = (intptr_t)(d) }
+#define OPT_STRING_OPTARG_SET(s, l, v, os, a, h, d) \
+	{ .type = OPTION_STRING, .short_name = (s), .long_name = (l), \
+	  .value = check_vtype(v, const char **), (a), .help = (h), \
+	  .flags = PARSE_OPT_OPTARG, .defval = (intptr_t)(d), \
+	  .set = check_vtype(os, bool *)}
 #define OPT_STRING_NOEMPTY(s, l, v, a, h)   { .type = OPTION_STRING,  .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), (a), .help = (h), .flags = PARSE_OPT_NOEMPTY}
 #define OPT_DATE(s, l, v, h) \
 	{ .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), .argh = "time", .help = (h), .callback = parse_opt_approxidate_cb }
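Hypothetical usage sketch (option names invented for illustration, not from the patch): OPT_STRING_OPTARG_SET behaves like OPT_STRING_OPTARG, but additionally sets a bool through 'os', so a caller can distinguish "option given without its optional argument (defval used)" from "option not given at all":

#include <stdbool.h>
#include <subcmd/parse-options.h>

static const char *output_name;	/* the string value, or the default "default.out" */
static bool output_set;		/* true iff -o/--output appeared on the command line */

static struct option example_options[] = {
	OPT_STRING_OPTARG_SET('o', "output", &output_name, &output_set,
			      "file", "output file name", "default.out"),
	OPT_END()
};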
diff --git a/tools/lib/traceevent/plugin_sched_switch.c b/tools/lib/traceevent/plugin_sched_switch.c
index f1ce60065258..ec30c2fcbac0 100644
--- a/tools/lib/traceevent/plugin_sched_switch.c
+++ b/tools/lib/traceevent/plugin_sched_switch.c
@@ -111,7 +111,7 @@ static int sched_switch_handler(struct trace_seq *s,
 		trace_seq_printf(s, "%lld ", val);
 
 	if (pevent_get_field_val(s, event, "prev_prio", record, &val, 0) == 0)
-		trace_seq_printf(s, "[%lld] ", val);
+		trace_seq_printf(s, "[%d] ", (int) val);
 
 	if (pevent_get_field_val(s, event, "prev_state", record, &val, 0) == 0)
 		write_state(s, val);
@@ -129,7 +129,7 @@ static int sched_switch_handler(struct trace_seq *s,
 		trace_seq_printf(s, "%lld", val);
 
 	if (pevent_get_field_val(s, event, "next_prio", record, &val, 0) == 0)
-		trace_seq_printf(s, " [%lld]", val);
+		trace_seq_printf(s, " [%d]", (int) val);
 
 	return 0;
 }
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 27fc3617c6a4..5054d9147f0f 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -430,6 +430,10 @@ that gets then processed, possibly via a perf script, to decide if that
 particular perf.data snapshot should be kept or not.
 
 Implies --timestamp-filename, --no-buildid and --no-buildid-cache.
+The reason for the latter two is to reduce the data file switching
+overhead. You can still switch them on with:
+
+  --switch-output --no-no-buildid --no-no-buildid-cache
 
 --dry-run::
 Parse options then exit. --dry-run can be used to detect errors in cmdline
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index 8fc24824705e..8bb16aa9d661 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -704,9 +704,9 @@ install-tests: all install-gtk
 		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr'; \
 		$(INSTALL) tests/attr/* '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr'
 
-install-bin: install-tools install-tests
+install-bin: install-tools install-tests install-traceevent-plugins
 
-install: install-bin try-install-man install-traceevent-plugins
+install: install-bin try-install-man
 
 install-python_ext:
 	$(PYTHON_WORD) util/setup.py --quiet install --root='/$(DESTDIR_SQ)'
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 74d6a035133a..4ec10e9427d9 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -1405,7 +1405,7 @@ static bool dry_run;
  * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
  * using pipes, etc.
  */
-struct option __record_options[] = {
+static struct option __record_options[] = {
 	OPT_CALLBACK('e', "event", &record.evlist, "event",
 		     "event selector. use 'perf list' to list available events",
 		     parse_events_option),
@@ -1636,7 +1636,7 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
 	 * overhead. Still generate buildid if they are required
 	 * explicitly using
 	 *
-	 *  perf record --signal-trigger --no-no-buildid \
+	 *  perf record --switch-output --no-no-buildid \
 	 *              --no-no-buildid-cache
 	 *
 	 * Following code equals to:
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index d53e706a6f17..5b134b0d1ff3 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -209,6 +209,7 @@ struct perf_sched {
 	u64		skipped_samples;
 	const char	*time_str;
 	struct perf_time_interval ptime;
+	struct perf_time_interval hist_time;
 };
 
 /* per thread run time data */
@@ -2460,6 +2461,11 @@ static int timehist_sched_change_event(struct perf_tool *tool,
 		timehist_print_sample(sched, sample, &al, thread, t);
 
 out:
+	if (sched->hist_time.start == 0 && t >= ptime->start)
+		sched->hist_time.start = t;
+	if (ptime->end == 0 || t <= ptime->end)
+		sched->hist_time.end = t;
+
 	if (tr) {
 		/* time of this sched_switch event becomes last time task seen */
 		tr->last_time = sample->time;
@@ -2624,6 +2630,7 @@ static void timehist_print_summary(struct perf_sched *sched,
 	struct thread *t;
 	struct thread_runtime *r;
 	int i;
+	u64 hist_time = sched->hist_time.end - sched->hist_time.start;
 
 	memset(&totals, 0, sizeof(totals));
 
@@ -2665,7 +2672,7 @@ static void timehist_print_summary(struct perf_sched *sched,
 			totals.sched_count += r->run_stats.n;
 			printf("    CPU %2d idle for ", i);
 			print_sched_time(r->total_run_time, 6);
-			printf(" msec\n");
+			printf(" msec (%6.2f%%)\n", 100.0 * r->total_run_time / hist_time);
 		} else
 			printf("    CPU %2d idle entire time window\n", i);
 	}
@@ -2701,12 +2708,16 @@ static void timehist_print_summary(struct perf_sched *sched,
 
 	printf("\n"
 	       "    Total number of unique tasks: %" PRIu64 "\n"
-	       "Total number of context switches: %" PRIu64 "\n"
-	       "           Total run time (msec): ",
+	       "Total number of context switches: %" PRIu64 "\n",
 	       totals.task_count, totals.sched_count);
 
+	printf("           Total run time (msec): ");
 	print_sched_time(totals.total_run_time, 2);
 	printf("\n");
+
+	printf("    Total scheduling time (msec): ");
+	print_sched_time(hist_time, 2);
+	printf(" (x %d)\n", sched->max_cpu);
 }
 
 typedef int (*sched_handler)(struct perf_tool *tool,
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index d281ae2b54e8..4a57c8a60bd9 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -163,7 +163,7 @@ static struct map *kernel_get_module_map(const char *module)
 
 	/* A file path -- this is an offline module */
 	if (module && strchr(module, '/'))
-		return machine__findnew_module_map(host_machine, 0, module);
+		return dso__new_map(module);
 
 	if (!module)
 		module = "kernel";
@@ -173,6 +173,7 @@ static struct map *kernel_get_module_map(const char *module)
 		if (strncmp(pos->dso->short_name + 1, module,
 			    pos->dso->short_name_len - 2) == 0 &&
 		    module[pos->dso->short_name_len - 2] == '\0') {
+			map__get(pos);
 			return pos;
 		}
 	}
@@ -188,15 +189,6 @@ struct map *get_target_map(const char *target, bool user)
 	return kernel_get_module_map(target);
 }
 
-static void put_target_map(struct map *map, bool user)
-{
-	if (map && user) {
-		/* Only the user map needs to be released */
-		map__put(map);
-	}
-}
-
-
 static int convert_exec_to_group(const char *exec, char **result)
 {
 	char *ptr1, *ptr2, *exec_copy;
@@ -268,21 +260,6 @@ static bool kprobe_warn_out_range(const char *symbol, unsigned long address)
 }
 
 /*
- * NOTE:
- * '.gnu.linkonce.this_module' section of kernel module elf directly
- * maps to 'struct module' from linux/module.h. This section contains
- * actual module name which will be used by kernel after loading it.
- * But, we cannot use 'struct module' here since linux/module.h is not
- * exposed to user-space. Offset of 'name' has remained same from long
- * time, so hardcoding it here.
- */
-#ifdef __LP64__
-#define MOD_NAME_OFFSET 24
-#else
-#define MOD_NAME_OFFSET 12
-#endif
-
-/*
  * @module can be module name of module file path. In case of path,
  * inspect elf and find out what is actual module name.
  * Caller has to free mod_name after using it.
@@ -296,6 +273,7 @@ static char *find_module_name(const char *module)
 	Elf_Data *data;
 	Elf_Scn *sec;
 	char *mod_name = NULL;
+	int name_offset;
 
 	fd = open(module, O_RDONLY);
 	if (fd < 0)
@@ -317,7 +295,21 @@ static char *find_module_name(const char *module)
 	if (!data || !data->d_buf)
 		goto ret_err;
 
-	mod_name = strdup((char *)data->d_buf + MOD_NAME_OFFSET);
+	/*
+	 * NOTE:
+	 * '.gnu.linkonce.this_module' section of kernel module elf directly
+	 * maps to 'struct module' from linux/module.h. This section contains
+	 * actual module name which will be used by kernel after loading it.
+	 * But, we cannot use 'struct module' here since linux/module.h is not
+	 * exposed to user-space. Offset of 'name' has remained same from long
+	 * time, so hardcoding it here.
+	 */
+	if (ehdr.e_ident[EI_CLASS] == ELFCLASS32)
+		name_offset = 12;
+	else	/* expect ELFCLASS64 by default */
+		name_offset = 24;
+
+	mod_name = strdup((char *)data->d_buf + name_offset);
 
 ret_err:
 	elf_end(elf);
@@ -412,7 +404,7 @@ static int find_alternative_probe_point(struct debuginfo *dinfo,
 	}
 
 out:
-	put_target_map(map, uprobes);
+	map__put(map);
 	return ret;
 
 }
@@ -618,6 +610,51 @@ error:
 	return ret ? : -ENOENT;
 }
 
+/*
+ * Rename DWARF symbols to ELF symbols -- gcc sometimes optimizes functions
+ * and generate new symbols with suffixes such as .constprop.N or .isra.N
+ * etc. Since those symbols are not recorded in DWARF, we have to find
+ * correct generated symbols from offline ELF binary.
+ * For online kernel or uprobes we don't need this because those are
+ * rebased on _text, or already a section relative address.
+ */
+static int
+post_process_offline_probe_trace_events(struct probe_trace_event *tevs,
+					int ntevs, const char *pathname)
+{
+	struct symbol *sym;
+	struct map *map;
+	unsigned long stext = 0;
+	u64 addr;
+	int i;
+
+	/* Prepare a map for offline binary */
+	map = dso__new_map(pathname);
+	if (!map || get_text_start_address(pathname, &stext) < 0) {
+		pr_warning("Failed to get ELF symbols for %s\n", pathname);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < ntevs; i++) {
+		addr = tevs[i].point.address + tevs[i].point.offset - stext;
+		sym = map__find_symbol(map, addr);
+		if (!sym)
+			continue;
+		if (!strcmp(sym->name, tevs[i].point.symbol))
+			continue;
+		/* If we have no realname, use symbol for it */
+		if (!tevs[i].point.realname)
+			tevs[i].point.realname = tevs[i].point.symbol;
+		else
+			free(tevs[i].point.symbol);
+		tevs[i].point.symbol = strdup(sym->name);
+		tevs[i].point.offset = addr - sym->start;
+	}
+	map__put(map);
+
+	return 0;
+}
+
 static int add_exec_to_probe_trace_events(struct probe_trace_event *tevs,
 					  int ntevs, const char *exec)
 {
@@ -679,7 +716,8 @@ post_process_kernel_probe_trace_events(struct probe_trace_event *tevs,
 
 	/* Skip post process if the target is an offline kernel */
 	if (symbol_conf.ignore_vmlinux_buildid)
-		return 0;
+		return post_process_offline_probe_trace_events(tevs, ntevs,
+						symbol_conf.vmlinux_name);
 
 	reloc_sym = kernel_get_ref_reloc_sym();
 	if (!reloc_sym) {
@@ -2869,7 +2907,7 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
 	}
 
 out:
-	put_target_map(map, pev->uprobes);
+	map__put(map);
 	free(syms);
 	return ret;
 
@@ -3362,10 +3400,7 @@ int show_available_funcs(const char *target, struct strfilter *_filter,
 		return ret;
 
 	/* Get a symbol map */
-	if (user)
-		map = dso__new_map(target);
-	else
-		map = kernel_get_module_map(target);
+	map = get_target_map(target, user);
 	if (!map) {
 		pr_err("Failed to get a map for %s\n", (target) ? : "kernel");
 		return -EINVAL;
@@ -3397,9 +3432,7 @@ int show_available_funcs(const char *target, struct strfilter *_filter,
 	}
 
 end:
-	if (user) {
-		map__put(map);
-	}
+	map__put(map);
 	exit_probe_symbol_maps();
 
 	return ret;
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index 99400b0e8f2a..adbc6c02c3aa 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -537,6 +537,12 @@ int sysfs__read_build_id(const char *filename, void *build_id, size_t size)
 			break;
 		} else {
 			int n = namesz + descsz;
+
+			if (n > (int)sizeof(bf)) {
+				n = sizeof(bf);
+				pr_debug("%s: truncating reading of build id in sysfs file %s: n_namesz=%u, n_descsz=%u.\n",
+					 __func__, filename, nhdr.n_namesz, nhdr.n_descsz);
+			}
 			if (read(fd, bf, n) != n)
 				break;
 		}