-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c             |    7
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_uncore.c      |    3
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_uncore.h      |    1
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c  |   20
-rw-r--r--  include/linux/perf_event.h                         |    9
-rw-r--r--  kernel/bpf/arraymap.c                              |   21
-rw-r--r--  kernel/events/core.c                               | 1199
-rw-r--r--  kernel/events/hw_breakpoint.c                      |    2
-rw-r--r--  kernel/events/ring_buffer.c                        |   40
-rw-r--r--  kernel/trace/bpf_trace.c                           |   14
-rw-r--r--  tools/perf/Makefile.perf                           |   25
-rw-r--r--  tools/perf/arch/x86/tests/intel-cqm.c              |    2
-rw-r--r--  tools/perf/config/Makefile                         |    4
-rw-r--r--  tools/perf/tests/make                              |   55
-rw-r--r--  tools/perf/ui/browsers/annotate.c                  |    4
-rw-r--r--  tools/perf/util/hist.c                             |    2
-rw-r--r--  tools/perf/util/session.c                          |    2
-rw-r--r--  tools/perf/util/stat.c                             |    1
-rw-r--r--  tools/perf/util/symbol.c                           |    2
19 files changed, 746 insertions(+), 667 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index a667078a5180..fed2ab1f1065 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1960,7 +1960,8 @@ intel_bts_constraints(struct perf_event *event) | |||
1960 | 1960 | ||
1961 | static int intel_alt_er(int idx, u64 config) | 1961 | static int intel_alt_er(int idx, u64 config) |
1962 | { | 1962 | { |
1963 | int alt_idx; | 1963 | int alt_idx = idx; |
1964 | |||
1964 | if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1)) | 1965 | if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1)) |
1965 | return idx; | 1966 | return idx; |
1966 | 1967 | ||
@@ -2897,14 +2898,12 @@ static void intel_pmu_cpu_starting(int cpu) | |||
2897 | return; | 2898 | return; |
2898 | 2899 | ||
2899 | if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) { | 2900 | if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) { |
2900 | void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED]; | ||
2901 | |||
2902 | for_each_cpu(i, topology_sibling_cpumask(cpu)) { | 2901 | for_each_cpu(i, topology_sibling_cpumask(cpu)) { |
2903 | struct intel_shared_regs *pc; | 2902 | struct intel_shared_regs *pc; |
2904 | 2903 | ||
2905 | pc = per_cpu(cpu_hw_events, i).shared_regs; | 2904 | pc = per_cpu(cpu_hw_events, i).shared_regs; |
2906 | if (pc && pc->core_id == core_id) { | 2905 | if (pc && pc->core_id == core_id) { |
2907 | *onln = cpuc->shared_regs; | 2906 | cpuc->kfree_on_online[0] = cpuc->shared_regs; |
2908 | cpuc->shared_regs = pc; | 2907 | cpuc->shared_regs = pc; |
2909 | break; | 2908 | break; |
2910 | } | 2909 | } |
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index f97f8075bf04..3bf41d413775 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -995,6 +995,9 @@ static int __init uncore_pci_init(void) | |||
995 | case 87: /* Knights Landing */ | 995 | case 87: /* Knights Landing */ |
996 | ret = knl_uncore_pci_init(); | 996 | ret = knl_uncore_pci_init(); |
997 | break; | 997 | break; |
998 | case 94: /* SkyLake */ | ||
999 | ret = skl_uncore_pci_init(); | ||
1000 | break; | ||
998 | default: | 1001 | default: |
999 | return 0; | 1002 | return 0; |
1000 | } | 1003 | } |
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 07aa2d6bd710..a7086b862156 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -336,6 +336,7 @@ int snb_uncore_pci_init(void); | |||
336 | int ivb_uncore_pci_init(void); | 336 | int ivb_uncore_pci_init(void); |
337 | int hsw_uncore_pci_init(void); | 337 | int hsw_uncore_pci_init(void); |
338 | int bdw_uncore_pci_init(void); | 338 | int bdw_uncore_pci_init(void); |
339 | int skl_uncore_pci_init(void); | ||
339 | void snb_uncore_cpu_init(void); | 340 | void snb_uncore_cpu_init(void); |
340 | void nhm_uncore_cpu_init(void); | 341 | void nhm_uncore_cpu_init(void); |
341 | int snb_pci2phy_map_init(int devid); | 342 | int snb_pci2phy_map_init(int devid); |
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index 0b934820fafd..2bd030ddd0db 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -8,6 +8,7 @@ | |||
8 | #define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00 | 8 | #define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00 |
9 | #define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04 | 9 | #define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04 |
10 | #define PCI_DEVICE_ID_INTEL_BDW_IMC 0x1604 | 10 | #define PCI_DEVICE_ID_INTEL_BDW_IMC 0x1604 |
11 | #define PCI_DEVICE_ID_INTEL_SKL_IMC 0x191f | ||
11 | 12 | ||
12 | /* SNB event control */ | 13 | /* SNB event control */ |
13 | #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff | 14 | #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff |
@@ -524,6 +525,14 @@ static const struct pci_device_id bdw_uncore_pci_ids[] = { | |||
524 | { /* end: all zeroes */ }, | 525 | { /* end: all zeroes */ }, |
525 | }; | 526 | }; |
526 | 527 | ||
528 | static const struct pci_device_id skl_uncore_pci_ids[] = { | ||
529 | { /* IMC */ | ||
530 | PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_IMC), | ||
531 | .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), | ||
532 | }, | ||
533 | { /* end: all zeroes */ }, | ||
534 | }; | ||
535 | |||
527 | static struct pci_driver snb_uncore_pci_driver = { | 536 | static struct pci_driver snb_uncore_pci_driver = { |
528 | .name = "snb_uncore", | 537 | .name = "snb_uncore", |
529 | .id_table = snb_uncore_pci_ids, | 538 | .id_table = snb_uncore_pci_ids, |
@@ -544,6 +553,11 @@ static struct pci_driver bdw_uncore_pci_driver = { | |||
544 | .id_table = bdw_uncore_pci_ids, | 553 | .id_table = bdw_uncore_pci_ids, |
545 | }; | 554 | }; |
546 | 555 | ||
556 | static struct pci_driver skl_uncore_pci_driver = { | ||
557 | .name = "skl_uncore", | ||
558 | .id_table = skl_uncore_pci_ids, | ||
559 | }; | ||
560 | |||
547 | struct imc_uncore_pci_dev { | 561 | struct imc_uncore_pci_dev { |
548 | __u32 pci_id; | 562 | __u32 pci_id; |
549 | struct pci_driver *driver; | 563 | struct pci_driver *driver; |
@@ -558,6 +572,7 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { | |||
558 | IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */ | 572 | IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */ |
559 | IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core ULT Mobile Processor */ | 573 | IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core ULT Mobile Processor */ |
560 | IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver), /* 5th Gen Core U */ | 574 | IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver), /* 5th Gen Core U */ |
575 | IMC_DEV(SKL_IMC, &skl_uncore_pci_driver), /* 6th Gen Core */ | ||
561 | { /* end marker */ } | 576 | { /* end marker */ } |
562 | }; | 577 | }; |
563 | 578 | ||
@@ -610,6 +625,11 @@ int bdw_uncore_pci_init(void) | |||
610 | return imc_uncore_pci_init(); | 625 | return imc_uncore_pci_init(); |
611 | } | 626 | } |
612 | 627 | ||
628 | int skl_uncore_pci_init(void) | ||
629 | { | ||
630 | return imc_uncore_pci_init(); | ||
631 | } | ||
632 | |||
613 | /* end of Sandy Bridge uncore support */ | 633 | /* end of Sandy Bridge uncore support */ |
614 | 634 | ||
615 | /* Nehalem uncore support */ | 635 | /* Nehalem uncore support */ |
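The Skylake hunks above follow the established client-IMC pattern: define the IMC PCI device ID, add a pci_device_id table and a matching pci_driver, list the pair in desktop_imc_pci_ids[], and route the CPU model to a thin init wrapper around imc_uncore_pci_init(). As an illustration only (not part of this commit), a further client IMC would be wired up roughly as follows; the 0x5904 device ID and the kbl_* names are hypothetical placeholders:

/* Hypothetical sketch -- device ID and names are placeholders. */
#define PCI_DEVICE_ID_INTEL_KBL_IMC	0x5904

static const struct pci_device_id kbl_uncore_pci_ids[] = {
	{ /* IMC */
		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_IMC),
		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
	},
	{ /* end: all zeroes */ },
};

static struct pci_driver kbl_uncore_pci_driver = {
	.name		= "kbl_uncore",
	.id_table	= kbl_uncore_pci_ids,
};

/*
 * Listed alongside the SKL entry in desktop_imc_pci_ids[] (hypothetical):
 *	IMC_DEV(KBL_IMC, &kbl_uncore_pci_driver),
 */

int kbl_uncore_pci_init(void)
{
	/* Same generic client-IMC probe used by skl_uncore_pci_init(). */
	return imc_uncore_pci_init();
}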
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f9828a48f16a..b35a61a481fa 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -634,9 +634,6 @@ struct perf_event_context { | |||
634 | int nr_cgroups; /* cgroup evts */ | 634 | int nr_cgroups; /* cgroup evts */ |
635 | void *task_ctx_data; /* pmu specific data */ | 635 | void *task_ctx_data; /* pmu specific data */ |
636 | struct rcu_head rcu_head; | 636 | struct rcu_head rcu_head; |
637 | |||
638 | struct delayed_work orphans_remove; | ||
639 | bool orphans_remove_sched; | ||
640 | }; | 637 | }; |
641 | 638 | ||
642 | /* | 639 | /* |
@@ -729,7 +726,7 @@ extern int perf_event_init_task(struct task_struct *child); | |||
729 | extern void perf_event_exit_task(struct task_struct *child); | 726 | extern void perf_event_exit_task(struct task_struct *child); |
730 | extern void perf_event_free_task(struct task_struct *task); | 727 | extern void perf_event_free_task(struct task_struct *task); |
731 | extern void perf_event_delayed_put(struct task_struct *task); | 728 | extern void perf_event_delayed_put(struct task_struct *task); |
732 | extern struct perf_event *perf_event_get(unsigned int fd); | 729 | extern struct file *perf_event_get(unsigned int fd); |
733 | extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); | 730 | extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); |
734 | extern void perf_event_print_debug(void); | 731 | extern void perf_event_print_debug(void); |
735 | extern void perf_pmu_disable(struct pmu *pmu); | 732 | extern void perf_pmu_disable(struct pmu *pmu); |
@@ -1044,7 +1041,7 @@ extern void perf_swevent_put_recursion_context(int rctx); | |||
1044 | extern u64 perf_swevent_set_period(struct perf_event *event); | 1041 | extern u64 perf_swevent_set_period(struct perf_event *event); |
1045 | extern void perf_event_enable(struct perf_event *event); | 1042 | extern void perf_event_enable(struct perf_event *event); |
1046 | extern void perf_event_disable(struct perf_event *event); | 1043 | extern void perf_event_disable(struct perf_event *event); |
1047 | extern int __perf_event_disable(void *info); | 1044 | extern void perf_event_disable_local(struct perf_event *event); |
1048 | extern void perf_event_task_tick(void); | 1045 | extern void perf_event_task_tick(void); |
1049 | #else /* !CONFIG_PERF_EVENTS: */ | 1046 | #else /* !CONFIG_PERF_EVENTS: */ |
1050 | static inline void * | 1047 | static inline void * |
@@ -1070,7 +1067,7 @@ static inline int perf_event_init_task(struct task_struct *child) { return 0; } | |||
1070 | static inline void perf_event_exit_task(struct task_struct *child) { } | 1067 | static inline void perf_event_exit_task(struct task_struct *child) { } |
1071 | static inline void perf_event_free_task(struct task_struct *task) { } | 1068 | static inline void perf_event_free_task(struct task_struct *task) { } |
1072 | static inline void perf_event_delayed_put(struct task_struct *task) { } | 1069 | static inline void perf_event_delayed_put(struct task_struct *task) { } |
1073 | static inline struct perf_event *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } | 1070 | static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } |
1074 | static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event) | 1071 | static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event) |
1075 | { | 1072 | { |
1076 | return ERR_PTR(-EINVAL); | 1073 | return ERR_PTR(-EINVAL); |
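Two interface changes land in this header: perf_event_get() now returns the struct file itself, so in-kernel users hold a file reference and drop it with fput(); and the exported IPI callback __perf_event_disable() is replaced by perf_event_disable_local(), for callers that are already running in the event's context with interrupts disabled. A rough sketch of the latter, loosely modelled on how an overflow handler such as the hw_breakpoint path would use it (the handler name is illustrative, not from this commit):

/* Illustrative only: runs on the event's CPU with IRQs disabled. */
static void sample_overflow_handler(struct perf_event *bp,
				    struct perf_sample_data *data,
				    struct pt_regs *regs)
{
	/*
	 * Already in the event's own context, so disable it directly
	 * instead of bouncing through event_function_call()'s IPI.
	 */
	perf_event_disable_local(bp);
}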
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index b0799bced518..89ebbc4d1164 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -291,10 +291,13 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) | |||
291 | { | 291 | { |
292 | struct perf_event *event; | 292 | struct perf_event *event; |
293 | const struct perf_event_attr *attr; | 293 | const struct perf_event_attr *attr; |
294 | struct file *file; | ||
294 | 295 | ||
295 | event = perf_event_get(fd); | 296 | file = perf_event_get(fd); |
296 | if (IS_ERR(event)) | 297 | if (IS_ERR(file)) |
297 | return event; | 298 | return file; |
299 | |||
300 | event = file->private_data; | ||
298 | 301 | ||
299 | attr = perf_event_attrs(event); | 302 | attr = perf_event_attrs(event); |
300 | if (IS_ERR(attr)) | 303 | if (IS_ERR(attr)) |
@@ -304,24 +307,22 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) | |||
304 | goto err; | 307 | goto err; |
305 | 308 | ||
306 | if (attr->type == PERF_TYPE_RAW) | 309 | if (attr->type == PERF_TYPE_RAW) |
307 | return event; | 310 | return file; |
308 | 311 | ||
309 | if (attr->type == PERF_TYPE_HARDWARE) | 312 | if (attr->type == PERF_TYPE_HARDWARE) |
310 | return event; | 313 | return file; |
311 | 314 | ||
312 | if (attr->type == PERF_TYPE_SOFTWARE && | 315 | if (attr->type == PERF_TYPE_SOFTWARE && |
313 | attr->config == PERF_COUNT_SW_BPF_OUTPUT) | 316 | attr->config == PERF_COUNT_SW_BPF_OUTPUT) |
314 | return event; | 317 | return file; |
315 | err: | 318 | err: |
316 | perf_event_release_kernel(event); | 319 | fput(file); |
317 | return ERR_PTR(-EINVAL); | 320 | return ERR_PTR(-EINVAL); |
318 | } | 321 | } |
319 | 322 | ||
320 | static void perf_event_fd_array_put_ptr(void *ptr) | 323 | static void perf_event_fd_array_put_ptr(void *ptr) |
321 | { | 324 | { |
322 | struct perf_event *event = ptr; | 325 | fput((struct file *)ptr); |
323 | |||
324 | perf_event_release_kernel(event); | ||
325 | } | 326 | } |
326 | 327 | ||
327 | static const struct bpf_map_ops perf_event_array_ops = { | 328 | static const struct bpf_map_ops perf_event_array_ops = { |
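Because the map now stores struct file pointers rather than bare events, the read side has to reach the event through file->private_data, which the perf_event_open() syscall points at the event. A minimal sketch of that lookup, in the spirit of the bpf_trace.c change elsewhere in this series (helper name simplified, struct bpf_array field layout assumed, error handling trimmed):

/* Sketch: resolve a perf-event-array slot back to its perf_event. */
static struct perf_event *event_from_slot(struct bpf_array *array, u32 index)
{
	struct file *file;

	if (index >= array->map.max_entries)
		return NULL;

	file = READ_ONCE(array->ptrs[index]);
	if (!file)
		return NULL;

	return file->private_data;	/* the perf fd's event */
}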
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 06ae52e99ac2..5946460b2425 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -49,8 +49,6 @@ | |||
49 | 49 | ||
50 | #include <asm/irq_regs.h> | 50 | #include <asm/irq_regs.h> |
51 | 51 | ||
52 | static struct workqueue_struct *perf_wq; | ||
53 | |||
54 | typedef int (*remote_function_f)(void *); | 52 | typedef int (*remote_function_f)(void *); |
55 | 53 | ||
56 | struct remote_function_call { | 54 | struct remote_function_call { |
@@ -126,44 +124,181 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info) | |||
126 | return data.ret; | 124 | return data.ret; |
127 | } | 125 | } |
128 | 126 | ||
129 | static void event_function_call(struct perf_event *event, | 127 | static inline struct perf_cpu_context * |
130 | int (*active)(void *), | 128 | __get_cpu_context(struct perf_event_context *ctx) |
131 | void (*inactive)(void *), | 129 | { |
132 | void *data) | 130 | return this_cpu_ptr(ctx->pmu->pmu_cpu_context); |
131 | } | ||
132 | |||
133 | static void perf_ctx_lock(struct perf_cpu_context *cpuctx, | ||
134 | struct perf_event_context *ctx) | ||
135 | { | ||
136 | raw_spin_lock(&cpuctx->ctx.lock); | ||
137 | if (ctx) | ||
138 | raw_spin_lock(&ctx->lock); | ||
139 | } | ||
140 | |||
141 | static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, | ||
142 | struct perf_event_context *ctx) | ||
143 | { | ||
144 | if (ctx) | ||
145 | raw_spin_unlock(&ctx->lock); | ||
146 | raw_spin_unlock(&cpuctx->ctx.lock); | ||
147 | } | ||
148 | |||
149 | #define TASK_TOMBSTONE ((void *)-1L) | ||
150 | |||
151 | static bool is_kernel_event(struct perf_event *event) | ||
152 | { | ||
153 | return READ_ONCE(event->owner) == TASK_TOMBSTONE; | ||
154 | } | ||
155 | |||
156 | /* | ||
157 | * On task ctx scheduling... | ||
158 | * | ||
159 | * When !ctx->nr_events a task context will not be scheduled. This means | ||
160 | * we can disable the scheduler hooks (for performance) without leaving | ||
161 | * pending task ctx state. | ||
162 | * | ||
163 | * This however results in two special cases: | ||
164 | * | ||
165 | * - removing the last event from a task ctx; this is relatively straight | ||
166 | * forward and is done in __perf_remove_from_context. | ||
167 | * | ||
168 | * - adding the first event to a task ctx; this is tricky because we cannot | ||
169 | * rely on ctx->is_active and therefore cannot use event_function_call(). | ||
170 | * See perf_install_in_context(). | ||
171 | * | ||
172 | * This is because we need a ctx->lock serialized variable (ctx->is_active) | ||
173 | * to reliably determine if a particular task/context is scheduled in. The | ||
174 | * task_curr() use in task_function_call() is racy in that a remote context | ||
175 | * switch is not a single atomic operation. | ||
176 | * | ||
177 | * As is, the situation is 'safe' because we set rq->curr before we do the | ||
178 | * actual context switch. This means that task_curr() will fail early, but | ||
179 | * we'll continue spinning on ctx->is_active until we've passed | ||
180 | * perf_event_task_sched_out(). | ||
181 | * | ||
182 | * Without this ctx->lock serialized variable we could have race where we find | ||
183 | * the task (and hence the context) would not be active while in fact they are. | ||
184 | * | ||
185 | * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. | ||
186 | */ | ||
187 | |||
188 | typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *, | ||
189 | struct perf_event_context *, void *); | ||
190 | |||
191 | struct event_function_struct { | ||
192 | struct perf_event *event; | ||
193 | event_f func; | ||
194 | void *data; | ||
195 | }; | ||
196 | |||
197 | static int event_function(void *info) | ||
198 | { | ||
199 | struct event_function_struct *efs = info; | ||
200 | struct perf_event *event = efs->event; | ||
201 | struct perf_event_context *ctx = event->ctx; | ||
202 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
203 | struct perf_event_context *task_ctx = cpuctx->task_ctx; | ||
204 | int ret = 0; | ||
205 | |||
206 | WARN_ON_ONCE(!irqs_disabled()); | ||
207 | |||
208 | perf_ctx_lock(cpuctx, task_ctx); | ||
209 | /* | ||
210 | * Since we do the IPI call without holding ctx->lock things can have | ||
211 | * changed, double check we hit the task we set out to hit. | ||
212 | */ | ||
213 | if (ctx->task) { | ||
214 | if (ctx->task != current) { | ||
215 | ret = -EAGAIN; | ||
216 | goto unlock; | ||
217 | } | ||
218 | |||
219 | /* | ||
220 | * We only use event_function_call() on established contexts, | ||
221 | * and event_function() is only ever called when active (or | ||
222 | * rather, we'll have bailed in task_function_call() or the | ||
223 | * above ctx->task != current test), therefore we must have | ||
224 | * ctx->is_active here. | ||
225 | */ | ||
226 | WARN_ON_ONCE(!ctx->is_active); | ||
227 | /* | ||
228 | * And since we have ctx->is_active, cpuctx->task_ctx must | ||
229 | * match. | ||
230 | */ | ||
231 | WARN_ON_ONCE(task_ctx != ctx); | ||
232 | } else { | ||
233 | WARN_ON_ONCE(&cpuctx->ctx != ctx); | ||
234 | } | ||
235 | |||
236 | efs->func(event, cpuctx, ctx, efs->data); | ||
237 | unlock: | ||
238 | perf_ctx_unlock(cpuctx, task_ctx); | ||
239 | |||
240 | return ret; | ||
241 | } | ||
242 | |||
243 | static void event_function_local(struct perf_event *event, event_f func, void *data) | ||
244 | { | ||
245 | struct event_function_struct efs = { | ||
246 | .event = event, | ||
247 | .func = func, | ||
248 | .data = data, | ||
249 | }; | ||
250 | |||
251 | int ret = event_function(&efs); | ||
252 | WARN_ON_ONCE(ret); | ||
253 | } | ||
254 | |||
255 | static void event_function_call(struct perf_event *event, event_f func, void *data) | ||
133 | { | 256 | { |
134 | struct perf_event_context *ctx = event->ctx; | 257 | struct perf_event_context *ctx = event->ctx; |
135 | struct task_struct *task = ctx->task; | 258 | struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */ |
259 | struct event_function_struct efs = { | ||
260 | .event = event, | ||
261 | .func = func, | ||
262 | .data = data, | ||
263 | }; | ||
264 | |||
265 | if (!event->parent) { | ||
266 | /* | ||
267 | * If this is a !child event, we must hold ctx::mutex to | ||
268 | * stabilize the the event->ctx relation. See | ||
269 | * perf_event_ctx_lock(). | ||
270 | */ | ||
271 | lockdep_assert_held(&ctx->mutex); | ||
272 | } | ||
136 | 273 | ||
137 | if (!task) { | 274 | if (!task) { |
138 | cpu_function_call(event->cpu, active, data); | 275 | cpu_function_call(event->cpu, event_function, &efs); |
139 | return; | 276 | return; |
140 | } | 277 | } |
141 | 278 | ||
142 | again: | 279 | again: |
143 | if (!task_function_call(task, active, data)) | 280 | if (task == TASK_TOMBSTONE) |
281 | return; | ||
282 | |||
283 | if (!task_function_call(task, event_function, &efs)) | ||
144 | return; | 284 | return; |
145 | 285 | ||
146 | raw_spin_lock_irq(&ctx->lock); | 286 | raw_spin_lock_irq(&ctx->lock); |
147 | if (ctx->is_active) { | 287 | /* |
148 | /* | 288 | * Reload the task pointer, it might have been changed by |
149 | * Reload the task pointer, it might have been changed by | 289 | * a concurrent perf_event_context_sched_out(). |
150 | * a concurrent perf_event_context_sched_out(). | 290 | */ |
151 | */ | 291 | task = ctx->task; |
152 | task = ctx->task; | 292 | if (task != TASK_TOMBSTONE) { |
153 | raw_spin_unlock_irq(&ctx->lock); | 293 | if (ctx->is_active) { |
154 | goto again; | 294 | raw_spin_unlock_irq(&ctx->lock); |
295 | goto again; | ||
296 | } | ||
297 | func(event, NULL, ctx, data); | ||
155 | } | 298 | } |
156 | inactive(data); | ||
157 | raw_spin_unlock_irq(&ctx->lock); | 299 | raw_spin_unlock_irq(&ctx->lock); |
158 | } | 300 | } |
159 | 301 | ||
160 | #define EVENT_OWNER_KERNEL ((void *) -1) | ||
161 | |||
162 | static bool is_kernel_event(struct perf_event *event) | ||
163 | { | ||
164 | return event->owner == EVENT_OWNER_KERNEL; | ||
165 | } | ||
166 | |||
167 | #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ | 302 | #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ |
168 | PERF_FLAG_FD_OUTPUT |\ | 303 | PERF_FLAG_FD_OUTPUT |\ |
169 | PERF_FLAG_PID_CGROUP |\ | 304 | PERF_FLAG_PID_CGROUP |\ |
@@ -368,28 +503,6 @@ static inline u64 perf_event_clock(struct perf_event *event) | |||
368 | return event->clock(); | 503 | return event->clock(); |
369 | } | 504 | } |
370 | 505 | ||
371 | static inline struct perf_cpu_context * | ||
372 | __get_cpu_context(struct perf_event_context *ctx) | ||
373 | { | ||
374 | return this_cpu_ptr(ctx->pmu->pmu_cpu_context); | ||
375 | } | ||
376 | |||
377 | static void perf_ctx_lock(struct perf_cpu_context *cpuctx, | ||
378 | struct perf_event_context *ctx) | ||
379 | { | ||
380 | raw_spin_lock(&cpuctx->ctx.lock); | ||
381 | if (ctx) | ||
382 | raw_spin_lock(&ctx->lock); | ||
383 | } | ||
384 | |||
385 | static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, | ||
386 | struct perf_event_context *ctx) | ||
387 | { | ||
388 | if (ctx) | ||
389 | raw_spin_unlock(&ctx->lock); | ||
390 | raw_spin_unlock(&cpuctx->ctx.lock); | ||
391 | } | ||
392 | |||
393 | #ifdef CONFIG_CGROUP_PERF | 506 | #ifdef CONFIG_CGROUP_PERF |
394 | 507 | ||
395 | static inline bool | 508 | static inline bool |
@@ -579,13 +692,7 @@ static inline void perf_cgroup_sched_out(struct task_struct *task, | |||
579 | * we are holding the rcu lock | 692 | * we are holding the rcu lock |
580 | */ | 693 | */ |
581 | cgrp1 = perf_cgroup_from_task(task, NULL); | 694 | cgrp1 = perf_cgroup_from_task(task, NULL); |
582 | 695 | cgrp2 = perf_cgroup_from_task(next, NULL); | |
583 | /* | ||
584 | * next is NULL when called from perf_event_enable_on_exec() | ||
585 | * that will systematically cause a cgroup_switch() | ||
586 | */ | ||
587 | if (next) | ||
588 | cgrp2 = perf_cgroup_from_task(next, NULL); | ||
589 | 696 | ||
590 | /* | 697 | /* |
591 | * only schedule out current cgroup events if we know | 698 | * only schedule out current cgroup events if we know |
@@ -611,8 +718,6 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev, | |||
611 | * we are holding the rcu lock | 718 | * we are holding the rcu lock |
612 | */ | 719 | */ |
613 | cgrp1 = perf_cgroup_from_task(task, NULL); | 720 | cgrp1 = perf_cgroup_from_task(task, NULL); |
614 | |||
615 | /* prev can never be NULL */ | ||
616 | cgrp2 = perf_cgroup_from_task(prev, NULL); | 721 | cgrp2 = perf_cgroup_from_task(prev, NULL); |
617 | 722 | ||
618 | /* | 723 | /* |
@@ -917,7 +1022,7 @@ static void put_ctx(struct perf_event_context *ctx) | |||
917 | if (atomic_dec_and_test(&ctx->refcount)) { | 1022 | if (atomic_dec_and_test(&ctx->refcount)) { |
918 | if (ctx->parent_ctx) | 1023 | if (ctx->parent_ctx) |
919 | put_ctx(ctx->parent_ctx); | 1024 | put_ctx(ctx->parent_ctx); |
920 | if (ctx->task) | 1025 | if (ctx->task && ctx->task != TASK_TOMBSTONE) |
921 | put_task_struct(ctx->task); | 1026 | put_task_struct(ctx->task); |
922 | call_rcu(&ctx->rcu_head, free_ctx); | 1027 | call_rcu(&ctx->rcu_head, free_ctx); |
923 | } | 1028 | } |
@@ -934,9 +1039,8 @@ static void put_ctx(struct perf_event_context *ctx) | |||
934 | * perf_event_context::mutex nests and those are: | 1039 | * perf_event_context::mutex nests and those are: |
935 | * | 1040 | * |
936 | * - perf_event_exit_task_context() [ child , 0 ] | 1041 | * - perf_event_exit_task_context() [ child , 0 ] |
937 | * __perf_event_exit_task() | 1042 | * perf_event_exit_event() |
938 | * sync_child_event() | 1043 | * put_event() [ parent, 1 ] |
939 | * put_event() [ parent, 1 ] | ||
940 | * | 1044 | * |
941 | * - perf_event_init_context() [ parent, 0 ] | 1045 | * - perf_event_init_context() [ parent, 0 ] |
942 | * inherit_task_group() | 1046 | * inherit_task_group() |
@@ -979,8 +1083,8 @@ static void put_ctx(struct perf_event_context *ctx) | |||
979 | * Lock order: | 1083 | * Lock order: |
980 | * task_struct::perf_event_mutex | 1084 | * task_struct::perf_event_mutex |
981 | * perf_event_context::mutex | 1085 | * perf_event_context::mutex |
982 | * perf_event_context::lock | ||
983 | * perf_event::child_mutex; | 1086 | * perf_event::child_mutex; |
1087 | * perf_event_context::lock | ||
984 | * perf_event::mmap_mutex | 1088 | * perf_event::mmap_mutex |
985 | * mmap_sem | 1089 | * mmap_sem |
986 | */ | 1090 | */ |
@@ -1078,6 +1182,7 @@ static u64 primary_event_id(struct perf_event *event) | |||
1078 | 1182 | ||
1079 | /* | 1183 | /* |
1080 | * Get the perf_event_context for a task and lock it. | 1184 | * Get the perf_event_context for a task and lock it. |
1185 | * | ||
1081 | * This has to cope with with the fact that until it is locked, | 1186 | * This has to cope with with the fact that until it is locked, |
1082 | * the context could get moved to another task. | 1187 | * the context could get moved to another task. |
1083 | */ | 1188 | */ |
@@ -1118,9 +1223,12 @@ retry: | |||
1118 | goto retry; | 1223 | goto retry; |
1119 | } | 1224 | } |
1120 | 1225 | ||
1121 | if (!atomic_inc_not_zero(&ctx->refcount)) { | 1226 | if (ctx->task == TASK_TOMBSTONE || |
1227 | !atomic_inc_not_zero(&ctx->refcount)) { | ||
1122 | raw_spin_unlock(&ctx->lock); | 1228 | raw_spin_unlock(&ctx->lock); |
1123 | ctx = NULL; | 1229 | ctx = NULL; |
1230 | } else { | ||
1231 | WARN_ON_ONCE(ctx->task != task); | ||
1124 | } | 1232 | } |
1125 | } | 1233 | } |
1126 | rcu_read_unlock(); | 1234 | rcu_read_unlock(); |
@@ -1246,6 +1354,8 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) | |||
1246 | static void | 1354 | static void |
1247 | list_add_event(struct perf_event *event, struct perf_event_context *ctx) | 1355 | list_add_event(struct perf_event *event, struct perf_event_context *ctx) |
1248 | { | 1356 | { |
1357 | lockdep_assert_held(&ctx->lock); | ||
1358 | |||
1249 | WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); | 1359 | WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); |
1250 | event->attach_state |= PERF_ATTACH_CONTEXT; | 1360 | event->attach_state |= PERF_ATTACH_CONTEXT; |
1251 | 1361 | ||
@@ -1448,11 +1558,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) | |||
1448 | 1558 | ||
1449 | if (is_cgroup_event(event)) { | 1559 | if (is_cgroup_event(event)) { |
1450 | ctx->nr_cgroups--; | 1560 | ctx->nr_cgroups--; |
1561 | /* | ||
1562 | * Because cgroup events are always per-cpu events, this will | ||
1563 | * always be called from the right CPU. | ||
1564 | */ | ||
1451 | cpuctx = __get_cpu_context(ctx); | 1565 | cpuctx = __get_cpu_context(ctx); |
1452 | /* | 1566 | /* |
1453 | * if there are no more cgroup events | 1567 | * If there are no more cgroup events then clear cgrp to avoid |
1454 | * then cler cgrp to avoid stale pointer | 1568 | * stale pointer in update_cgrp_time_from_cpuctx(). |
1455 | * in update_cgrp_time_from_cpuctx() | ||
1456 | */ | 1569 | */ |
1457 | if (!ctx->nr_cgroups) | 1570 | if (!ctx->nr_cgroups) |
1458 | cpuctx->cgrp = NULL; | 1571 | cpuctx->cgrp = NULL; |
@@ -1530,45 +1643,11 @@ out: | |||
1530 | perf_event__header_size(tmp); | 1643 | perf_event__header_size(tmp); |
1531 | } | 1644 | } |
1532 | 1645 | ||
1533 | /* | ||
1534 | * User event without the task. | ||
1535 | */ | ||
1536 | static bool is_orphaned_event(struct perf_event *event) | 1646 | static bool is_orphaned_event(struct perf_event *event) |
1537 | { | 1647 | { |
1538 | return event && !is_kernel_event(event) && !event->owner; | 1648 | return event->state == PERF_EVENT_STATE_EXIT; |
1539 | } | 1649 | } |
1540 | 1650 | ||
1541 | /* | ||
1542 | * Event has a parent but parent's task finished and it's | ||
1543 | * alive only because of children holding refference. | ||
1544 | */ | ||
1545 | static bool is_orphaned_child(struct perf_event *event) | ||
1546 | { | ||
1547 | return is_orphaned_event(event->parent); | ||
1548 | } | ||
1549 | |||
1550 | static void orphans_remove_work(struct work_struct *work); | ||
1551 | |||
1552 | static void schedule_orphans_remove(struct perf_event_context *ctx) | ||
1553 | { | ||
1554 | if (!ctx->task || ctx->orphans_remove_sched || !perf_wq) | ||
1555 | return; | ||
1556 | |||
1557 | if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) { | ||
1558 | get_ctx(ctx); | ||
1559 | ctx->orphans_remove_sched = true; | ||
1560 | } | ||
1561 | } | ||
1562 | |||
1563 | static int __init perf_workqueue_init(void) | ||
1564 | { | ||
1565 | perf_wq = create_singlethread_workqueue("perf"); | ||
1566 | WARN(!perf_wq, "failed to create perf workqueue\n"); | ||
1567 | return perf_wq ? 0 : -1; | ||
1568 | } | ||
1569 | |||
1570 | core_initcall(perf_workqueue_init); | ||
1571 | |||
1572 | static inline int pmu_filter_match(struct perf_event *event) | 1651 | static inline int pmu_filter_match(struct perf_event *event) |
1573 | { | 1652 | { |
1574 | struct pmu *pmu = event->pmu; | 1653 | struct pmu *pmu = event->pmu; |
@@ -1629,9 +1708,6 @@ event_sched_out(struct perf_event *event, | |||
1629 | if (event->attr.exclusive || !cpuctx->active_oncpu) | 1708 | if (event->attr.exclusive || !cpuctx->active_oncpu) |
1630 | cpuctx->exclusive = 0; | 1709 | cpuctx->exclusive = 0; |
1631 | 1710 | ||
1632 | if (is_orphaned_child(event)) | ||
1633 | schedule_orphans_remove(ctx); | ||
1634 | |||
1635 | perf_pmu_enable(event->pmu); | 1711 | perf_pmu_enable(event->pmu); |
1636 | } | 1712 | } |
1637 | 1713 | ||
@@ -1655,21 +1731,8 @@ group_sched_out(struct perf_event *group_event, | |||
1655 | cpuctx->exclusive = 0; | 1731 | cpuctx->exclusive = 0; |
1656 | } | 1732 | } |
1657 | 1733 | ||
1658 | struct remove_event { | 1734 | #define DETACH_GROUP 0x01UL |
1659 | struct perf_event *event; | 1735 | #define DETACH_STATE 0x02UL |
1660 | bool detach_group; | ||
1661 | }; | ||
1662 | |||
1663 | static void ___perf_remove_from_context(void *info) | ||
1664 | { | ||
1665 | struct remove_event *re = info; | ||
1666 | struct perf_event *event = re->event; | ||
1667 | struct perf_event_context *ctx = event->ctx; | ||
1668 | |||
1669 | if (re->detach_group) | ||
1670 | perf_group_detach(event); | ||
1671 | list_del_event(event, ctx); | ||
1672 | } | ||
1673 | 1736 | ||
1674 | /* | 1737 | /* |
1675 | * Cross CPU call to remove a performance event | 1738 | * Cross CPU call to remove a performance event |
@@ -1677,33 +1740,33 @@ static void ___perf_remove_from_context(void *info) | |||
1677 | * We disable the event on the hardware level first. After that we | 1740 | * We disable the event on the hardware level first. After that we |
1678 | * remove it from the context list. | 1741 | * remove it from the context list. |
1679 | */ | 1742 | */ |
1680 | static int __perf_remove_from_context(void *info) | 1743 | static void |
1744 | __perf_remove_from_context(struct perf_event *event, | ||
1745 | struct perf_cpu_context *cpuctx, | ||
1746 | struct perf_event_context *ctx, | ||
1747 | void *info) | ||
1681 | { | 1748 | { |
1682 | struct remove_event *re = info; | 1749 | unsigned long flags = (unsigned long)info; |
1683 | struct perf_event *event = re->event; | ||
1684 | struct perf_event_context *ctx = event->ctx; | ||
1685 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
1686 | 1750 | ||
1687 | raw_spin_lock(&ctx->lock); | ||
1688 | event_sched_out(event, cpuctx, ctx); | 1751 | event_sched_out(event, cpuctx, ctx); |
1689 | if (re->detach_group) | 1752 | if (flags & DETACH_GROUP) |
1690 | perf_group_detach(event); | 1753 | perf_group_detach(event); |
1691 | list_del_event(event, ctx); | 1754 | list_del_event(event, ctx); |
1692 | if (!ctx->nr_events && cpuctx->task_ctx == ctx) { | 1755 | if (flags & DETACH_STATE) |
1756 | event->state = PERF_EVENT_STATE_EXIT; | ||
1757 | |||
1758 | if (!ctx->nr_events && ctx->is_active) { | ||
1693 | ctx->is_active = 0; | 1759 | ctx->is_active = 0; |
1694 | cpuctx->task_ctx = NULL; | 1760 | if (ctx->task) { |
1761 | WARN_ON_ONCE(cpuctx->task_ctx != ctx); | ||
1762 | cpuctx->task_ctx = NULL; | ||
1763 | } | ||
1695 | } | 1764 | } |
1696 | raw_spin_unlock(&ctx->lock); | ||
1697 | |||
1698 | return 0; | ||
1699 | } | 1765 | } |
1700 | 1766 | ||
1701 | /* | 1767 | /* |
1702 | * Remove the event from a task's (or a CPU's) list of events. | 1768 | * Remove the event from a task's (or a CPU's) list of events. |
1703 | * | 1769 | * |
1704 | * CPU events are removed with a smp call. For task events we only | ||
1705 | * call when the task is on a CPU. | ||
1706 | * | ||
1707 | * If event->ctx is a cloned context, callers must make sure that | 1770 | * If event->ctx is a cloned context, callers must make sure that |
1708 | * every task struct that event->ctx->task could possibly point to | 1771 | * every task struct that event->ctx->task could possibly point to |
1709 | * remains valid. This is OK when called from perf_release since | 1772 | * remains valid. This is OK when called from perf_release since |
@@ -1711,73 +1774,32 @@ static int __perf_remove_from_context(void *info) | |||
1711 | * When called from perf_event_exit_task, it's OK because the | 1774 | * When called from perf_event_exit_task, it's OK because the |
1712 | * context has been detached from its task. | 1775 | * context has been detached from its task. |
1713 | */ | 1776 | */ |
1714 | static void perf_remove_from_context(struct perf_event *event, bool detach_group) | 1777 | static void perf_remove_from_context(struct perf_event *event, unsigned long flags) |
1715 | { | 1778 | { |
1716 | struct perf_event_context *ctx = event->ctx; | 1779 | lockdep_assert_held(&event->ctx->mutex); |
1717 | struct remove_event re = { | ||
1718 | .event = event, | ||
1719 | .detach_group = detach_group, | ||
1720 | }; | ||
1721 | 1780 | ||
1722 | lockdep_assert_held(&ctx->mutex); | 1781 | event_function_call(event, __perf_remove_from_context, (void *)flags); |
1723 | |||
1724 | event_function_call(event, __perf_remove_from_context, | ||
1725 | ___perf_remove_from_context, &re); | ||
1726 | } | 1782 | } |
1727 | 1783 | ||
1728 | /* | 1784 | /* |
1729 | * Cross CPU call to disable a performance event | 1785 | * Cross CPU call to disable a performance event |
1730 | */ | 1786 | */ |
1731 | int __perf_event_disable(void *info) | 1787 | static void __perf_event_disable(struct perf_event *event, |
1732 | { | 1788 | struct perf_cpu_context *cpuctx, |
1733 | struct perf_event *event = info; | 1789 | struct perf_event_context *ctx, |
1734 | struct perf_event_context *ctx = event->ctx; | 1790 | void *info) |
1735 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
1736 | |||
1737 | /* | ||
1738 | * If this is a per-task event, need to check whether this | ||
1739 | * event's task is the current task on this cpu. | ||
1740 | * | ||
1741 | * Can trigger due to concurrent perf_event_context_sched_out() | ||
1742 | * flipping contexts around. | ||
1743 | */ | ||
1744 | if (ctx->task && cpuctx->task_ctx != ctx) | ||
1745 | return -EINVAL; | ||
1746 | |||
1747 | raw_spin_lock(&ctx->lock); | ||
1748 | |||
1749 | /* | ||
1750 | * If the event is on, turn it off. | ||
1751 | * If it is in error state, leave it in error state. | ||
1752 | */ | ||
1753 | if (event->state >= PERF_EVENT_STATE_INACTIVE) { | ||
1754 | update_context_time(ctx); | ||
1755 | update_cgrp_time_from_event(event); | ||
1756 | update_group_times(event); | ||
1757 | if (event == event->group_leader) | ||
1758 | group_sched_out(event, cpuctx, ctx); | ||
1759 | else | ||
1760 | event_sched_out(event, cpuctx, ctx); | ||
1761 | event->state = PERF_EVENT_STATE_OFF; | ||
1762 | } | ||
1763 | |||
1764 | raw_spin_unlock(&ctx->lock); | ||
1765 | |||
1766 | return 0; | ||
1767 | } | ||
1768 | |||
1769 | void ___perf_event_disable(void *info) | ||
1770 | { | 1791 | { |
1771 | struct perf_event *event = info; | 1792 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
1793 | return; | ||
1772 | 1794 | ||
1773 | /* | 1795 | update_context_time(ctx); |
1774 | * Since we have the lock this context can't be scheduled | 1796 | update_cgrp_time_from_event(event); |
1775 | * in, so we can change the state safely. | 1797 | update_group_times(event); |
1776 | */ | 1798 | if (event == event->group_leader) |
1777 | if (event->state == PERF_EVENT_STATE_INACTIVE) { | 1799 | group_sched_out(event, cpuctx, ctx); |
1778 | update_group_times(event); | 1800 | else |
1779 | event->state = PERF_EVENT_STATE_OFF; | 1801 | event_sched_out(event, cpuctx, ctx); |
1780 | } | 1802 | event->state = PERF_EVENT_STATE_OFF; |
1781 | } | 1803 | } |
1782 | 1804 | ||
1783 | /* | 1805 | /* |
@@ -1788,7 +1810,8 @@ void ___perf_event_disable(void *info) | |||
1788 | * remains valid. This condition is satisifed when called through | 1810 | * remains valid. This condition is satisifed when called through |
1789 | * perf_event_for_each_child or perf_event_for_each because they | 1811 | * perf_event_for_each_child or perf_event_for_each because they |
1790 | * hold the top-level event's child_mutex, so any descendant that | 1812 | * hold the top-level event's child_mutex, so any descendant that |
1791 | * goes to exit will block in sync_child_event. | 1813 | * goes to exit will block in perf_event_exit_event(). |
1814 | * | ||
1792 | * When called from perf_pending_event it's OK because event->ctx | 1815 | * When called from perf_pending_event it's OK because event->ctx |
1793 | * is the current context on this CPU and preemption is disabled, | 1816 | * is the current context on this CPU and preemption is disabled, |
1794 | * hence we can't get into perf_event_task_sched_out for this context. | 1817 | * hence we can't get into perf_event_task_sched_out for this context. |
@@ -1804,8 +1827,12 @@ static void _perf_event_disable(struct perf_event *event) | |||
1804 | } | 1827 | } |
1805 | raw_spin_unlock_irq(&ctx->lock); | 1828 | raw_spin_unlock_irq(&ctx->lock); |
1806 | 1829 | ||
1807 | event_function_call(event, __perf_event_disable, | 1830 | event_function_call(event, __perf_event_disable, NULL); |
1808 | ___perf_event_disable, event); | 1831 | } |
1832 | |||
1833 | void perf_event_disable_local(struct perf_event *event) | ||
1834 | { | ||
1835 | event_function_local(event, __perf_event_disable, NULL); | ||
1809 | } | 1836 | } |
1810 | 1837 | ||
1811 | /* | 1838 | /* |
@@ -1918,9 +1945,6 @@ event_sched_in(struct perf_event *event, | |||
1918 | if (event->attr.exclusive) | 1945 | if (event->attr.exclusive) |
1919 | cpuctx->exclusive = 1; | 1946 | cpuctx->exclusive = 1; |
1920 | 1947 | ||
1921 | if (is_orphaned_child(event)) | ||
1922 | schedule_orphans_remove(ctx); | ||
1923 | |||
1924 | out: | 1948 | out: |
1925 | perf_pmu_enable(event->pmu); | 1949 | perf_pmu_enable(event->pmu); |
1926 | 1950 | ||
@@ -2039,7 +2063,8 @@ static void add_event_to_ctx(struct perf_event *event, | |||
2039 | event->tstamp_stopped = tstamp; | 2063 | event->tstamp_stopped = tstamp; |
2040 | } | 2064 | } |
2041 | 2065 | ||
2042 | static void task_ctx_sched_out(struct perf_event_context *ctx); | 2066 | static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, |
2067 | struct perf_event_context *ctx); | ||
2043 | static void | 2068 | static void |
2044 | ctx_sched_in(struct perf_event_context *ctx, | 2069 | ctx_sched_in(struct perf_event_context *ctx, |
2045 | struct perf_cpu_context *cpuctx, | 2070 | struct perf_cpu_context *cpuctx, |
@@ -2058,16 +2083,15 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx, | |||
2058 | ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); | 2083 | ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); |
2059 | } | 2084 | } |
2060 | 2085 | ||
2061 | static void ___perf_install_in_context(void *info) | 2086 | static void ctx_resched(struct perf_cpu_context *cpuctx, |
2087 | struct perf_event_context *task_ctx) | ||
2062 | { | 2088 | { |
2063 | struct perf_event *event = info; | 2089 | perf_pmu_disable(cpuctx->ctx.pmu); |
2064 | struct perf_event_context *ctx = event->ctx; | 2090 | if (task_ctx) |
2065 | 2091 | task_ctx_sched_out(cpuctx, task_ctx); | |
2066 | /* | 2092 | cpu_ctx_sched_out(cpuctx, EVENT_ALL); |
2067 | * Since the task isn't running, its safe to add the event, us holding | 2093 | perf_event_sched_in(cpuctx, task_ctx, current); |
2068 | * the ctx->lock ensures the task won't get scheduled in. | 2094 | perf_pmu_enable(cpuctx->ctx.pmu); |
2069 | */ | ||
2070 | add_event_to_ctx(event, ctx); | ||
2071 | } | 2095 | } |
2072 | 2096 | ||
2073 | /* | 2097 | /* |
@@ -2077,55 +2101,31 @@ static void ___perf_install_in_context(void *info) | |||
2077 | */ | 2101 | */ |
2078 | static int __perf_install_in_context(void *info) | 2102 | static int __perf_install_in_context(void *info) |
2079 | { | 2103 | { |
2080 | struct perf_event *event = info; | 2104 | struct perf_event_context *ctx = info; |
2081 | struct perf_event_context *ctx = event->ctx; | ||
2082 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | 2105 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); |
2083 | struct perf_event_context *task_ctx = cpuctx->task_ctx; | 2106 | struct perf_event_context *task_ctx = cpuctx->task_ctx; |
2084 | struct task_struct *task = current; | ||
2085 | |||
2086 | perf_ctx_lock(cpuctx, task_ctx); | ||
2087 | perf_pmu_disable(cpuctx->ctx.pmu); | ||
2088 | |||
2089 | /* | ||
2090 | * If there was an active task_ctx schedule it out. | ||
2091 | */ | ||
2092 | if (task_ctx) | ||
2093 | task_ctx_sched_out(task_ctx); | ||
2094 | 2107 | ||
2095 | /* | 2108 | raw_spin_lock(&cpuctx->ctx.lock); |
2096 | * If the context we're installing events in is not the | 2109 | if (ctx->task) { |
2097 | * active task_ctx, flip them. | ||
2098 | */ | ||
2099 | if (ctx->task && task_ctx != ctx) { | ||
2100 | if (task_ctx) | ||
2101 | raw_spin_unlock(&task_ctx->lock); | ||
2102 | raw_spin_lock(&ctx->lock); | 2110 | raw_spin_lock(&ctx->lock); |
2111 | /* | ||
2112 | * If we hit the 'wrong' task, we've since scheduled and | ||
2113 | * everything should be sorted, nothing to do! | ||
2114 | */ | ||
2103 | task_ctx = ctx; | 2115 | task_ctx = ctx; |
2104 | } | 2116 | if (ctx->task != current) |
2117 | goto unlock; | ||
2105 | 2118 | ||
2106 | if (task_ctx) { | 2119 | /* |
2107 | cpuctx->task_ctx = task_ctx; | 2120 | * If task_ctx is set, it had better be to us. |
2108 | task = task_ctx->task; | 2121 | */ |
2122 | WARN_ON_ONCE(cpuctx->task_ctx != ctx && cpuctx->task_ctx); | ||
2123 | } else if (task_ctx) { | ||
2124 | raw_spin_lock(&task_ctx->lock); | ||
2109 | } | 2125 | } |
2110 | 2126 | ||
2111 | cpu_ctx_sched_out(cpuctx, EVENT_ALL); | 2127 | ctx_resched(cpuctx, task_ctx); |
2112 | 2128 | unlock: | |
2113 | update_context_time(ctx); | ||
2114 | /* | ||
2115 | * update cgrp time only if current cgrp | ||
2116 | * matches event->cgrp. Must be done before | ||
2117 | * calling add_event_to_ctx() | ||
2118 | */ | ||
2119 | update_cgrp_time_from_event(event); | ||
2120 | |||
2121 | add_event_to_ctx(event, ctx); | ||
2122 | |||
2123 | /* | ||
2124 | * Schedule everything back in | ||
2125 | */ | ||
2126 | perf_event_sched_in(cpuctx, task_ctx, task); | ||
2127 | |||
2128 | perf_pmu_enable(cpuctx->ctx.pmu); | ||
2129 | perf_ctx_unlock(cpuctx, task_ctx); | 2129 | perf_ctx_unlock(cpuctx, task_ctx); |
2130 | 2130 | ||
2131 | return 0; | 2131 | return 0; |
@@ -2133,27 +2133,54 @@ static int __perf_install_in_context(void *info) | |||
2133 | 2133 | ||
2134 | /* | 2134 | /* |
2135 | * Attach a performance event to a context | 2135 | * Attach a performance event to a context |
2136 | * | ||
2137 | * First we add the event to the list with the hardware enable bit | ||
2138 | * in event->hw_config cleared. | ||
2139 | * | ||
2140 | * If the event is attached to a task which is on a CPU we use a smp | ||
2141 | * call to enable it in the task context. The task might have been | ||
2142 | * scheduled away, but we check this in the smp call again. | ||
2143 | */ | 2136 | */ |
2144 | static void | 2137 | static void |
2145 | perf_install_in_context(struct perf_event_context *ctx, | 2138 | perf_install_in_context(struct perf_event_context *ctx, |
2146 | struct perf_event *event, | 2139 | struct perf_event *event, |
2147 | int cpu) | 2140 | int cpu) |
2148 | { | 2141 | { |
2142 | struct task_struct *task = NULL; | ||
2143 | |||
2149 | lockdep_assert_held(&ctx->mutex); | 2144 | lockdep_assert_held(&ctx->mutex); |
2150 | 2145 | ||
2151 | event->ctx = ctx; | 2146 | event->ctx = ctx; |
2152 | if (event->cpu != -1) | 2147 | if (event->cpu != -1) |
2153 | event->cpu = cpu; | 2148 | event->cpu = cpu; |
2154 | 2149 | ||
2155 | event_function_call(event, __perf_install_in_context, | 2150 | /* |
2156 | ___perf_install_in_context, event); | 2151 | * Installing events is tricky because we cannot rely on ctx->is_active |
2152 | * to be set in case this is the nr_events 0 -> 1 transition. | ||
2153 | * | ||
2154 | * So what we do is we add the event to the list here, which will allow | ||
2155 | * a future context switch to DTRT and then send a racy IPI. If the IPI | ||
2156 | * fails to hit the right task, this means a context switch must have | ||
2157 | * happened and that will have taken care of business. | ||
2158 | */ | ||
2159 | raw_spin_lock_irq(&ctx->lock); | ||
2160 | task = ctx->task; | ||
2161 | /* | ||
2162 | * Worse, we cannot even rely on the ctx actually existing anymore. If | ||
2163 | * between find_get_context() and perf_install_in_context() the task | ||
2164 | * went through perf_event_exit_task() its dead and we should not be | ||
2165 | * adding new events. | ||
2166 | */ | ||
2167 | if (task == TASK_TOMBSTONE) { | ||
2168 | raw_spin_unlock_irq(&ctx->lock); | ||
2169 | return; | ||
2170 | } | ||
2171 | update_context_time(ctx); | ||
2172 | /* | ||
2173 | * Update cgrp time only if current cgrp matches event->cgrp. | ||
2174 | * Must be done before calling add_event_to_ctx(). | ||
2175 | */ | ||
2176 | update_cgrp_time_from_event(event); | ||
2177 | add_event_to_ctx(event, ctx); | ||
2178 | raw_spin_unlock_irq(&ctx->lock); | ||
2179 | |||
2180 | if (task) | ||
2181 | task_function_call(task, __perf_install_in_context, ctx); | ||
2182 | else | ||
2183 | cpu_function_call(cpu, __perf_install_in_context, ctx); | ||
2157 | } | 2184 | } |
2158 | 2185 | ||
2159 | /* | 2186 | /* |
@@ -2180,43 +2207,30 @@ static void __perf_event_mark_enabled(struct perf_event *event) | |||
2180 | /* | 2207 | /* |
2181 | * Cross CPU call to enable a performance event | 2208 | * Cross CPU call to enable a performance event |
2182 | */ | 2209 | */ |
2183 | static int __perf_event_enable(void *info) | 2210 | static void __perf_event_enable(struct perf_event *event, |
2211 | struct perf_cpu_context *cpuctx, | ||
2212 | struct perf_event_context *ctx, | ||
2213 | void *info) | ||
2184 | { | 2214 | { |
2185 | struct perf_event *event = info; | ||
2186 | struct perf_event_context *ctx = event->ctx; | ||
2187 | struct perf_event *leader = event->group_leader; | 2215 | struct perf_event *leader = event->group_leader; |
2188 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | 2216 | struct perf_event_context *task_ctx; |
2189 | int err; | ||
2190 | 2217 | ||
2191 | /* | 2218 | if (event->state >= PERF_EVENT_STATE_INACTIVE || |
2192 | * There's a time window between 'ctx->is_active' check | 2219 | event->state <= PERF_EVENT_STATE_ERROR) |
2193 | * in perf_event_enable function and this place having: | 2220 | return; |
2194 | * - IRQs on | ||
2195 | * - ctx->lock unlocked | ||
2196 | * | ||
2197 | * where the task could be killed and 'ctx' deactivated | ||
2198 | * by perf_event_exit_task. | ||
2199 | */ | ||
2200 | if (!ctx->is_active) | ||
2201 | return -EINVAL; | ||
2202 | 2221 | ||
2203 | raw_spin_lock(&ctx->lock); | ||
2204 | update_context_time(ctx); | 2222 | update_context_time(ctx); |
2205 | |||
2206 | if (event->state >= PERF_EVENT_STATE_INACTIVE) | ||
2207 | goto unlock; | ||
2208 | |||
2209 | /* | ||
2210 | * set current task's cgroup time reference point | ||
2211 | */ | ||
2212 | perf_cgroup_set_timestamp(current, ctx); | ||
2213 | |||
2214 | __perf_event_mark_enabled(event); | 2223 | __perf_event_mark_enabled(event); |
2215 | 2224 | ||
2225 | if (!ctx->is_active) | ||
2226 | return; | ||
2227 | |||
2216 | if (!event_filter_match(event)) { | 2228 | if (!event_filter_match(event)) { |
2217 | if (is_cgroup_event(event)) | 2229 | if (is_cgroup_event(event)) { |
2230 | perf_cgroup_set_timestamp(current, ctx); // XXX ? | ||
2218 | perf_cgroup_defer_enabled(event); | 2231 | perf_cgroup_defer_enabled(event); |
2219 | goto unlock; | 2232 | } |
2233 | return; | ||
2220 | } | 2234 | } |
2221 | 2235 | ||
2222 | /* | 2236 | /* |
@@ -2224,41 +2238,13 @@ static int __perf_event_enable(void *info) | |||
2224 | * then don't put it on unless the group is on. | 2238 | * then don't put it on unless the group is on. |
2225 | */ | 2239 | */ |
2226 | if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) | 2240 | if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) |
2227 | goto unlock; | 2241 | return; |
2228 | |||
2229 | if (!group_can_go_on(event, cpuctx, 1)) { | ||
2230 | err = -EEXIST; | ||
2231 | } else { | ||
2232 | if (event == leader) | ||
2233 | err = group_sched_in(event, cpuctx, ctx); | ||
2234 | else | ||
2235 | err = event_sched_in(event, cpuctx, ctx); | ||
2236 | } | ||
2237 | |||
2238 | if (err) { | ||
2239 | /* | ||
2240 | * If this event can't go on and it's part of a | ||
2241 | * group, then the whole group has to come off. | ||
2242 | */ | ||
2243 | if (leader != event) { | ||
2244 | group_sched_out(leader, cpuctx, ctx); | ||
2245 | perf_mux_hrtimer_restart(cpuctx); | ||
2246 | } | ||
2247 | if (leader->attr.pinned) { | ||
2248 | update_group_times(leader); | ||
2249 | leader->state = PERF_EVENT_STATE_ERROR; | ||
2250 | } | ||
2251 | } | ||
2252 | 2242 | ||
2253 | unlock: | 2243 | task_ctx = cpuctx->task_ctx; |
2254 | raw_spin_unlock(&ctx->lock); | 2244 | if (ctx->task) |
2245 | WARN_ON_ONCE(task_ctx != ctx); | ||
2255 | 2246 | ||
2256 | return 0; | 2247 | ctx_resched(cpuctx, task_ctx); |
2257 | } | ||
2258 | |||
2259 | void ___perf_event_enable(void *info) | ||
2260 | { | ||
2261 | __perf_event_mark_enabled((struct perf_event *)info); | ||
2262 | } | 2248 | } |
2263 | 2249 | ||
2264 | /* | 2250 | /* |
@@ -2275,7 +2261,8 @@ static void _perf_event_enable(struct perf_event *event) | |||
2275 | struct perf_event_context *ctx = event->ctx; | 2261 | struct perf_event_context *ctx = event->ctx; |
2276 | 2262 | ||
2277 | raw_spin_lock_irq(&ctx->lock); | 2263 | raw_spin_lock_irq(&ctx->lock); |
2278 | if (event->state >= PERF_EVENT_STATE_INACTIVE) { | 2264 | if (event->state >= PERF_EVENT_STATE_INACTIVE || |
2265 | event->state < PERF_EVENT_STATE_ERROR) { | ||
2279 | raw_spin_unlock_irq(&ctx->lock); | 2266 | raw_spin_unlock_irq(&ctx->lock); |
2280 | return; | 2267 | return; |
2281 | } | 2268 | } |
@@ -2291,8 +2278,7 @@ static void _perf_event_enable(struct perf_event *event) | |||
2291 | event->state = PERF_EVENT_STATE_OFF; | 2278 | event->state = PERF_EVENT_STATE_OFF; |
2292 | raw_spin_unlock_irq(&ctx->lock); | 2279 | raw_spin_unlock_irq(&ctx->lock); |
2293 | 2280 | ||
2294 | event_function_call(event, __perf_event_enable, | 2281 | event_function_call(event, __perf_event_enable, NULL); |
2295 | ___perf_event_enable, event); | ||
2296 | } | 2282 | } |
2297 | 2283 | ||
2298 | /* | 2284 | /* |
@@ -2342,12 +2328,27 @@ static void ctx_sched_out(struct perf_event_context *ctx, | |||
2342 | struct perf_cpu_context *cpuctx, | 2328 | struct perf_cpu_context *cpuctx, |
2343 | enum event_type_t event_type) | 2329 | enum event_type_t event_type) |
2344 | { | 2330 | { |
2345 | struct perf_event *event; | ||
2346 | int is_active = ctx->is_active; | 2331 | int is_active = ctx->is_active; |
2332 | struct perf_event *event; | ||
2347 | 2333 | ||
2348 | ctx->is_active &= ~event_type; | 2334 | lockdep_assert_held(&ctx->lock); |
2349 | if (likely(!ctx->nr_events)) | 2335 | |
2336 | if (likely(!ctx->nr_events)) { | ||
2337 | /* | ||
2338 | * See __perf_remove_from_context(). | ||
2339 | */ | ||
2340 | WARN_ON_ONCE(ctx->is_active); | ||
2341 | if (ctx->task) | ||
2342 | WARN_ON_ONCE(cpuctx->task_ctx); | ||
2350 | return; | 2343 | return; |
2344 | } | ||
2345 | |||
2346 | ctx->is_active &= ~event_type; | ||
2347 | if (ctx->task) { | ||
2348 | WARN_ON_ONCE(cpuctx->task_ctx != ctx); | ||
2349 | if (!ctx->is_active) | ||
2350 | cpuctx->task_ctx = NULL; | ||
2351 | } | ||
2351 | 2352 | ||
2352 | update_context_time(ctx); | 2353 | update_context_time(ctx); |
2353 | update_cgrp_time_from_cpuctx(cpuctx); | 2354 | update_cgrp_time_from_cpuctx(cpuctx); |
@@ -2518,17 +2519,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, | |||
2518 | raw_spin_lock(&ctx->lock); | 2519 | raw_spin_lock(&ctx->lock); |
2519 | raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); | 2520 | raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); |
2520 | if (context_equiv(ctx, next_ctx)) { | 2521 | if (context_equiv(ctx, next_ctx)) { |
2521 | /* | 2522 | WRITE_ONCE(ctx->task, next); |
2522 | * XXX do we need a memory barrier of sorts | 2523 | WRITE_ONCE(next_ctx->task, task); |
2523 | * wrt to rcu_dereference() of perf_event_ctxp | ||
2524 | */ | ||
2525 | task->perf_event_ctxp[ctxn] = next_ctx; | ||
2526 | next->perf_event_ctxp[ctxn] = ctx; | ||
2527 | ctx->task = next; | ||
2528 | next_ctx->task = task; | ||
2529 | 2524 | ||
2530 | swap(ctx->task_ctx_data, next_ctx->task_ctx_data); | 2525 | swap(ctx->task_ctx_data, next_ctx->task_ctx_data); |
2531 | 2526 | ||
2527 | /* | ||
2528 | * RCU_INIT_POINTER here is safe because we've not | ||
2529 | * modified the ctx and the above modification of | ||
2530 | * ctx->task and ctx->task_ctx_data are immaterial | ||
2531 | * since those values are always verified under | ||
2532 | * ctx->lock which we're now holding. | ||
2533 | */ | ||
2534 | RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx); | ||
2535 | RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx); | ||
2536 | |||
2532 | do_switch = 0; | 2537 | do_switch = 0; |
2533 | 2538 | ||
2534 | perf_event_sync_stat(ctx, next_ctx); | 2539 | perf_event_sync_stat(ctx, next_ctx); |
@@ -2541,8 +2546,7 @@ unlock: | |||
2541 | 2546 | ||
2542 | if (do_switch) { | 2547 | if (do_switch) { |
2543 | raw_spin_lock(&ctx->lock); | 2548 | raw_spin_lock(&ctx->lock); |
2544 | ctx_sched_out(ctx, cpuctx, EVENT_ALL); | 2549 | task_ctx_sched_out(cpuctx, ctx); |
2545 | cpuctx->task_ctx = NULL; | ||
2546 | raw_spin_unlock(&ctx->lock); | 2550 | raw_spin_unlock(&ctx->lock); |
2547 | } | 2551 | } |
2548 | } | 2552 | } |
@@ -2637,10 +2641,9 @@ void __perf_event_task_sched_out(struct task_struct *task, | |||
2637 | perf_cgroup_sched_out(task, next); | 2641 | perf_cgroup_sched_out(task, next); |
2638 | } | 2642 | } |
2639 | 2643 | ||
2640 | static void task_ctx_sched_out(struct perf_event_context *ctx) | 2644 | static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, |
2645 | struct perf_event_context *ctx) | ||
2641 | { | 2646 | { |
2642 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
2643 | |||
2644 | if (!cpuctx->task_ctx) | 2647 | if (!cpuctx->task_ctx) |
2645 | return; | 2648 | return; |
2646 | 2649 | ||
@@ -2648,7 +2651,6 @@ static void task_ctx_sched_out(struct perf_event_context *ctx) | |||
2648 | return; | 2651 | return; |
2649 | 2652 | ||
2650 | ctx_sched_out(ctx, cpuctx, EVENT_ALL); | 2653 | ctx_sched_out(ctx, cpuctx, EVENT_ALL); |
2651 | cpuctx->task_ctx = NULL; | ||
2652 | } | 2654 | } |
2653 | 2655 | ||
2654 | /* | 2656 | /* |
@@ -2725,13 +2727,22 @@ ctx_sched_in(struct perf_event_context *ctx, | |||
2725 | enum event_type_t event_type, | 2727 | enum event_type_t event_type, |
2726 | struct task_struct *task) | 2728 | struct task_struct *task) |
2727 | { | 2729 | { |
2728 | u64 now; | ||
2729 | int is_active = ctx->is_active; | 2730 | int is_active = ctx->is_active; |
2731 | u64 now; | ||
2732 | |||
2733 | lockdep_assert_held(&ctx->lock); | ||
2730 | 2734 | ||
2731 | ctx->is_active |= event_type; | ||
2732 | if (likely(!ctx->nr_events)) | 2735 | if (likely(!ctx->nr_events)) |
2733 | return; | 2736 | return; |
2734 | 2737 | ||
2738 | ctx->is_active |= event_type; | ||
2739 | if (ctx->task) { | ||
2740 | if (!is_active) | ||
2741 | cpuctx->task_ctx = ctx; | ||
2742 | else | ||
2743 | WARN_ON_ONCE(cpuctx->task_ctx != ctx); | ||
2744 | } | ||
2745 | |||
2735 | now = perf_clock(); | 2746 | now = perf_clock(); |
2736 | ctx->timestamp = now; | 2747 | ctx->timestamp = now; |
2737 | perf_cgroup_set_timestamp(task, ctx); | 2748 | perf_cgroup_set_timestamp(task, ctx); |
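
ctx_sched_in() now states its locking contract explicitly with lockdep_assert_held(&ctx->lock). A self-contained sketch of that idiom, using made-up demo_* names:

#include <linux/spinlock.h>
#include <linux/lockdep.h>

static DEFINE_SPINLOCK(demo_lock);
static int demo_state;

/*
 * Callers must hold demo_lock: with lockdep enabled the assertion emits a
 * warning if they don't, and without lockdep it compiles away entirely.
 */
static void demo_update(int v)
{
	lockdep_assert_held(&demo_lock);
	demo_state = v;
}

static void demo_caller(void)
{
	spin_lock(&demo_lock);
	demo_update(42);
	spin_unlock(&demo_lock);
}
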
@@ -2773,12 +2784,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, | |||
2773 | * cpu flexible, task flexible. | 2784 | * cpu flexible, task flexible. |
2774 | */ | 2785 | */ |
2775 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 2786 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
2776 | 2787 | perf_event_sched_in(cpuctx, ctx, task); | |
2777 | if (ctx->nr_events) | ||
2778 | cpuctx->task_ctx = ctx; | ||
2779 | |||
2780 | perf_event_sched_in(cpuctx, cpuctx->task_ctx, task); | ||
2781 | |||
2782 | perf_pmu_enable(ctx->pmu); | 2788 | perf_pmu_enable(ctx->pmu); |
2783 | perf_ctx_unlock(cpuctx, ctx); | 2789 | perf_ctx_unlock(cpuctx, ctx); |
2784 | } | 2790 | } |
@@ -2800,6 +2806,16 @@ void __perf_event_task_sched_in(struct task_struct *prev, | |||
2800 | struct perf_event_context *ctx; | 2806 | struct perf_event_context *ctx; |
2801 | int ctxn; | 2807 | int ctxn; |
2802 | 2808 | ||
2809 | /* | ||
2810 | * If cgroup events exist on this CPU, then we need to check if we have | ||
2811 | * to switch in PMU state; cgroup event are system-wide mode only. | ||
2812 | * | ||
2813 | * Since cgroup events are CPU events, we must schedule these in before | ||
2814 | * we schedule in the task events. | ||
2815 | */ | ||
2816 | if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) | ||
2817 | perf_cgroup_sched_in(prev, task); | ||
2818 | |||
2803 | for_each_task_context_nr(ctxn) { | 2819 | for_each_task_context_nr(ctxn) { |
2804 | ctx = task->perf_event_ctxp[ctxn]; | 2820 | ctx = task->perf_event_ctxp[ctxn]; |
2805 | if (likely(!ctx)) | 2821 | if (likely(!ctx)) |
@@ -2807,13 +2823,6 @@ void __perf_event_task_sched_in(struct task_struct *prev, | |||
2807 | 2823 | ||
2808 | perf_event_context_sched_in(ctx, task); | 2824 | perf_event_context_sched_in(ctx, task); |
2809 | } | 2825 | } |
2810 | /* | ||
2811 | * if cgroup events exist on this CPU, then we need | ||
2812 | * to check if we have to switch in PMU state. | ||
2813 | * cgroup event are system-wide mode only | ||
2814 | */ | ||
2815 | if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) | ||
2816 | perf_cgroup_sched_in(prev, task); | ||
2817 | 2826 | ||
2818 | if (atomic_read(&nr_switch_events)) | 2827 | if (atomic_read(&nr_switch_events)) |
2819 | perf_event_switch(task, prev, true); | 2828 | perf_event_switch(task, prev, true); |
@@ -3099,46 +3108,30 @@ static int event_enable_on_exec(struct perf_event *event, | |||
3099 | static void perf_event_enable_on_exec(int ctxn) | 3108 | static void perf_event_enable_on_exec(int ctxn) |
3100 | { | 3109 | { |
3101 | struct perf_event_context *ctx, *clone_ctx = NULL; | 3110 | struct perf_event_context *ctx, *clone_ctx = NULL; |
3111 | struct perf_cpu_context *cpuctx; | ||
3102 | struct perf_event *event; | 3112 | struct perf_event *event; |
3103 | unsigned long flags; | 3113 | unsigned long flags; |
3104 | int enabled = 0; | 3114 | int enabled = 0; |
3105 | int ret; | ||
3106 | 3115 | ||
3107 | local_irq_save(flags); | 3116 | local_irq_save(flags); |
3108 | ctx = current->perf_event_ctxp[ctxn]; | 3117 | ctx = current->perf_event_ctxp[ctxn]; |
3109 | if (!ctx || !ctx->nr_events) | 3118 | if (!ctx || !ctx->nr_events) |
3110 | goto out; | 3119 | goto out; |
3111 | 3120 | ||
3112 | /* | 3121 | cpuctx = __get_cpu_context(ctx); |
3113 | * We must ctxsw out cgroup events to avoid conflict | 3122 | perf_ctx_lock(cpuctx, ctx); |
3114 | * when invoking perf_task_event_sched_in() later on | 3123 | list_for_each_entry(event, &ctx->event_list, event_entry) |
3115 | * in this function. Otherwise we end up trying to | 3124 | enabled |= event_enable_on_exec(event, ctx); |
3116 | * ctxswin cgroup events which are already scheduled | ||
3117 | * in. | ||
3118 | */ | ||
3119 | perf_cgroup_sched_out(current, NULL); | ||
3120 | |||
3121 | raw_spin_lock(&ctx->lock); | ||
3122 | task_ctx_sched_out(ctx); | ||
3123 | |||
3124 | list_for_each_entry(event, &ctx->event_list, event_entry) { | ||
3125 | ret = event_enable_on_exec(event, ctx); | ||
3126 | if (ret) | ||
3127 | enabled = 1; | ||
3128 | } | ||
3129 | 3125 | ||
3130 | /* | 3126 | /* |
3131 | * Unclone this context if we enabled any event. | 3127 | * Unclone and reschedule this context if we enabled any event. |
3132 | */ | 3128 | */ |
3133 | if (enabled) | 3129 | if (enabled) { |
3134 | clone_ctx = unclone_ctx(ctx); | 3130 | clone_ctx = unclone_ctx(ctx); |
3131 | ctx_resched(cpuctx, ctx); | ||
3132 | } | ||
3133 | perf_ctx_unlock(cpuctx, ctx); | ||
3135 | 3134 | ||
3136 | raw_spin_unlock(&ctx->lock); | ||
3137 | |||
3138 | /* | ||
3139 | * Also calls ctxswin for cgroup events, if any: | ||
3140 | */ | ||
3141 | perf_event_context_sched_in(ctx, ctx->task); | ||
3142 | out: | 3135 | out: |
3143 | local_irq_restore(flags); | 3136 | local_irq_restore(flags); |
3144 | 3137 | ||
@@ -3334,7 +3327,6 @@ static void __perf_event_init_context(struct perf_event_context *ctx) | |||
3334 | INIT_LIST_HEAD(&ctx->flexible_groups); | 3327 | INIT_LIST_HEAD(&ctx->flexible_groups); |
3335 | INIT_LIST_HEAD(&ctx->event_list); | 3328 | INIT_LIST_HEAD(&ctx->event_list); |
3336 | atomic_set(&ctx->refcount, 1); | 3329 | atomic_set(&ctx->refcount, 1); |
3337 | INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work); | ||
3338 | } | 3330 | } |
3339 | 3331 | ||
3340 | static struct perf_event_context * | 3332 | static struct perf_event_context * |
@@ -3521,11 +3513,13 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu) | |||
3521 | 3513 | ||
3522 | static void unaccount_event(struct perf_event *event) | 3514 | static void unaccount_event(struct perf_event *event) |
3523 | { | 3515 | { |
3516 | bool dec = false; | ||
3517 | |||
3524 | if (event->parent) | 3518 | if (event->parent) |
3525 | return; | 3519 | return; |
3526 | 3520 | ||
3527 | if (event->attach_state & PERF_ATTACH_TASK) | 3521 | if (event->attach_state & PERF_ATTACH_TASK) |
3528 | static_key_slow_dec_deferred(&perf_sched_events); | 3522 | dec = true; |
3529 | if (event->attr.mmap || event->attr.mmap_data) | 3523 | if (event->attr.mmap || event->attr.mmap_data) |
3530 | atomic_dec(&nr_mmap_events); | 3524 | atomic_dec(&nr_mmap_events); |
3531 | if (event->attr.comm) | 3525 | if (event->attr.comm) |
@@ -3535,12 +3529,15 @@ static void unaccount_event(struct perf_event *event) | |||
3535 | if (event->attr.freq) | 3529 | if (event->attr.freq) |
3536 | atomic_dec(&nr_freq_events); | 3530 | atomic_dec(&nr_freq_events); |
3537 | if (event->attr.context_switch) { | 3531 | if (event->attr.context_switch) { |
3538 | static_key_slow_dec_deferred(&perf_sched_events); | 3532 | dec = true; |
3539 | atomic_dec(&nr_switch_events); | 3533 | atomic_dec(&nr_switch_events); |
3540 | } | 3534 | } |
3541 | if (is_cgroup_event(event)) | 3535 | if (is_cgroup_event(event)) |
3542 | static_key_slow_dec_deferred(&perf_sched_events); | 3536 | dec = true; |
3543 | if (has_branch_stack(event)) | 3537 | if (has_branch_stack(event)) |
3538 | dec = true; | ||
3539 | |||
3540 | if (dec) | ||
3544 | static_key_slow_dec_deferred(&perf_sched_events); | 3541 | static_key_slow_dec_deferred(&perf_sched_events); |
3545 | 3542 | ||
3546 | unaccount_event_cpu(event, event->cpu); | 3543 | unaccount_event_cpu(event, event->cpu); |
@@ -3556,7 +3553,7 @@ static void unaccount_event(struct perf_event *event) | |||
3556 | * 3) two matching events on the same context. | 3553 | * 3) two matching events on the same context. |
3557 | * | 3554 | * |
3558 | * The former two cases are handled in the allocation path (perf_event_alloc(), | 3555 | * The former two cases are handled in the allocation path (perf_event_alloc(), |
3559 | * __free_event()), the latter -- before the first perf_install_in_context(). | 3556 | * _free_event()), the latter -- before the first perf_install_in_context(). |
3560 | */ | 3557 | */ |
3561 | static int exclusive_event_init(struct perf_event *event) | 3558 | static int exclusive_event_init(struct perf_event *event) |
3562 | { | 3559 | { |
@@ -3631,29 +3628,6 @@ static bool exclusive_event_installable(struct perf_event *event, | |||
3631 | return true; | 3628 | return true; |
3632 | } | 3629 | } |
3633 | 3630 | ||
3634 | static void __free_event(struct perf_event *event) | ||
3635 | { | ||
3636 | if (!event->parent) { | ||
3637 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) | ||
3638 | put_callchain_buffers(); | ||
3639 | } | ||
3640 | |||
3641 | perf_event_free_bpf_prog(event); | ||
3642 | |||
3643 | if (event->destroy) | ||
3644 | event->destroy(event); | ||
3645 | |||
3646 | if (event->ctx) | ||
3647 | put_ctx(event->ctx); | ||
3648 | |||
3649 | if (event->pmu) { | ||
3650 | exclusive_event_destroy(event); | ||
3651 | module_put(event->pmu->module); | ||
3652 | } | ||
3653 | |||
3654 | call_rcu(&event->rcu_head, free_event_rcu); | ||
3655 | } | ||
3656 | |||
3657 | static void _free_event(struct perf_event *event) | 3631 | static void _free_event(struct perf_event *event) |
3658 | { | 3632 | { |
3659 | irq_work_sync(&event->pending); | 3633 | irq_work_sync(&event->pending); |
@@ -3675,7 +3649,25 @@ static void _free_event(struct perf_event *event) | |||
3675 | if (is_cgroup_event(event)) | 3649 | if (is_cgroup_event(event)) |
3676 | perf_detach_cgroup(event); | 3650 | perf_detach_cgroup(event); |
3677 | 3651 | ||
3678 | __free_event(event); | 3652 | if (!event->parent) { |
3653 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) | ||
3654 | put_callchain_buffers(); | ||
3655 | } | ||
3656 | |||
3657 | perf_event_free_bpf_prog(event); | ||
3658 | |||
3659 | if (event->destroy) | ||
3660 | event->destroy(event); | ||
3661 | |||
3662 | if (event->ctx) | ||
3663 | put_ctx(event->ctx); | ||
3664 | |||
3665 | if (event->pmu) { | ||
3666 | exclusive_event_destroy(event); | ||
3667 | module_put(event->pmu->module); | ||
3668 | } | ||
3669 | |||
3670 | call_rcu(&event->rcu_head, free_event_rcu); | ||
3679 | } | 3671 | } |
3680 | 3672 | ||
3681 | /* | 3673 | /* |
@@ -3702,14 +3694,13 @@ static void perf_remove_from_owner(struct perf_event *event) | |||
3702 | struct task_struct *owner; | 3694 | struct task_struct *owner; |
3703 | 3695 | ||
3704 | rcu_read_lock(); | 3696 | rcu_read_lock(); |
3705 | owner = ACCESS_ONCE(event->owner); | ||
3706 | /* | 3697 | /* |
3707 | * Matches the smp_wmb() in perf_event_exit_task(). If we observe | 3698 | * Matches the smp_store_release() in perf_event_exit_task(). If we |
3708 | * !owner it means the list deletion is complete and we can indeed | 3699 | * observe !owner it means the list deletion is complete and we can |
3709 | * free this event, otherwise we need to serialize on | 3700 | * indeed free this event, otherwise we need to serialize on |
3710 | * owner->perf_event_mutex. | 3701 | * owner->perf_event_mutex. |
3711 | */ | 3702 | */ |
3712 | smp_read_barrier_depends(); | 3703 | owner = lockless_dereference(event->owner); |
3713 | if (owner) { | 3704 | if (owner) { |
3714 | /* | 3705 | /* |
3715 | * Since delayed_put_task_struct() also drops the last | 3706 | * Since delayed_put_task_struct() also drops the last |
@@ -3737,8 +3728,10 @@ static void perf_remove_from_owner(struct perf_event *event) | |||
3737 | * ensured they're done, and we can proceed with freeing the | 3728 | * ensured they're done, and we can proceed with freeing the |
3738 | * event. | 3729 | * event. |
3739 | */ | 3730 | */ |
3740 | if (event->owner) | 3731 | if (event->owner) { |
3741 | list_del_init(&event->owner_entry); | 3732 | list_del_init(&event->owner_entry); |
3733 | smp_store_release(&event->owner, NULL); | ||
3734 | } | ||
3742 | mutex_unlock(&owner->perf_event_mutex); | 3735 | mutex_unlock(&owner->perf_event_mutex); |
3743 | put_task_struct(owner); | 3736 | put_task_struct(owner); |
3744 | } | 3737 | } |
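
The owner handling above replaces the smp_wmb()/smp_read_barrier_depends() pair with smp_store_release() and lockless_dereference(). A sketch of that pairing with hypothetical holder/owner types, not the perf structures:

#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/list.h>
#include <linux/sched.h>

struct holder {
	struct list_head entry;
	struct task_struct *owner;
};

/*
 * Writer: finish the list removal first, then clear the pointer with
 * release semantics.
 */
static void holder_disown(struct holder *h)
{
	list_del_init(&h->entry);
	smp_store_release(&h->owner, NULL);
}

/*
 * Reader: lockless_dereference() provides the matching read-side ordering;
 * observing NULL here means the list removal above is complete, which is
 * exactly the guarantee the comment in the hunk relies on.
 */
static struct task_struct *holder_peek_owner(struct holder *h)
{
	return lockless_dereference(h->owner);
}
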
@@ -3746,36 +3739,98 @@ static void perf_remove_from_owner(struct perf_event *event) | |||
3746 | 3739 | ||
3747 | static void put_event(struct perf_event *event) | 3740 | static void put_event(struct perf_event *event) |
3748 | { | 3741 | { |
3749 | struct perf_event_context *ctx; | ||
3750 | |||
3751 | if (!atomic_long_dec_and_test(&event->refcount)) | 3742 | if (!atomic_long_dec_and_test(&event->refcount)) |
3752 | return; | 3743 | return; |
3753 | 3744 | ||
3745 | _free_event(event); | ||
3746 | } | ||
3747 | |||
3748 | /* | ||
3749 | * Kill an event dead; while event:refcount will preserve the event | ||
3750 | * object, it will not preserve its functionality. Once the last 'user' | ||
3751 | * gives up the object, we'll destroy the thing. | ||
3752 | */ | ||
3753 | int perf_event_release_kernel(struct perf_event *event) | ||
3754 | { | ||
3755 | struct perf_event_context *ctx; | ||
3756 | struct perf_event *child, *tmp; | ||
3757 | |||
3754 | if (!is_kernel_event(event)) | 3758 | if (!is_kernel_event(event)) |
3755 | perf_remove_from_owner(event); | 3759 | perf_remove_from_owner(event); |
3756 | 3760 | ||
3761 | ctx = perf_event_ctx_lock(event); | ||
3762 | WARN_ON_ONCE(ctx->parent_ctx); | ||
3763 | perf_remove_from_context(event, DETACH_GROUP | DETACH_STATE); | ||
3764 | perf_event_ctx_unlock(event, ctx); | ||
3765 | |||
3757 | /* | 3766 | /* |
3758 | * There are two ways this annotation is useful: | 3767 | * At this point we must have event->state == PERF_EVENT_STATE_EXIT, |
3768 | * either from the above perf_remove_from_context() or through | ||
3769 | * perf_event_exit_event(). | ||
3759 | * | 3770 | * |
3760 | * 1) there is a lock recursion from perf_event_exit_task | 3771 | * Therefore, anybody acquiring event->child_mutex after the below |
3761 | * see the comment there. | 3772 | * loop _must_ also see this, most importantly inherit_event() which |
3773 | * will avoid placing more children on the list. | ||
3762 | * | 3774 | * |
3763 | * 2) there is a lock-inversion with mmap_sem through | 3775 | * Thus this guarantees that we will in fact observe and kill _ALL_ |
3764 | * perf_read_group(), which takes faults while | 3776 | * child events. |
3765 | * holding ctx->mutex, however this is called after | ||
3766 | * the last filedesc died, so there is no possibility | ||
3767 | * to trigger the AB-BA case. | ||
3768 | */ | 3777 | */ |
3769 | ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); | 3778 | WARN_ON_ONCE(event->state != PERF_EVENT_STATE_EXIT); |
3770 | WARN_ON_ONCE(ctx->parent_ctx); | ||
3771 | perf_remove_from_context(event, true); | ||
3772 | perf_event_ctx_unlock(event, ctx); | ||
3773 | 3779 | ||
3774 | _free_event(event); | 3780 | again: |
3775 | } | 3781 | mutex_lock(&event->child_mutex); |
3782 | list_for_each_entry(child, &event->child_list, child_list) { | ||
3776 | 3783 | ||
3777 | int perf_event_release_kernel(struct perf_event *event) | 3784 | /* |
3778 | { | 3785 | * Cannot change, child events are not migrated, see the |
3786 | * comment with perf_event_ctx_lock_nested(). | ||
3787 | */ | ||
3788 | ctx = lockless_dereference(child->ctx); | ||
3789 | /* | ||
3790 | * Since child_mutex nests inside ctx::mutex, we must jump | ||
3791 | * through hoops. We start by grabbing a reference on the ctx. | ||
3792 | * | ||
3793 | * Since the event cannot get freed while we hold the | ||
3794 | * child_mutex, the context must also exist and have a !0 | ||
3795 | * reference count. | ||
3796 | */ | ||
3797 | get_ctx(ctx); | ||
3798 | |||
3799 | /* | ||
3800 | * Now that we have a ctx ref, we can drop child_mutex, and | ||
3801 | * acquire ctx::mutex without fear of it going away. Then we | ||
3802 | * can re-acquire child_mutex. | ||
3803 | */ | ||
3804 | mutex_unlock(&event->child_mutex); | ||
3805 | mutex_lock(&ctx->mutex); | ||
3806 | mutex_lock(&event->child_mutex); | ||
3807 | |||
3808 | /* | ||
3809 | * Now that we hold ctx::mutex and child_mutex, revalidate our | ||
3810 | * state, if child is still the first entry, it didn't get freed | ||
3811 | * and we can continue doing so. | ||
3812 | */ | ||
3813 | tmp = list_first_entry_or_null(&event->child_list, | ||
3814 | struct perf_event, child_list); | ||
3815 | if (tmp == child) { | ||
3816 | perf_remove_from_context(child, DETACH_GROUP); | ||
3817 | list_del(&child->child_list); | ||
3818 | free_event(child); | ||
3819 | /* | ||
3820 | * This matches the refcount bump in inherit_event(); | ||
3821 | * this can't be the last reference. | ||
3822 | */ | ||
3823 | put_event(event); | ||
3824 | } | ||
3825 | |||
3826 | mutex_unlock(&event->child_mutex); | ||
3827 | mutex_unlock(&ctx->mutex); | ||
3828 | put_ctx(ctx); | ||
3829 | goto again; | ||
3830 | } | ||
3831 | mutex_unlock(&event->child_mutex); | ||
3832 | |||
3833 | /* Must be the last reference */ | ||
3779 | put_event(event); | 3834 | put_event(event); |
3780 | return 0; | 3835 | return 0; |
3781 | } | 3836 | } |
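
perf_event_release_kernel() now reaps child events itself, and because child_mutex nests inside ctx::mutex it has to pin the context, back out, re-acquire both locks in the legal order and then revalidate, exactly as its comments describe. A generic sketch of that lock-ordering dance; struct obj, struct entry and obj_put() are hypothetical stand-ins (obj_put() plays the role of put_ctx()):

#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/slab.h>

struct obj {
	struct mutex lock;
	atomic_t ref;
};

struct entry {
	struct list_head node;
	struct obj *owner;
};

static void obj_put(struct obj *o)
{
	if (atomic_dec_and_test(&o->ref))
		kfree(o);
}

/*
 * list_lock nests inside obj->lock, yet entries are discovered under
 * list_lock alone: pin the owner, back out, take the locks in the legal
 * order, then revalidate that the entry is still first before touching it.
 */
static void reap_entries(struct list_head *head, struct mutex *list_lock)
{
	struct entry *e, *tmp;
	struct obj *o;

again:
	mutex_lock(list_lock);
	e = list_first_entry_or_null(head, struct entry, node);
	if (!e) {
		mutex_unlock(list_lock);
		return;
	}
	o = e->owner;
	atomic_inc(&o->ref);			/* pin: cannot vanish under us */
	mutex_unlock(list_lock);

	mutex_lock(&o->lock);			/* legal order: obj->lock first */
	mutex_lock(list_lock);
	tmp = list_first_entry_or_null(head, struct entry, node);
	if (tmp == e) {				/* nobody freed it meanwhile */
		list_del(&e->node);
		kfree(e);
	}
	mutex_unlock(list_lock);
	mutex_unlock(&o->lock);
	obj_put(o);
	goto again;
}
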
@@ -3786,46 +3841,10 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); | |||
3786 | */ | 3841 | */ |
3787 | static int perf_release(struct inode *inode, struct file *file) | 3842 | static int perf_release(struct inode *inode, struct file *file) |
3788 | { | 3843 | { |
3789 | put_event(file->private_data); | 3844 | perf_event_release_kernel(file->private_data); |
3790 | return 0; | 3845 | return 0; |
3791 | } | 3846 | } |
3792 | 3847 | ||
3793 | /* | ||
3794 | * Remove all orphanes events from the context. | ||
3795 | */ | ||
3796 | static void orphans_remove_work(struct work_struct *work) | ||
3797 | { | ||
3798 | struct perf_event_context *ctx; | ||
3799 | struct perf_event *event, *tmp; | ||
3800 | |||
3801 | ctx = container_of(work, struct perf_event_context, | ||
3802 | orphans_remove.work); | ||
3803 | |||
3804 | mutex_lock(&ctx->mutex); | ||
3805 | list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) { | ||
3806 | struct perf_event *parent_event = event->parent; | ||
3807 | |||
3808 | if (!is_orphaned_child(event)) | ||
3809 | continue; | ||
3810 | |||
3811 | perf_remove_from_context(event, true); | ||
3812 | |||
3813 | mutex_lock(&parent_event->child_mutex); | ||
3814 | list_del_init(&event->child_list); | ||
3815 | mutex_unlock(&parent_event->child_mutex); | ||
3816 | |||
3817 | free_event(event); | ||
3818 | put_event(parent_event); | ||
3819 | } | ||
3820 | |||
3821 | raw_spin_lock_irq(&ctx->lock); | ||
3822 | ctx->orphans_remove_sched = false; | ||
3823 | raw_spin_unlock_irq(&ctx->lock); | ||
3824 | mutex_unlock(&ctx->mutex); | ||
3825 | |||
3826 | put_ctx(ctx); | ||
3827 | } | ||
3828 | |||
3829 | u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) | 3848 | u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) |
3830 | { | 3849 | { |
3831 | struct perf_event *child; | 3850 | struct perf_event *child; |
@@ -4054,7 +4073,7 @@ static void _perf_event_reset(struct perf_event *event) | |||
4054 | /* | 4073 | /* |
4055 | * Holding the top-level event's child_mutex means that any | 4074 | * Holding the top-level event's child_mutex means that any |
4056 | * descendant process that has inherited this event will block | 4075 | * descendant process that has inherited this event will block |
4057 | * in sync_child_event if it goes to exit, thus satisfying the | 4076 | * in perf_event_exit_event() if it goes to exit, thus satisfying the |
4058 | * task existence requirements of perf_event_enable/disable. | 4077 | * task existence requirements of perf_event_enable/disable. |
4059 | */ | 4078 | */ |
4060 | static void perf_event_for_each_child(struct perf_event *event, | 4079 | static void perf_event_for_each_child(struct perf_event *event, |
@@ -4086,36 +4105,14 @@ static void perf_event_for_each(struct perf_event *event, | |||
4086 | perf_event_for_each_child(sibling, func); | 4105 | perf_event_for_each_child(sibling, func); |
4087 | } | 4106 | } |
4088 | 4107 | ||
4089 | struct period_event { | 4108 | static void __perf_event_period(struct perf_event *event, |
4090 | struct perf_event *event; | 4109 | struct perf_cpu_context *cpuctx, |
4091 | u64 value; | 4110 | struct perf_event_context *ctx, |
4092 | }; | 4111 | void *info) |
4093 | |||
4094 | static void ___perf_event_period(void *info) | ||
4095 | { | ||
4096 | struct period_event *pe = info; | ||
4097 | struct perf_event *event = pe->event; | ||
4098 | u64 value = pe->value; | ||
4099 | |||
4100 | if (event->attr.freq) { | ||
4101 | event->attr.sample_freq = value; | ||
4102 | } else { | ||
4103 | event->attr.sample_period = value; | ||
4104 | event->hw.sample_period = value; | ||
4105 | } | ||
4106 | |||
4107 | local64_set(&event->hw.period_left, 0); | ||
4108 | } | ||
4109 | |||
4110 | static int __perf_event_period(void *info) | ||
4111 | { | 4112 | { |
4112 | struct period_event *pe = info; | 4113 | u64 value = *((u64 *)info); |
4113 | struct perf_event *event = pe->event; | ||
4114 | struct perf_event_context *ctx = event->ctx; | ||
4115 | u64 value = pe->value; | ||
4116 | bool active; | 4114 | bool active; |
4117 | 4115 | ||
4118 | raw_spin_lock(&ctx->lock); | ||
4119 | if (event->attr.freq) { | 4116 | if (event->attr.freq) { |
4120 | event->attr.sample_freq = value; | 4117 | event->attr.sample_freq = value; |
4121 | } else { | 4118 | } else { |
@@ -4135,14 +4132,10 @@ static int __perf_event_period(void *info) | |||
4135 | event->pmu->start(event, PERF_EF_RELOAD); | 4132 | event->pmu->start(event, PERF_EF_RELOAD); |
4136 | perf_pmu_enable(ctx->pmu); | 4133 | perf_pmu_enable(ctx->pmu); |
4137 | } | 4134 | } |
4138 | raw_spin_unlock(&ctx->lock); | ||
4139 | |||
4140 | return 0; | ||
4141 | } | 4135 | } |
4142 | 4136 | ||
4143 | static int perf_event_period(struct perf_event *event, u64 __user *arg) | 4137 | static int perf_event_period(struct perf_event *event, u64 __user *arg) |
4144 | { | 4138 | { |
4145 | struct period_event pe = { .event = event, }; | ||
4146 | u64 value; | 4139 | u64 value; |
4147 | 4140 | ||
4148 | if (!is_sampling_event(event)) | 4141 | if (!is_sampling_event(event)) |
@@ -4157,10 +4150,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) | |||
4157 | if (event->attr.freq && value > sysctl_perf_event_sample_rate) | 4150 | if (event->attr.freq && value > sysctl_perf_event_sample_rate) |
4158 | return -EINVAL; | 4151 | return -EINVAL; |
4159 | 4152 | ||
4160 | pe.value = value; | 4153 | event_function_call(event, __perf_event_period, &value); |
4161 | |||
4162 | event_function_call(event, __perf_event_period, | ||
4163 | ___perf_event_period, &pe); | ||
4164 | 4154 | ||
4165 | return 0; | 4155 | return 0; |
4166 | } | 4156 | } |
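
The sample-period update above is what services the PERF_EVENT_IOC_PERIOD ioctl. A minimal user-space sketch, assuming a Linux system where unprivileged perf_event_open() of a CPU-cycles counter is allowed; the counter choice and period values are arbitrary:

#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	uint64_t new_period = 200000;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.disabled = 1;
	attr.exclude_kernel = 1;

	/* monitor the calling thread on any CPU */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);

	/* retune the sampling period of the live event */
	if (ioctl(fd, PERF_EVENT_IOC_PERIOD, &new_period))
		perror("PERF_EVENT_IOC_PERIOD");

	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
	close(fd);
	return 0;
}
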
@@ -4932,7 +4922,7 @@ static void perf_pending_event(struct irq_work *entry) | |||
4932 | 4922 | ||
4933 | if (event->pending_disable) { | 4923 | if (event->pending_disable) { |
4934 | event->pending_disable = 0; | 4924 | event->pending_disable = 0; |
4935 | __perf_event_disable(event); | 4925 | perf_event_disable_local(event); |
4936 | } | 4926 | } |
4937 | 4927 | ||
4938 | if (event->pending_wakeup) { | 4928 | if (event->pending_wakeup) { |
@@ -7753,11 +7743,13 @@ static void account_event_cpu(struct perf_event *event, int cpu) | |||
7753 | 7743 | ||
7754 | static void account_event(struct perf_event *event) | 7744 | static void account_event(struct perf_event *event) |
7755 | { | 7745 | { |
7746 | bool inc = false; | ||
7747 | |||
7756 | if (event->parent) | 7748 | if (event->parent) |
7757 | return; | 7749 | return; |
7758 | 7750 | ||
7759 | if (event->attach_state & PERF_ATTACH_TASK) | 7751 | if (event->attach_state & PERF_ATTACH_TASK) |
7760 | static_key_slow_inc(&perf_sched_events.key); | 7752 | inc = true; |
7761 | if (event->attr.mmap || event->attr.mmap_data) | 7753 | if (event->attr.mmap || event->attr.mmap_data) |
7762 | atomic_inc(&nr_mmap_events); | 7754 | atomic_inc(&nr_mmap_events); |
7763 | if (event->attr.comm) | 7755 | if (event->attr.comm) |
@@ -7770,11 +7762,14 @@ static void account_event(struct perf_event *event) | |||
7770 | } | 7762 | } |
7771 | if (event->attr.context_switch) { | 7763 | if (event->attr.context_switch) { |
7772 | atomic_inc(&nr_switch_events); | 7764 | atomic_inc(&nr_switch_events); |
7773 | static_key_slow_inc(&perf_sched_events.key); | 7765 | inc = true; |
7774 | } | 7766 | } |
7775 | if (has_branch_stack(event)) | 7767 | if (has_branch_stack(event)) |
7776 | static_key_slow_inc(&perf_sched_events.key); | 7768 | inc = true; |
7777 | if (is_cgroup_event(event)) | 7769 | if (is_cgroup_event(event)) |
7770 | inc = true; | ||
7771 | |||
7772 | if (inc) | ||
7778 | static_key_slow_inc(&perf_sched_events.key); | 7773 | static_key_slow_inc(&perf_sched_events.key); |
7779 | 7774 | ||
7780 | account_event_cpu(event, event->cpu); | 7775 | account_event_cpu(event, event->cpu); |
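
account_event(), like unaccount_event() earlier, now folds the per-attribute static_key_slow_inc() calls into one conditional increment, so each event takes at most a single reference on perf_sched_events however many of its attributes need the scheduling hooks. A compressed sketch of the pattern with a made-up demo_key and flags:

#include <linux/jump_label.h>
#include <linux/types.h>

static struct static_key demo_key = STATIC_KEY_INIT_FALSE;

/*
 * Take the key reference at most once per object, however many reasons
 * apply, so the single matching decrement on teardown stays balanced.
 */
static void demo_account(bool needs_a, bool needs_b, bool needs_c)
{
	bool inc = false;

	if (needs_a)
		inc = true;
	if (needs_b || needs_c)
		inc = true;

	if (inc)
		static_key_slow_inc(&demo_key);
}

static void demo_unaccount(bool needs_a, bool needs_b, bool needs_c)
{
	if (needs_a || needs_b || needs_c)
		static_key_slow_dec(&demo_key);
}
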
@@ -8422,11 +8417,11 @@ SYSCALL_DEFINE5(perf_event_open, | |||
8422 | * See perf_event_ctx_lock() for comments on the details | 8417 | * See perf_event_ctx_lock() for comments on the details |
8423 | * of swizzling perf_event::ctx. | 8418 | * of swizzling perf_event::ctx. |
8424 | */ | 8419 | */ |
8425 | perf_remove_from_context(group_leader, false); | 8420 | perf_remove_from_context(group_leader, 0); |
8426 | 8421 | ||
8427 | list_for_each_entry(sibling, &group_leader->sibling_list, | 8422 | list_for_each_entry(sibling, &group_leader->sibling_list, |
8428 | group_entry) { | 8423 | group_entry) { |
8429 | perf_remove_from_context(sibling, false); | 8424 | perf_remove_from_context(sibling, 0); |
8430 | put_ctx(gctx); | 8425 | put_ctx(gctx); |
8431 | } | 8426 | } |
8432 | 8427 | ||
@@ -8479,6 +8474,8 @@ SYSCALL_DEFINE5(perf_event_open, | |||
8479 | perf_event__header_size(event); | 8474 | perf_event__header_size(event); |
8480 | perf_event__id_header_size(event); | 8475 | perf_event__id_header_size(event); |
8481 | 8476 | ||
8477 | event->owner = current; | ||
8478 | |||
8482 | perf_install_in_context(ctx, event, event->cpu); | 8479 | perf_install_in_context(ctx, event, event->cpu); |
8483 | perf_unpin_context(ctx); | 8480 | perf_unpin_context(ctx); |
8484 | 8481 | ||
@@ -8488,8 +8485,6 @@ SYSCALL_DEFINE5(perf_event_open, | |||
8488 | 8485 | ||
8489 | put_online_cpus(); | 8486 | put_online_cpus(); |
8490 | 8487 | ||
8491 | event->owner = current; | ||
8492 | |||
8493 | mutex_lock(¤t->perf_event_mutex); | 8488 | mutex_lock(¤t->perf_event_mutex); |
8494 | list_add_tail(&event->owner_entry, ¤t->perf_event_list); | 8489 | list_add_tail(&event->owner_entry, ¤t->perf_event_list); |
8495 | mutex_unlock(¤t->perf_event_mutex); | 8490 | mutex_unlock(¤t->perf_event_mutex); |
@@ -8556,7 +8551,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
8556 | } | 8551 | } |
8557 | 8552 | ||
8558 | /* Mark owner so we could distinguish it from user events. */ | 8553 | /* Mark owner so we could distinguish it from user events. */ |
8559 | event->owner = EVENT_OWNER_KERNEL; | 8554 | event->owner = TASK_TOMBSTONE; |
8560 | 8555 | ||
8561 | account_event(event); | 8556 | account_event(event); |
8562 | 8557 | ||
@@ -8606,7 +8601,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) | |||
8606 | mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); | 8601 | mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); |
8607 | list_for_each_entry_safe(event, tmp, &src_ctx->event_list, | 8602 | list_for_each_entry_safe(event, tmp, &src_ctx->event_list, |
8608 | event_entry) { | 8603 | event_entry) { |
8609 | perf_remove_from_context(event, false); | 8604 | perf_remove_from_context(event, 0); |
8610 | unaccount_event_cpu(event, src_cpu); | 8605 | unaccount_event_cpu(event, src_cpu); |
8611 | put_ctx(src_ctx); | 8606 | put_ctx(src_ctx); |
8612 | list_add(&event->migrate_entry, &events); | 8607 | list_add(&event->migrate_entry, &events); |
@@ -8673,33 +8668,15 @@ static void sync_child_event(struct perf_event *child_event, | |||
8673 | &parent_event->child_total_time_enabled); | 8668 | &parent_event->child_total_time_enabled); |
8674 | atomic64_add(child_event->total_time_running, | 8669 | atomic64_add(child_event->total_time_running, |
8675 | &parent_event->child_total_time_running); | 8670 | &parent_event->child_total_time_running); |
8676 | |||
8677 | /* | ||
8678 | * Remove this event from the parent's list | ||
8679 | */ | ||
8680 | WARN_ON_ONCE(parent_event->ctx->parent_ctx); | ||
8681 | mutex_lock(&parent_event->child_mutex); | ||
8682 | list_del_init(&child_event->child_list); | ||
8683 | mutex_unlock(&parent_event->child_mutex); | ||
8684 | |||
8685 | /* | ||
8686 | * Make sure user/parent get notified, that we just | ||
8687 | * lost one event. | ||
8688 | */ | ||
8689 | perf_event_wakeup(parent_event); | ||
8690 | |||
8691 | /* | ||
8692 | * Release the parent event, if this was the last | ||
8693 | * reference to it. | ||
8694 | */ | ||
8695 | put_event(parent_event); | ||
8696 | } | 8671 | } |
8697 | 8672 | ||
8698 | static void | 8673 | static void |
8699 | __perf_event_exit_task(struct perf_event *child_event, | 8674 | perf_event_exit_event(struct perf_event *child_event, |
8700 | struct perf_event_context *child_ctx, | 8675 | struct perf_event_context *child_ctx, |
8701 | struct task_struct *child) | 8676 | struct task_struct *child) |
8702 | { | 8677 | { |
8678 | struct perf_event *parent_event = child_event->parent; | ||
8679 | |||
8703 | /* | 8680 | /* |
8704 | * Do not destroy the 'original' grouping; because of the context | 8681 | * Do not destroy the 'original' grouping; because of the context |
8705 | * switch optimization the original events could've ended up in a | 8682 | * switch optimization the original events could've ended up in a |
@@ -8712,57 +8689,86 @@ __perf_event_exit_task(struct perf_event *child_event, | |||
8712 | * Do destroy all inherited groups, we don't care about those | 8689 | * Do destroy all inherited groups, we don't care about those |
8713 | * and being thorough is better. | 8690 | * and being thorough is better. |
8714 | */ | 8691 | */ |
8715 | perf_remove_from_context(child_event, !!child_event->parent); | 8692 | raw_spin_lock_irq(&child_ctx->lock); |
8693 | WARN_ON_ONCE(child_ctx->is_active); | ||
8694 | |||
8695 | if (parent_event) | ||
8696 | perf_group_detach(child_event); | ||
8697 | list_del_event(child_event, child_ctx); | ||
8698 | child_event->state = PERF_EVENT_STATE_EXIT; /* see perf_event_release_kernel() */ | ||
8699 | raw_spin_unlock_irq(&child_ctx->lock); | ||
8716 | 8700 | ||
8717 | /* | 8701 | /* |
8718 | * It can happen that the parent exits first, and has events | 8702 | * Parent events are governed by their filedesc, retain them. |
8719 | * that are still around due to the child reference. These | ||
8720 | * events need to be zapped. | ||
8721 | */ | 8703 | */ |
8722 | if (child_event->parent) { | 8704 | if (!parent_event) { |
8723 | sync_child_event(child_event, child); | ||
8724 | free_event(child_event); | ||
8725 | } else { | ||
8726 | child_event->state = PERF_EVENT_STATE_EXIT; | ||
8727 | perf_event_wakeup(child_event); | 8705 | perf_event_wakeup(child_event); |
8706 | return; | ||
8728 | } | 8707 | } |
8708 | /* | ||
8709 | * Child events can be cleaned up. | ||
8710 | */ | ||
8711 | |||
8712 | sync_child_event(child_event, child); | ||
8713 | |||
8714 | /* | ||
8715 | * Remove this event from the parent's list | ||
8716 | */ | ||
8717 | WARN_ON_ONCE(parent_event->ctx->parent_ctx); | ||
8718 | mutex_lock(&parent_event->child_mutex); | ||
8719 | list_del_init(&child_event->child_list); | ||
8720 | mutex_unlock(&parent_event->child_mutex); | ||
8721 | |||
8722 | /* | ||
8723 | * Kick perf_poll() for is_event_hup(). | ||
8724 | */ | ||
8725 | perf_event_wakeup(parent_event); | ||
8726 | free_event(child_event); | ||
8727 | put_event(parent_event); | ||
8729 | } | 8728 | } |
8730 | 8729 | ||
8731 | static void perf_event_exit_task_context(struct task_struct *child, int ctxn) | 8730 | static void perf_event_exit_task_context(struct task_struct *child, int ctxn) |
8732 | { | 8731 | { |
8733 | struct perf_event *child_event, *next; | ||
8734 | struct perf_event_context *child_ctx, *clone_ctx = NULL; | 8732 | struct perf_event_context *child_ctx, *clone_ctx = NULL; |
8735 | unsigned long flags; | 8733 | struct perf_event *child_event, *next; |
8734 | |||
8735 | WARN_ON_ONCE(child != current); | ||
8736 | 8736 | ||
8737 | if (likely(!child->perf_event_ctxp[ctxn])) | 8737 | child_ctx = perf_pin_task_context(child, ctxn); |
8738 | if (!child_ctx) | ||
8738 | return; | 8739 | return; |
8739 | 8740 | ||
8740 | local_irq_save(flags); | ||
8741 | /* | 8741 | /* |
8742 | * We can't reschedule here because interrupts are disabled, | 8742 | * In order to reduce the amount of tricky in ctx tear-down, we hold |
8743 | * and either child is current or it is a task that can't be | 8743 | * ctx::mutex over the entire thing. This serializes against almost |
8744 | * scheduled, so we are now safe from rescheduling changing | 8744 | * everything that wants to access the ctx. |
8745 | * our context. | 8745 | * |
8746 | * The exception is sys_perf_event_open() / | ||
8747 | * perf_event_create_kernel_count() which does find_get_context() | ||
8748 | * without ctx::mutex (it cannot because of the move_group double mutex | ||
8749 | * lock thing). See the comments in perf_install_in_context(). | ||
8746 | */ | 8750 | */ |
8747 | child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); | 8751 | mutex_lock(&child_ctx->mutex); |
8748 | 8752 | ||
8749 | /* | 8753 | /* |
8750 | * Take the context lock here so that if find_get_context is | 8754 | * In a single ctx::lock section, de-schedule the events and detach the |
8751 | * reading child->perf_event_ctxp, we wait until it has | 8755 | * context from the task such that we cannot ever get it scheduled back |
8752 | * incremented the context's refcount before we do put_ctx below. | 8756 | * in. |
8753 | */ | 8757 | */ |
8754 | raw_spin_lock(&child_ctx->lock); | 8758 | raw_spin_lock_irq(&child_ctx->lock); |
8755 | task_ctx_sched_out(child_ctx); | 8759 | task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx); |
8756 | child->perf_event_ctxp[ctxn] = NULL; | ||
8757 | 8760 | ||
8758 | /* | 8761 | /* |
8759 | * If this context is a clone; unclone it so it can't get | 8762 | * Now that the context is inactive, destroy the task <-> ctx relation |
8760 | * swapped to another process while we're removing all | 8763 | * and mark the context dead. |
8761 | * the events from it. | ||
8762 | */ | 8764 | */ |
8765 | RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL); | ||
8766 | put_ctx(child_ctx); /* cannot be last */ | ||
8767 | WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); | ||
8768 | put_task_struct(current); /* cannot be last */ | ||
8769 | |||
8763 | clone_ctx = unclone_ctx(child_ctx); | 8770 | clone_ctx = unclone_ctx(child_ctx); |
8764 | update_context_time(child_ctx); | 8771 | raw_spin_unlock_irq(&child_ctx->lock); |
8765 | raw_spin_unlock_irqrestore(&child_ctx->lock, flags); | ||
8766 | 8772 | ||
8767 | if (clone_ctx) | 8773 | if (clone_ctx) |
8768 | put_ctx(clone_ctx); | 8774 | put_ctx(clone_ctx); |
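
Writing TASK_TOMBSTONE into ctx->task, rather than clearing it, lets later code recognise a context whose task has gone away without ever dereferencing the value. A sketch of that sentinel idea; DEMO_TOMBSTONE and demo_ctx are illustrative, only the general shape mirrors the real TASK_TOMBSTONE usage:

#include <linux/compiler.h>
#include <linux/sched.h>
#include <linux/types.h>

#define DEMO_TOMBSTONE	((struct task_struct *)-1L)

struct demo_ctx {
	struct task_struct *task;
};

/* The sentinel is only ever compared against, never dereferenced. */
static void demo_mark_dead(struct demo_ctx *ctx)
{
	WRITE_ONCE(ctx->task, DEMO_TOMBSTONE);
}

static bool demo_ctx_is_dead(struct demo_ctx *ctx)
{
	return READ_ONCE(ctx->task) == DEMO_TOMBSTONE;
}
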
@@ -8774,20 +8780,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) | |||
8774 | */ | 8780 | */ |
8775 | perf_event_task(child, child_ctx, 0); | 8781 | perf_event_task(child, child_ctx, 0); |
8776 | 8782 | ||
8777 | /* | ||
8778 | * We can recurse on the same lock type through: | ||
8779 | * | ||
8780 | * __perf_event_exit_task() | ||
8781 | * sync_child_event() | ||
8782 | * put_event() | ||
8783 | * mutex_lock(&ctx->mutex) | ||
8784 | * | ||
8785 | * But since its the parent context it won't be the same instance. | ||
8786 | */ | ||
8787 | mutex_lock(&child_ctx->mutex); | ||
8788 | |||
8789 | list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) | 8783 | list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) |
8790 | __perf_event_exit_task(child_event, child_ctx, child); | 8784 | perf_event_exit_event(child_event, child_ctx, child); |
8791 | 8785 | ||
8792 | mutex_unlock(&child_ctx->mutex); | 8786 | mutex_unlock(&child_ctx->mutex); |
8793 | 8787 | ||
@@ -8812,8 +8806,7 @@ void perf_event_exit_task(struct task_struct *child) | |||
8812 | * the owner, closes a race against perf_release() where | 8806 | * the owner, closes a race against perf_release() where |
8813 | * we need to serialize on the owner->perf_event_mutex. | 8807 | * we need to serialize on the owner->perf_event_mutex. |
8814 | */ | 8808 | */ |
8815 | smp_wmb(); | 8809 | smp_store_release(&event->owner, NULL); |
8816 | event->owner = NULL; | ||
8817 | } | 8810 | } |
8818 | mutex_unlock(&child->perf_event_mutex); | 8811 | mutex_unlock(&child->perf_event_mutex); |
8819 | 8812 | ||
@@ -8896,21 +8889,20 @@ void perf_event_delayed_put(struct task_struct *task) | |||
8896 | WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); | 8889 | WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); |
8897 | } | 8890 | } |
8898 | 8891 | ||
8899 | struct perf_event *perf_event_get(unsigned int fd) | 8892 | struct file *perf_event_get(unsigned int fd) |
8900 | { | 8893 | { |
8901 | int err; | 8894 | struct file *file; |
8902 | struct fd f; | ||
8903 | struct perf_event *event; | ||
8904 | 8895 | ||
8905 | err = perf_fget_light(fd, &f); | 8896 | file = fget_raw(fd); |
8906 | if (err) | 8897 | if (!file) |
8907 | return ERR_PTR(err); | 8898 | return ERR_PTR(-EBADF); |
8908 | 8899 | ||
8909 | event = f.file->private_data; | 8900 | if (file->f_op != &perf_fops) { |
8910 | atomic_long_inc(&event->refcount); | 8901 | fput(file); |
8911 | fdput(f); | 8902 | return ERR_PTR(-EBADF); |
8903 | } | ||
8912 | 8904 | ||
8913 | return event; | 8905 | return file; |
8914 | } | 8906 | } |
8915 | 8907 | ||
8916 | const struct perf_event_attr *perf_event_attrs(struct perf_event *event) | 8908 | const struct perf_event_attr *perf_event_attrs(struct perf_event *event) |
@@ -8953,8 +8945,16 @@ inherit_event(struct perf_event *parent_event, | |||
8953 | if (IS_ERR(child_event)) | 8945 | if (IS_ERR(child_event)) |
8954 | return child_event; | 8946 | return child_event; |
8955 | 8947 | ||
8948 | /* | ||
8949 | * is_orphaned_event() and list_add_tail(&parent_event->child_list) | ||
8950 | * must be under the same lock in order to serialize against | ||
8951 | * perf_event_release_kernel(), such that either we must observe | ||
8952 | * is_orphaned_event() or they will observe us on the child_list. | ||
8953 | */ | ||
8954 | mutex_lock(&parent_event->child_mutex); | ||
8956 | if (is_orphaned_event(parent_event) || | 8955 | if (is_orphaned_event(parent_event) || |
8957 | !atomic_long_inc_not_zero(&parent_event->refcount)) { | 8956 | !atomic_long_inc_not_zero(&parent_event->refcount)) { |
8957 | mutex_unlock(&parent_event->child_mutex); | ||
8958 | free_event(child_event); | 8958 | free_event(child_event); |
8959 | return NULL; | 8959 | return NULL; |
8960 | } | 8960 | } |
@@ -9002,8 +9002,6 @@ inherit_event(struct perf_event *parent_event, | |||
9002 | /* | 9002 | /* |
9003 | * Link this into the parent event's child list | 9003 | * Link this into the parent event's child list |
9004 | */ | 9004 | */ |
9005 | WARN_ON_ONCE(parent_event->ctx->parent_ctx); | ||
9006 | mutex_lock(&parent_event->child_mutex); | ||
9007 | list_add_tail(&child_event->child_list, &parent_event->child_list); | 9005 | list_add_tail(&child_event->child_list, &parent_event->child_list); |
9008 | mutex_unlock(&parent_event->child_mutex); | 9006 | mutex_unlock(&parent_event->child_mutex); |
9009 | 9007 | ||
@@ -9221,13 +9219,14 @@ static void perf_event_init_cpu(int cpu) | |||
9221 | #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE | 9219 | #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE |
9222 | static void __perf_event_exit_context(void *__info) | 9220 | static void __perf_event_exit_context(void *__info) |
9223 | { | 9221 | { |
9224 | struct remove_event re = { .detach_group = true }; | ||
9225 | struct perf_event_context *ctx = __info; | 9222 | struct perf_event_context *ctx = __info; |
9223 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
9224 | struct perf_event *event; | ||
9226 | 9225 | ||
9227 | rcu_read_lock(); | 9226 | raw_spin_lock(&ctx->lock); |
9228 | list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) | 9227 | list_for_each_entry(event, &ctx->event_list, event_entry) |
9229 | __perf_remove_from_context(&re); | 9228 | __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); |
9230 | rcu_read_unlock(); | 9229 | raw_spin_unlock(&ctx->lock); |
9231 | } | 9230 | } |
9232 | 9231 | ||
9233 | static void perf_event_exit_cpu_context(int cpu) | 9232 | static void perf_event_exit_cpu_context(int cpu) |
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 92ce5f4ccc26..3f8cb1e14588 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c | |||
@@ -444,7 +444,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att | |||
444 | * current task. | 444 | * current task. |
445 | */ | 445 | */ |
446 | if (irqs_disabled() && bp->ctx && bp->ctx->task == current) | 446 | if (irqs_disabled() && bp->ctx && bp->ctx->task == current) |
447 | __perf_event_disable(bp); | 447 | perf_event_disable_local(bp); |
448 | else | 448 | else |
449 | perf_event_disable(bp); | 449 | perf_event_disable(bp); |
450 | 450 | ||
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index adfdc0536117..1faad2cfdb9e 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c | |||
@@ -459,6 +459,25 @@ static void rb_free_aux_page(struct ring_buffer *rb, int idx) | |||
459 | __free_page(page); | 459 | __free_page(page); |
460 | } | 460 | } |
461 | 461 | ||
462 | static void __rb_free_aux(struct ring_buffer *rb) | ||
463 | { | ||
464 | int pg; | ||
465 | |||
466 | if (rb->aux_priv) { | ||
467 | rb->free_aux(rb->aux_priv); | ||
468 | rb->free_aux = NULL; | ||
469 | rb->aux_priv = NULL; | ||
470 | } | ||
471 | |||
472 | if (rb->aux_nr_pages) { | ||
473 | for (pg = 0; pg < rb->aux_nr_pages; pg++) | ||
474 | rb_free_aux_page(rb, pg); | ||
475 | |||
476 | kfree(rb->aux_pages); | ||
477 | rb->aux_nr_pages = 0; | ||
478 | } | ||
479 | } | ||
480 | |||
462 | int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, | 481 | int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, |
463 | pgoff_t pgoff, int nr_pages, long watermark, int flags) | 482 | pgoff_t pgoff, int nr_pages, long watermark, int flags) |
464 | { | 483 | { |
@@ -547,30 +566,11 @@ out: | |||
547 | if (!ret) | 566 | if (!ret) |
548 | rb->aux_pgoff = pgoff; | 567 | rb->aux_pgoff = pgoff; |
549 | else | 568 | else |
550 | rb_free_aux(rb); | 569 | __rb_free_aux(rb); |
551 | 570 | ||
552 | return ret; | 571 | return ret; |
553 | } | 572 | } |
554 | 573 | ||
555 | static void __rb_free_aux(struct ring_buffer *rb) | ||
556 | { | ||
557 | int pg; | ||
558 | |||
559 | if (rb->aux_priv) { | ||
560 | rb->free_aux(rb->aux_priv); | ||
561 | rb->free_aux = NULL; | ||
562 | rb->aux_priv = NULL; | ||
563 | } | ||
564 | |||
565 | if (rb->aux_nr_pages) { | ||
566 | for (pg = 0; pg < rb->aux_nr_pages; pg++) | ||
567 | rb_free_aux_page(rb, pg); | ||
568 | |||
569 | kfree(rb->aux_pages); | ||
570 | rb->aux_nr_pages = 0; | ||
571 | } | ||
572 | } | ||
573 | |||
574 | void rb_free_aux(struct ring_buffer *rb) | 574 | void rb_free_aux(struct ring_buffer *rb) |
575 | { | 575 | { |
576 | if (atomic_dec_and_test(&rb->aux_refcount)) | 576 | if (atomic_dec_and_test(&rb->aux_refcount)) |
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 45dd798bcd37..326a75e884db 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c | |||
@@ -191,14 +191,17 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) | |||
191 | struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; | 191 | struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; |
192 | struct bpf_array *array = container_of(map, struct bpf_array, map); | 192 | struct bpf_array *array = container_of(map, struct bpf_array, map); |
193 | struct perf_event *event; | 193 | struct perf_event *event; |
194 | struct file *file; | ||
194 | 195 | ||
195 | if (unlikely(index >= array->map.max_entries)) | 196 | if (unlikely(index >= array->map.max_entries)) |
196 | return -E2BIG; | 197 | return -E2BIG; |
197 | 198 | ||
198 | event = (struct perf_event *)array->ptrs[index]; | 199 | file = (struct file *)array->ptrs[index]; |
199 | if (!event) | 200 | if (unlikely(!file)) |
200 | return -ENOENT; | 201 | return -ENOENT; |
201 | 202 | ||
203 | event = file->private_data; | ||
204 | |||
202 | /* make sure event is local and doesn't have pmu::count */ | 205 | /* make sure event is local and doesn't have pmu::count */ |
203 | if (event->oncpu != smp_processor_id() || | 206 | if (event->oncpu != smp_processor_id() || |
204 | event->pmu->count) | 207 | event->pmu->count) |
@@ -228,6 +231,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size) | |||
228 | void *data = (void *) (long) r4; | 231 | void *data = (void *) (long) r4; |
229 | struct perf_sample_data sample_data; | 232 | struct perf_sample_data sample_data; |
230 | struct perf_event *event; | 233 | struct perf_event *event; |
234 | struct file *file; | ||
231 | struct perf_raw_record raw = { | 235 | struct perf_raw_record raw = { |
232 | .size = size, | 236 | .size = size, |
233 | .data = data, | 237 | .data = data, |
@@ -236,10 +240,12 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size) | |||
236 | if (unlikely(index >= array->map.max_entries)) | 240 | if (unlikely(index >= array->map.max_entries)) |
237 | return -E2BIG; | 241 | return -E2BIG; |
238 | 242 | ||
239 | event = (struct perf_event *)array->ptrs[index]; | 243 | file = (struct file *)array->ptrs[index]; |
240 | if (unlikely(!event)) | 244 | if (unlikely(!file)) |
241 | return -ENOENT; | 245 | return -ENOENT; |
242 | 246 | ||
247 | event = file->private_data; | ||
248 | |||
243 | if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || | 249 | if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || |
244 | event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) | 250 | event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) |
245 | return -EINVAL; | 251 | return -EINVAL; |
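
With the array map now holding the perf event's struct file, the hunks above fetch the event from file->private_data at call time and the file reference keeps it alive. From user space the workflow is unchanged: a perf event fd is installed into a BPF_MAP_TYPE_PERF_EVENT_ARRAY slot. A rough sketch of that step using the raw bpf(2) syscall; it assumes kernel headers that define __NR_bpf, and that map_fd and perf_fd were created elsewhere:

#include <linux/bpf.h>
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Install a perf event fd into slot 'key' of a BPF_MAP_TYPE_PERF_EVENT_ARRAY.
 * The kernel keeps the event's struct file in the slot and, as shown above,
 * reads the event from file->private_data when a program calls the helpers.
 */
static int perf_event_array_set(int map_fd, uint32_t key, uint32_t perf_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key    = (uint64_t)(unsigned long)&key;
	attr.value  = (uint64_t)(unsigned long)&perf_fd;
	attr.flags  = BPF_ANY;

	return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}
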
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 0a22407e1d7d..5d34815c7ccb 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf | |||
@@ -77,6 +77,9 @@ include config/utilities.mak | |||
77 | # Define NO_AUXTRACE if you do not want AUX area tracing support | 77 | # Define NO_AUXTRACE if you do not want AUX area tracing support |
78 | # | 78 | # |
79 | # Define NO_LIBBPF if you do not want BPF support | 79 | # Define NO_LIBBPF if you do not want BPF support |
80 | # | ||
81 | # Define FEATURES_DUMP to provide features detection dump file | ||
82 | # and bypass the feature detection | ||
80 | 83 | ||
81 | # As per kernel Makefile, avoid funny character set dependencies | 84 | # As per kernel Makefile, avoid funny character set dependencies |
82 | unexport LC_ALL | 85 | unexport LC_ALL |
@@ -166,6 +169,15 @@ ifeq ($(config),1) | |||
166 | include config/Makefile | 169 | include config/Makefile |
167 | endif | 170 | endif |
168 | 171 | ||
172 | # The FEATURE_DUMP_EXPORT holds location of the actual | ||
173 | # FEATURE_DUMP file to be used to bypass feature detection | ||
174 | # (for bpf or any other subproject) | ||
175 | ifeq ($(FEATURES_DUMP),) | ||
176 | FEATURE_DUMP_EXPORT := $(realpath $(OUTPUT)FEATURE-DUMP) | ||
177 | else | ||
178 | FEATURE_DUMP_EXPORT := $(FEATURES_DUMP) | ||
179 | endif | ||
180 | |||
169 | export prefix bindir sharedir sysconfdir DESTDIR | 181 | export prefix bindir sharedir sysconfdir DESTDIR |
170 | 182 | ||
171 | # sparse is architecture-neutral, which means that we need to tell it | 183 | # sparse is architecture-neutral, which means that we need to tell it |
@@ -436,7 +448,7 @@ $(LIBAPI)-clean: | |||
436 | $(Q)$(MAKE) -C $(LIB_DIR) O=$(OUTPUT) clean >/dev/null | 448 | $(Q)$(MAKE) -C $(LIB_DIR) O=$(OUTPUT) clean >/dev/null |
437 | 449 | ||
438 | $(LIBBPF): fixdep FORCE | 450 | $(LIBBPF): fixdep FORCE |
439 | $(Q)$(MAKE) -C $(BPF_DIR) O=$(OUTPUT) $(OUTPUT)libbpf.a FEATURES_DUMP=$(realpath $(OUTPUT)FEATURE-DUMP) | 451 | $(Q)$(MAKE) -C $(BPF_DIR) O=$(OUTPUT) $(OUTPUT)libbpf.a FEATURES_DUMP=$(FEATURE_DUMP_EXPORT) |
440 | 452 | ||
441 | $(LIBBPF)-clean: | 453 | $(LIBBPF)-clean: |
442 | $(call QUIET_CLEAN, libbpf) | 454 | $(call QUIET_CLEAN, libbpf) |
@@ -611,6 +623,17 @@ clean: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean | |||
611 | $(python-clean) | 623 | $(python-clean) |
612 | 624 | ||
613 | # | 625 | # |
626 | # To provide FEATURE-DUMP into $(FEATURE_DUMP_COPY) | ||
627 | # file if defined, with no further action. | ||
628 | feature-dump: | ||
629 | ifdef FEATURE_DUMP_COPY | ||
630 | @cp $(OUTPUT)FEATURE-DUMP $(FEATURE_DUMP_COPY) | ||
631 | @echo "FEATURE-DUMP file copied into $(FEATURE_DUMP_COPY)" | ||
632 | else | ||
633 | @echo "FEATURE-DUMP file available in $(OUTPUT)FEATURE-DUMP" | ||
634 | endif | ||
635 | |||
636 | # | ||
614 | # Trick: if ../../.git does not exist - we are building out of tree for example, | 637 | # Trick: if ../../.git does not exist - we are building out of tree for example, |
615 | # then force version regeneration: | 638 | # then force version regeneration: |
616 | # | 639 | # |
diff --git a/tools/perf/arch/x86/tests/intel-cqm.c b/tools/perf/arch/x86/tests/intel-cqm.c index 3e89ba825f6b..7f064eb37158 100644 --- a/tools/perf/arch/x86/tests/intel-cqm.c +++ b/tools/perf/arch/x86/tests/intel-cqm.c | |||
@@ -17,7 +17,7 @@ static pid_t spawn(void) | |||
17 | if (pid) | 17 | if (pid) |
18 | return pid; | 18 | return pid; |
19 | 19 | ||
20 | while(1); | 20 | while(1) |
21 | sleep(5); | 21 | sleep(5); |
22 | return 0; | 22 | return 0; |
23 | } | 23 | } |
diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile index e5959c136a19..511141b102e8 100644 --- a/tools/perf/config/Makefile +++ b/tools/perf/config/Makefile | |||
@@ -181,7 +181,11 @@ LDFLAGS += -Wl,-z,noexecstack | |||
181 | 181 | ||
182 | EXTLIBS = -lpthread -lrt -lm -ldl | 182 | EXTLIBS = -lpthread -lrt -lm -ldl |
183 | 183 | ||
184 | ifeq ($(FEATURES_DUMP),) | ||
184 | include $(srctree)/tools/build/Makefile.feature | 185 | include $(srctree)/tools/build/Makefile.feature |
186 | else | ||
187 | include $(FEATURES_DUMP) | ||
188 | endif | ||
185 | 189 | ||
186 | ifeq ($(feature-stackprotector-all), 1) | 190 | ifeq ($(feature-stackprotector-all), 1) |
187 | CFLAGS += -fstack-protector-all | 191 | CFLAGS += -fstack-protector-all |
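
Together with the FEATURES_DUMP handling added to Makefile.perf above, this lets a build reuse an existing feature-detection result instead of re-running the probes: the new feature-dump target exports the FEATURE-DUMP file, and FEATURES_DUMP= points later builds (including the libbpf sub-make) at it. The intended flow looks roughly like the following; the /tmp path is only an example:

	$ make -C tools/perf feature-dump FEATURE_DUMP_COPY=/tmp/perf-features
	$ make -C tools/perf FEATURES_DUMP=/tmp/perf-features
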
diff --git a/tools/perf/tests/make b/tools/perf/tests/make index df38decc48c3..f918015512af 100644 --- a/tools/perf/tests/make +++ b/tools/perf/tests/make | |||
@@ -5,7 +5,7 @@ ifeq ($(MAKECMDGOALS),) | |||
5 | # no target specified, trigger the whole suite | 5 | # no target specified, trigger the whole suite |
6 | all: | 6 | all: |
7 | @echo "Testing Makefile"; $(MAKE) -sf tests/make MK=Makefile | 7 | @echo "Testing Makefile"; $(MAKE) -sf tests/make MK=Makefile |
8 | @echo "Testing Makefile.perf"; $(MAKE) -sf tests/make MK=Makefile.perf | 8 | @echo "Testing Makefile.perf"; $(MAKE) -sf tests/make MK=Makefile.perf SET_PARALLEL=1 SET_O=1 |
9 | else | 9 | else |
10 | # run only specific test over 'Makefile' | 10 | # run only specific test over 'Makefile' |
11 | %: | 11 | %: |
@@ -13,6 +13,26 @@ else | |||
13 | endif | 13 | endif |
14 | else | 14 | else |
15 | PERF := . | 15 | PERF := . |
16 | PERF_O := $(PERF) | ||
17 | O_OPT := | ||
18 | |||
19 | ifneq ($(O),) | ||
20 | FULL_O := $(shell readlink -f $(O) || echo $(O)) | ||
21 | PERF_O := $(FULL_O) | ||
22 | ifeq ($(SET_O),1) | ||
23 | O_OPT := 'O=$(FULL_O)' | ||
24 | endif | ||
25 | K_O_OPT := 'O=$(FULL_O)' | ||
26 | endif | ||
27 | |||
28 | PARALLEL_OPT= | ||
29 | ifeq ($(SET_PARALLEL),1) | ||
30 | cores := $(shell (getconf _NPROCESSORS_ONLN || egrep -c '^processor|^CPU[0-9]' /proc/cpuinfo) 2>/dev/null) | ||
31 | ifeq ($(cores),0) | ||
32 | cores := 1 | ||
33 | endif | ||
34 | PARALLEL_OPT="-j$(cores)" | ||
35 | endif | ||
16 | 36 | ||
17 | # As per kernel Makefile, avoid funny character set dependencies | 37 | # As per kernel Makefile, avoid funny character set dependencies |
18 | unexport LC_ALL | 38 | unexport LC_ALL |
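
SET_PARALLEL sizes the -j option from the online CPU count, falling back to /proc/cpuinfo and finally to 1. The same figure is what sysconf() reports directly; a tiny stand-alone comparison program:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	long cores = sysconf(_SC_NPROCESSORS_ONLN);

	if (cores < 1)
		cores = 1;	/* same last-resort fallback as the makefile */

	printf("-j%ld\n", cores);
	return 0;
}
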
@@ -156,11 +176,11 @@ test_make_doc := $(test_ok) | |||
156 | test_make_help_O := $(test_ok) | 176 | test_make_help_O := $(test_ok) |
157 | test_make_doc_O := $(test_ok) | 177 | test_make_doc_O := $(test_ok) |
158 | 178 | ||
159 | test_make_python_perf_so := test -f $(PERF)/python/perf.so | 179 | test_make_python_perf_so := test -f $(PERF_O)/python/perf.so |
160 | 180 | ||
161 | test_make_perf_o := test -f $(PERF)/perf.o | 181 | test_make_perf_o := test -f $(PERF_O)/perf.o |
162 | test_make_util_map_o := test -f $(PERF)/util/map.o | 182 | test_make_util_map_o := test -f $(PERF_O)/util/map.o |
163 | test_make_util_pmu_bison_o := test -f $(PERF)/util/pmu-bison.o | 183 | test_make_util_pmu_bison_o := test -f $(PERF_O)/util/pmu-bison.o |
164 | 184 | ||
165 | define test_dest_files | 185 | define test_dest_files |
166 | for file in $(1); do \ | 186 | for file in $(1); do \ |
@@ -227,7 +247,7 @@ test_make_perf_o_O := test -f $$TMP_O/perf.o | |||
227 | test_make_util_map_o_O := test -f $$TMP_O/util/map.o | 247 | test_make_util_map_o_O := test -f $$TMP_O/util/map.o |
228 | test_make_util_pmu_bison_o_O := test -f $$TMP_O/util/pmu-bison.o | 248 | test_make_util_pmu_bison_o_O := test -f $$TMP_O/util/pmu-bison.o |
229 | 249 | ||
230 | test_default = test -x $(PERF)/perf | 250 | test_default = test -x $(PERF_O)/perf |
231 | test = $(if $(test_$1),$(test_$1),$(test_default)) | 251 | test = $(if $(test_$1),$(test_$1),$(test_default)) |
232 | 252 | ||
233 | test_default_O = test -x $$TMP_O/perf | 253 | test_default_O = test -x $$TMP_O/perf |
@@ -247,12 +267,12 @@ endif | |||
247 | 267 | ||
248 | MAKEFLAGS := --no-print-directory | 268 | MAKEFLAGS := --no-print-directory |
249 | 269 | ||
250 | clean := @(cd $(PERF); make -s -f $(MK) clean >/dev/null) | 270 | clean := @(cd $(PERF); make -s -f $(MK) $(O_OPT) clean >/dev/null) |
251 | 271 | ||
252 | $(run): | 272 | $(run): |
253 | $(call clean) | 273 | $(call clean) |
254 | @TMP_DEST=$$(mktemp -d); \ | 274 | @TMP_DEST=$$(mktemp -d); \ |
255 | cmd="cd $(PERF) && make -f $(MK) DESTDIR=$$TMP_DEST $($@)"; \ | 275 | cmd="cd $(PERF) && make -f $(MK) $(PARALLEL_OPT) $(O_OPT) DESTDIR=$$TMP_DEST $($@)"; \ |
256 | echo "- $@: $$cmd" && echo $$cmd > $@ && \ | 276 | echo "- $@: $$cmd" && echo $$cmd > $@ && \ |
257 | ( eval $$cmd ) >> $@ 2>&1; \ | 277 | ( eval $$cmd ) >> $@ 2>&1; \ |
258 | echo " test: $(call test,$@)" >> $@ 2>&1; \ | 278 | echo " test: $(call test,$@)" >> $@ 2>&1; \ |
@@ -263,7 +283,7 @@ $(run_O): | |||
263 | $(call clean) | 283 | $(call clean) |
264 | @TMP_O=$$(mktemp -d); \ | 284 | @TMP_O=$$(mktemp -d); \ |
265 | TMP_DEST=$$(mktemp -d); \ | 285 | TMP_DEST=$$(mktemp -d); \ |
266 | cmd="cd $(PERF) && make -f $(MK) O=$$TMP_O DESTDIR=$$TMP_DEST $($(patsubst %_O,%,$@))"; \ | 286 | cmd="cd $(PERF) && make -f $(MK) $(PARALLEL_OPT) O=$$TMP_O DESTDIR=$$TMP_DEST $($(patsubst %_O,%,$@))"; \ |
267 | echo "- $@: $$cmd" && echo $$cmd > $@ && \ | 287 | echo "- $@: $$cmd" && echo $$cmd > $@ && \ |
268 | ( eval $$cmd ) >> $@ 2>&1 && \ | 288 | ( eval $$cmd ) >> $@ 2>&1 && \ |
269 | echo " test: $(call test_O,$@)" >> $@ 2>&1; \ | 289 | echo " test: $(call test_O,$@)" >> $@ 2>&1; \ |
@@ -276,17 +296,22 @@ tarpkg: | |||
276 | ( eval $$cmd ) >> $@ 2>&1 && \ | 296 | ( eval $$cmd ) >> $@ 2>&1 && \ |
277 | rm -f $@ | 297 | rm -f $@ |
278 | 298 | ||
299 | KERNEL_O := ../.. | ||
300 | ifneq ($(O),) | ||
301 | KERNEL_O := $(O) | ||
302 | endif | ||
303 | |||
279 | make_kernelsrc: | 304 | make_kernelsrc: |
280 | @echo "- make -C <kernelsrc> tools/perf" | 305 | @echo "- make -C <kernelsrc> $(PARALLEL_OPT) $(K_O_OPT) tools/perf" |
281 | $(call clean); \ | 306 | $(call clean); \ |
282 | (make -C ../.. tools/perf) > $@ 2>&1 && \ | 307 | (make -C ../.. $(PARALLEL_OPT) $(K_O_OPT) tools/perf) > $@ 2>&1 && \ |
283 | test -x perf && rm -f $@ || (cat $@ ; false) | 308 | test -x $(KERNEL_O)/tools/perf/perf && rm -f $@ || (cat $@ ; false) |
284 | 309 | ||
285 | make_kernelsrc_tools: | 310 | make_kernelsrc_tools: |
286 | @echo "- make -C <kernelsrc>/tools perf" | 311 | @echo "- make -C <kernelsrc>/tools $(PARALLEL_OPT) $(K_O_OPT) perf" |
287 | $(call clean); \ | 312 | $(call clean); \ |
288 | (make -C ../../tools perf) > $@ 2>&1 && \ | 313 | (make -C ../../tools $(PARALLEL_OPT) $(K_O_OPT) perf) > $@ 2>&1 && \ |
289 | test -x perf && rm -f $@ || (cat $@ ; false) | 314 | test -x $(KERNEL_O)/tools/perf/perf && rm -f $@ || (cat $@ ; false) |
290 | 315 | ||
291 | all: $(run) $(run_O) tarpkg make_kernelsrc make_kernelsrc_tools | 316 | all: $(run) $(run_O) tarpkg make_kernelsrc make_kernelsrc_tools |
292 | @echo OK | 317 | @echo OK |
diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index d4d7cc27252f..718bd46d47fa 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c | |||
@@ -755,11 +755,11 @@ static int annotate_browser__run(struct annotate_browser *browser, | |||
755 | nd = browser->curr_hot; | 755 | nd = browser->curr_hot; |
756 | break; | 756 | break; |
757 | case K_UNTAB: | 757 | case K_UNTAB: |
758 | if (nd != NULL) | 758 | if (nd != NULL) { |
759 | nd = rb_next(nd); | 759 | nd = rb_next(nd); |
760 | if (nd == NULL) | 760 | if (nd == NULL) |
761 | nd = rb_first(&browser->entries); | 761 | nd = rb_first(&browser->entries); |
762 | else | 762 | } else |
763 | nd = browser->curr_hot; | 763 | nd = browser->curr_hot; |
764 | break; | 764 | break; |
765 | case K_F1: | 765 | case K_F1: |
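Note on the annotate.c hunk above: it is a brace fix, so K_UNTAB only advances and wraps the cursor when a current node exists, and falls back to the hottest entry otherwise. A minimal sketch of the corrected control flow, assuming a hypothetical pick_next_hot() helper (rb_next()/rb_first() are the stock rbtree iterators used by the browser code):

    /* Hypothetical helper mirroring the fixed K_UNTAB flow: advance from
     * the current node, wrap to the first entry at the end, and only fall
     * back to curr_hot when there is no current node at all. */
    static struct rb_node *pick_next_hot(struct rb_node *nd,
                                         struct rb_root *entries,
                                         struct rb_node *curr_hot)
    {
            if (nd != NULL) {
                    nd = rb_next(nd);               /* step forward...          */
                    if (nd == NULL)
                            nd = rb_first(entries); /* ...wrapping to the start */
            } else {
                    nd = curr_hot;                  /* no cursor yet: start hot */
            }
            return nd;
    }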
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index c226303e3da0..68a7612019dc 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c | |||
@@ -131,6 +131,8 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h) | |||
131 | symlen = unresolved_col_width + 4 + 2; | 131 | symlen = unresolved_col_width + 4 + 2; |
132 | hists__new_col_len(hists, HISTC_MEM_DADDR_SYMBOL, | 132 | hists__new_col_len(hists, HISTC_MEM_DADDR_SYMBOL, |
133 | symlen); | 133 | symlen); |
134 | hists__new_col_len(hists, HISTC_MEM_DCACHELINE, | ||
135 | symlen); | ||
134 | } | 136 | } |
135 | 137 | ||
136 | if (h->mem_info->iaddr.sym) { | 138 | if (h->mem_info->iaddr.sym) { |
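Note on the hist.c hunk: in the unresolved data-symbol branch shown, the HISTC_MEM_DCACHELINE column is now widened with the same symlen as HISTC_MEM_DADDR_SYMBOL. hists__new_col_len() essentially keeps the maximum width seen per column; a simplified sketch of that behaviour, using a hypothetical col_len_update() helper rather than the actual implementation:

    /* Simplified idea of a per-column width update: remember the widest
     * value seen so far and report whether it grew.  The real
     * hists__new_col_len() in tools/perf/util/hist.c tracks this per
     * hist_column on the hists object. */
    static bool col_len_update(unsigned short *col_len, unsigned short len)
    {
            if (len > *col_len) {
                    *col_len = len;
                    return true;
            }
            return false;
    }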
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index d5636ba94b20..40b7a0d0905b 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c | |||
@@ -1149,7 +1149,7 @@ static struct machine *machines__find_for_cpumode(struct machines *machines, | |||
1149 | 1149 | ||
1150 | machine = machines__find(machines, pid); | 1150 | machine = machines__find(machines, pid); |
1151 | if (!machine) | 1151 | if (!machine) |
1152 | machine = machines__find(machines, DEFAULT_GUEST_KERNEL_ID); | 1152 | machine = machines__findnew(machines, DEFAULT_GUEST_KERNEL_ID); |
1153 | return machine; | 1153 | return machine; |
1154 | } | 1154 | } |
1155 | 1155 | ||
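Note on the session.c hunk: swapping machines__find() for machines__findnew() on the DEFAULT_GUEST_KERNEL_ID fallback means the default guest machine is created on demand instead of the lookup returning NULL when no guest machine was registered beforehand. A rough sketch of the find-or-create pattern, assuming machines__add() with its (machines, pid, root_dir) signature and an empty root_dir as a placeholder; the real machines__findnew() in tools/perf/util/machine.c does more than this:

    /* Find-or-create sketch of what the switch buys: a machine that was
     * never added is created rather than reported as missing. */
    static struct machine *findnew_sketch(struct machines *machines, pid_t pid)
    {
            struct machine *m = machines__find(machines, pid);

            if (m == NULL)
                    m = machines__add(machines, pid, "" /* placeholder root_dir */);
            return m;
    }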
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 2f901d15e063..2b58edccd56f 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c | |||
@@ -310,7 +310,6 @@ int perf_stat_process_counter(struct perf_stat_config *config, | |||
310 | int i, ret; | 310 | int i, ret; |
311 | 311 | ||
312 | aggr->val = aggr->ena = aggr->run = 0; | 312 | aggr->val = aggr->ena = aggr->run = 0; |
313 | init_stats(ps->res_stats); | ||
314 | 313 | ||
315 | if (counter->per_pkg) | 314 | if (counter->per_pkg) |
316 | zero_per_pkg(counter); | 315 | zero_per_pkg(counter); |
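Note on the stat.c hunk: dropping the init_stats(ps->res_stats) call stops perf_stat_process_counter() from zeroing the per-event result statistics on every pass, so values accumulated across repeated runs are preserved. For context, a rough sketch of the running-stats accumulator that init_stats() resets, approximating the struct stats / update_stats() helpers (field layout simplified, not the exact definition):

    /* Simplified Welford-style accumulator: init zeroes it, update folds
     * in one sample.  Re-initialising on every counter pass would discard
     * everything accumulated so far. */
    struct run_stats { double n, mean, M2; };

    static void run_stats_init(struct run_stats *s)
    {
            s->n = 0.0;
            s->mean = 0.0;
            s->M2 = 0.0;
    }

    static void run_stats_update(struct run_stats *s, double val)
    {
            double delta;

            s->n += 1.0;
            delta = val - s->mean;
            s->mean += delta / s->n;
            s->M2 += delta * (val - s->mean);
    }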
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 3b2de6eb3376..ab02209a7cf3 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c | |||
@@ -1466,7 +1466,7 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter) | |||
1466 | * Read the build id if possible. This is required for | 1466 | * Read the build id if possible. This is required for |
1467 | * DSO_BINARY_TYPE__BUILDID_DEBUGINFO to work | 1467 | * DSO_BINARY_TYPE__BUILDID_DEBUGINFO to work |
1468 | */ | 1468 | */ |
1469 | if (filename__read_build_id(dso->name, build_id, BUILD_ID_SIZE) > 0) | 1469 | if (filename__read_build_id(dso->long_name, build_id, BUILD_ID_SIZE) > 0) |
1470 | dso__set_build_id(dso, build_id); | 1470 | dso__set_build_id(dso, build_id); |
1471 | 1471 | ||
1472 | /* | 1472 | /* |