author     Linus Torvalds <torvalds@linux-foundation.org>   2016-01-31 18:38:27 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2016-01-31 18:38:27 -0500
commit     29d14f083522e5bc762256f68227d267118946c8 (patch)
tree       124ae23890efad2ac482f84d525779ed44329875 /kernel
parent     bbfb239a106d41d793f58befdaf5c806e34ea97e (diff)
parent     28fb8a5b6e233fc384fb27f9f91f811b40ba9cf8 (diff)
Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull perf fixes from Thomas Gleixner:
 "This is much bigger than typical fixes, but Peter found a category of
  races that spurred more fixes and more debugging enhancements. Work
  started before the merge window, but got finished only now.

  Aside of that this contains the usual small fixes to perf and tools.
  Nothing particular exciting"

* 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (43 commits)
  perf: Remove/simplify lockdep annotation
  perf: Synchronously clean up child events
  perf: Untangle 'owner' confusion
  perf: Add flags argument to perf_remove_from_context()
  perf: Clean up sync_child_event()
  perf: Robustify event->owner usage and SMP ordering
  perf: Fix STATE_EXIT usage
  perf: Update locking order
  perf: Remove __free_event()
  perf/bpf: Convert perf_event_array to use struct file
  perf: Fix NULL deref
  perf/x86: De-obfuscate code
  perf/x86: Fix uninitialized value usage
  perf: Fix race in perf_event_exit_task_context()
  perf: Fix orphan hole
  perf stat: Do not clean event's private stats
  perf hists: Fix HISTC_MEM_DCACHELINE width setting
  perf annotate browser: Fix behaviour of Shift-Tab with nothing focussed
  perf tests: Remove wrong semicolon in while loop in CQM test
  perf: Synchronously free aux pages in case of allocation failure
  ...
Diffstat (limited to 'kernel')
 -rw-r--r--  kernel/bpf/arraymap.c            21
 -rw-r--r--  kernel/events/core.c           1199
 -rw-r--r--  kernel/events/hw_breakpoint.c     2
 -rw-r--r--  kernel/events/ring_buffer.c      40
 -rw-r--r--  kernel/trace/bpf_trace.c         14
 5 files changed, 641 insertions, 635 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index b0799bced518..89ebbc4d1164 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -291,10 +291,13 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
 {
 	struct perf_event *event;
 	const struct perf_event_attr *attr;
+	struct file *file;
 
-	event = perf_event_get(fd);
-	if (IS_ERR(event))
-		return event;
+	file = perf_event_get(fd);
+	if (IS_ERR(file))
+		return file;
+
+	event = file->private_data;
 
 	attr = perf_event_attrs(event);
 	if (IS_ERR(attr))
@@ -304,24 +307,22 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
 		goto err;
 
 	if (attr->type == PERF_TYPE_RAW)
-		return event;
+		return file;
 
 	if (attr->type == PERF_TYPE_HARDWARE)
-		return event;
+		return file;
 
 	if (attr->type == PERF_TYPE_SOFTWARE &&
 	    attr->config == PERF_COUNT_SW_BPF_OUTPUT)
-		return event;
+		return file;
 err:
-	perf_event_release_kernel(event);
+	fput(file);
 	return ERR_PTR(-EINVAL);
 }
 
 static void perf_event_fd_array_put_ptr(void *ptr)
 {
-	struct perf_event *event = ptr;
-
-	perf_event_release_kernel(event);
+	fput((struct file *)ptr);
 }
 
 static const struct bpf_map_ops perf_event_array_ops = {
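
The arraymap change above makes each perf_event_array slot pin the struct file behind the perf fd instead of a bare perf_event. A minimal sketch of the resulting ownership rule, assuming only what the hunks above show; the helper names below are hypothetical, not kernel API, and the snippet is not part of the patch:

static struct perf_event *event_from_slot(void *ptr)	/* hypothetical helper */
{
	struct file *file = ptr;		/* what perf_event_fd_array_get_ptr() stored */

	return file->private_data;		/* the perf_event behind that perf fd */
}

static void release_slot(void *ptr)		/* mirrors perf_event_fd_array_put_ptr() */
{
	fput((struct file *)ptr);		/* drop the reference taken via perf_event_get() */
}

Pinning the file rather than the event keeps the normal fd-release path (perf_release() -> perf_event_release_kernel(), reworked in core.c below) as the single teardown point for the event.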
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 06ae52e99ac2..5946460b2425 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -49,8 +49,6 @@
49 49
50#include <asm/irq_regs.h> 50#include <asm/irq_regs.h>
51 51
52static struct workqueue_struct *perf_wq;
53
54typedef int (*remote_function_f)(void *); 52typedef int (*remote_function_f)(void *);
55 53
56struct remote_function_call { 54struct remote_function_call {
@@ -126,44 +124,181 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info)
126 return data.ret; 124 return data.ret;
127} 125}
128 126
129static void event_function_call(struct perf_event *event, 127static inline struct perf_cpu_context *
130 int (*active)(void *), 128__get_cpu_context(struct perf_event_context *ctx)
131 void (*inactive)(void *), 129{
132 void *data) 130 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
131}
132
133static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
134 struct perf_event_context *ctx)
135{
136 raw_spin_lock(&cpuctx->ctx.lock);
137 if (ctx)
138 raw_spin_lock(&ctx->lock);
139}
140
141static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
142 struct perf_event_context *ctx)
143{
144 if (ctx)
145 raw_spin_unlock(&ctx->lock);
146 raw_spin_unlock(&cpuctx->ctx.lock);
147}
148
149#define TASK_TOMBSTONE ((void *)-1L)
150
151static bool is_kernel_event(struct perf_event *event)
152{
153 return READ_ONCE(event->owner) == TASK_TOMBSTONE;
154}
155
156/*
157 * On task ctx scheduling...
158 *
159 * When !ctx->nr_events a task context will not be scheduled. This means
160 * we can disable the scheduler hooks (for performance) without leaving
161 * pending task ctx state.
162 *
163 * This however results in two special cases:
164 *
165 * - removing the last event from a task ctx; this is relatively straight
166 * forward and is done in __perf_remove_from_context.
167 *
168 * - adding the first event to a task ctx; this is tricky because we cannot
169 * rely on ctx->is_active and therefore cannot use event_function_call().
170 * See perf_install_in_context().
171 *
172 * This is because we need a ctx->lock serialized variable (ctx->is_active)
173 * to reliably determine if a particular task/context is scheduled in. The
174 * task_curr() use in task_function_call() is racy in that a remote context
175 * switch is not a single atomic operation.
176 *
177 * As is, the situation is 'safe' because we set rq->curr before we do the
178 * actual context switch. This means that task_curr() will fail early, but
179 * we'll continue spinning on ctx->is_active until we've passed
180 * perf_event_task_sched_out().
181 *
182 * Without this ctx->lock serialized variable we could have race where we find
183 * the task (and hence the context) would not be active while in fact they are.
184 *
185 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
186 */
187
188typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
189 struct perf_event_context *, void *);
190
191struct event_function_struct {
192 struct perf_event *event;
193 event_f func;
194 void *data;
195};
196
197static int event_function(void *info)
198{
199 struct event_function_struct *efs = info;
200 struct perf_event *event = efs->event;
201 struct perf_event_context *ctx = event->ctx;
202 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
203 struct perf_event_context *task_ctx = cpuctx->task_ctx;
204 int ret = 0;
205
206 WARN_ON_ONCE(!irqs_disabled());
207
208 perf_ctx_lock(cpuctx, task_ctx);
209 /*
210 * Since we do the IPI call without holding ctx->lock things can have
211 * changed, double check we hit the task we set out to hit.
212 */
213 if (ctx->task) {
214 if (ctx->task != current) {
215 ret = -EAGAIN;
216 goto unlock;
217 }
218
219 /*
220 * We only use event_function_call() on established contexts,
221 * and event_function() is only ever called when active (or
222 * rather, we'll have bailed in task_function_call() or the
223 * above ctx->task != current test), therefore we must have
224 * ctx->is_active here.
225 */
226 WARN_ON_ONCE(!ctx->is_active);
227 /*
228 * And since we have ctx->is_active, cpuctx->task_ctx must
229 * match.
230 */
231 WARN_ON_ONCE(task_ctx != ctx);
232 } else {
233 WARN_ON_ONCE(&cpuctx->ctx != ctx);
234 }
235
236 efs->func(event, cpuctx, ctx, efs->data);
237unlock:
238 perf_ctx_unlock(cpuctx, task_ctx);
239
240 return ret;
241}
242
243static void event_function_local(struct perf_event *event, event_f func, void *data)
244{
245 struct event_function_struct efs = {
246 .event = event,
247 .func = func,
248 .data = data,
249 };
250
251 int ret = event_function(&efs);
252 WARN_ON_ONCE(ret);
253}
254
255static void event_function_call(struct perf_event *event, event_f func, void *data)
133{ 256{
134 struct perf_event_context *ctx = event->ctx; 257 struct perf_event_context *ctx = event->ctx;
135 struct task_struct *task = ctx->task; 258 struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
259 struct event_function_struct efs = {
260 .event = event,
261 .func = func,
262 .data = data,
263 };
264
265 if (!event->parent) {
266 /*
267 * If this is a !child event, we must hold ctx::mutex to
268 * stabilize the the event->ctx relation. See
269 * perf_event_ctx_lock().
270 */
271 lockdep_assert_held(&ctx->mutex);
272 }
136 273
137 if (!task) { 274 if (!task) {
138 cpu_function_call(event->cpu, active, data); 275 cpu_function_call(event->cpu, event_function, &efs);
139 return; 276 return;
140 } 277 }
141 278
142again: 279again:
143 if (!task_function_call(task, active, data)) 280 if (task == TASK_TOMBSTONE)
281 return;
282
283 if (!task_function_call(task, event_function, &efs))
144 return; 284 return;
145 285
146 raw_spin_lock_irq(&ctx->lock); 286 raw_spin_lock_irq(&ctx->lock);
147 if (ctx->is_active) { 287 /*
148 /* 288 * Reload the task pointer, it might have been changed by
149 * Reload the task pointer, it might have been changed by 289 * a concurrent perf_event_context_sched_out().
150 * a concurrent perf_event_context_sched_out(). 290 */
151 */ 291 task = ctx->task;
152 task = ctx->task; 292 if (task != TASK_TOMBSTONE) {
153 raw_spin_unlock_irq(&ctx->lock); 293 if (ctx->is_active) {
154 goto again; 294 raw_spin_unlock_irq(&ctx->lock);
295 goto again;
296 }
297 func(event, NULL, ctx, data);
155 } 298 }
156 inactive(data);
157 raw_spin_unlock_irq(&ctx->lock); 299 raw_spin_unlock_irq(&ctx->lock);
158} 300}
159 301
160#define EVENT_OWNER_KERNEL ((void *) -1)
161
162static bool is_kernel_event(struct perf_event *event)
163{
164 return event->owner == EVENT_OWNER_KERNEL;
165}
166
167#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ 302#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
168 PERF_FLAG_FD_OUTPUT |\ 303 PERF_FLAG_FD_OUTPUT |\
169 PERF_FLAG_PID_CGROUP |\ 304 PERF_FLAG_PID_CGROUP |\
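
With the rework above, cross-CPU callers no longer pass separate active/inactive callbacks: event_function_call() takes one function with the event_f signature plus an opaque argument. A minimal sketch of a caller under that contract, not part of the patch; the callback here is made up for illustration, while real users such as __perf_event_disable() appear later in this diff:

static void example_event_func(struct perf_event *event,
			       struct perf_cpu_context *cpuctx,
			       struct perf_event_context *ctx,
			       void *info)
{
	/* Runs on the target CPU with the relevant ctx locks held and IRQs off. */
}

static void example_caller(struct perf_event *event)
{
	/* For a !child event the caller must hold ctx->mutex, as asserted above. */
	event_function_call(event, example_event_func, NULL);
}

event_function_local() offers the same callback shape for the already-local case and simply warns if event_function() reports a mismatch.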
@@ -368,28 +503,6 @@ static inline u64 perf_event_clock(struct perf_event *event)
368 return event->clock(); 503 return event->clock();
369} 504}
370 505
371static inline struct perf_cpu_context *
372__get_cpu_context(struct perf_event_context *ctx)
373{
374 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
375}
376
377static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
378 struct perf_event_context *ctx)
379{
380 raw_spin_lock(&cpuctx->ctx.lock);
381 if (ctx)
382 raw_spin_lock(&ctx->lock);
383}
384
385static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
386 struct perf_event_context *ctx)
387{
388 if (ctx)
389 raw_spin_unlock(&ctx->lock);
390 raw_spin_unlock(&cpuctx->ctx.lock);
391}
392
393#ifdef CONFIG_CGROUP_PERF 506#ifdef CONFIG_CGROUP_PERF
394 507
395static inline bool 508static inline bool
@@ -579,13 +692,7 @@ static inline void perf_cgroup_sched_out(struct task_struct *task,
579 * we are holding the rcu lock 692 * we are holding the rcu lock
580 */ 693 */
581 cgrp1 = perf_cgroup_from_task(task, NULL); 694 cgrp1 = perf_cgroup_from_task(task, NULL);
582 695 cgrp2 = perf_cgroup_from_task(next, NULL);
583 /*
584 * next is NULL when called from perf_event_enable_on_exec()
585 * that will systematically cause a cgroup_switch()
586 */
587 if (next)
588 cgrp2 = perf_cgroup_from_task(next, NULL);
589 696
590 /* 697 /*
591 * only schedule out current cgroup events if we know 698 * only schedule out current cgroup events if we know
@@ -611,8 +718,6 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev,
611 * we are holding the rcu lock 718 * we are holding the rcu lock
612 */ 719 */
613 cgrp1 = perf_cgroup_from_task(task, NULL); 720 cgrp1 = perf_cgroup_from_task(task, NULL);
614
615 /* prev can never be NULL */
616 cgrp2 = perf_cgroup_from_task(prev, NULL); 721 cgrp2 = perf_cgroup_from_task(prev, NULL);
617 722
618 /* 723 /*
@@ -917,7 +1022,7 @@ static void put_ctx(struct perf_event_context *ctx)
917 if (atomic_dec_and_test(&ctx->refcount)) { 1022 if (atomic_dec_and_test(&ctx->refcount)) {
918 if (ctx->parent_ctx) 1023 if (ctx->parent_ctx)
919 put_ctx(ctx->parent_ctx); 1024 put_ctx(ctx->parent_ctx);
920 if (ctx->task) 1025 if (ctx->task && ctx->task != TASK_TOMBSTONE)
921 put_task_struct(ctx->task); 1026 put_task_struct(ctx->task);
922 call_rcu(&ctx->rcu_head, free_ctx); 1027 call_rcu(&ctx->rcu_head, free_ctx);
923 } 1028 }
@@ -934,9 +1039,8 @@ static void put_ctx(struct perf_event_context *ctx)
934 * perf_event_context::mutex nests and those are: 1039 * perf_event_context::mutex nests and those are:
935 * 1040 *
936 * - perf_event_exit_task_context() [ child , 0 ] 1041 * - perf_event_exit_task_context() [ child , 0 ]
937 * __perf_event_exit_task() 1042 * perf_event_exit_event()
938 * sync_child_event() 1043 * put_event() [ parent, 1 ]
939 * put_event() [ parent, 1 ]
940 * 1044 *
941 * - perf_event_init_context() [ parent, 0 ] 1045 * - perf_event_init_context() [ parent, 0 ]
942 * inherit_task_group() 1046 * inherit_task_group()
@@ -979,8 +1083,8 @@ static void put_ctx(struct perf_event_context *ctx)
979 * Lock order: 1083 * Lock order:
980 * task_struct::perf_event_mutex 1084 * task_struct::perf_event_mutex
981 * perf_event_context::mutex 1085 * perf_event_context::mutex
982 * perf_event_context::lock
983 * perf_event::child_mutex; 1086 * perf_event::child_mutex;
1087 * perf_event_context::lock
984 * perf_event::mmap_mutex 1088 * perf_event::mmap_mutex
985 * mmap_sem 1089 * mmap_sem
986 */ 1090 */
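
The reordered lock documentation above (perf_event::child_mutex now nests inside perf_event_context::mutex, with perf_event_context::lock innermost) is what the reworked perf_event_release_kernel() further down relies on. A hedged sketch of code honouring the documented order; the function is illustrative only, not a real call site:

static void example_lock_order(struct perf_event *event,
			       struct perf_event_context *ctx)
{
	mutex_lock(&ctx->mutex);		/* perf_event_context::mutex */
	mutex_lock(&event->child_mutex);	/* then perf_event::child_mutex */
	raw_spin_lock_irq(&ctx->lock);		/* perf_event_context::lock innermost */
	/* ... */
	raw_spin_unlock_irq(&ctx->lock);
	mutex_unlock(&event->child_mutex);
	mutex_unlock(&ctx->mutex);
}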
@@ -1078,6 +1182,7 @@ static u64 primary_event_id(struct perf_event *event)
1078 1182
1079/* 1183/*
1080 * Get the perf_event_context for a task and lock it. 1184 * Get the perf_event_context for a task and lock it.
1185 *
1081 * This has to cope with with the fact that until it is locked, 1186 * This has to cope with with the fact that until it is locked,
1082 * the context could get moved to another task. 1187 * the context could get moved to another task.
1083 */ 1188 */
@@ -1118,9 +1223,12 @@ retry:
1118 goto retry; 1223 goto retry;
1119 } 1224 }
1120 1225
1121 if (!atomic_inc_not_zero(&ctx->refcount)) { 1226 if (ctx->task == TASK_TOMBSTONE ||
1227 !atomic_inc_not_zero(&ctx->refcount)) {
1122 raw_spin_unlock(&ctx->lock); 1228 raw_spin_unlock(&ctx->lock);
1123 ctx = NULL; 1229 ctx = NULL;
1230 } else {
1231 WARN_ON_ONCE(ctx->task != task);
1124 } 1232 }
1125 } 1233 }
1126 rcu_read_unlock(); 1234 rcu_read_unlock();
@@ -1246,6 +1354,8 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
1246static void 1354static void
1247list_add_event(struct perf_event *event, struct perf_event_context *ctx) 1355list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1248{ 1356{
1357 lockdep_assert_held(&ctx->lock);
1358
1249 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); 1359 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1250 event->attach_state |= PERF_ATTACH_CONTEXT; 1360 event->attach_state |= PERF_ATTACH_CONTEXT;
1251 1361
@@ -1448,11 +1558,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1448 1558
1449 if (is_cgroup_event(event)) { 1559 if (is_cgroup_event(event)) {
1450 ctx->nr_cgroups--; 1560 ctx->nr_cgroups--;
1561 /*
1562 * Because cgroup events are always per-cpu events, this will
1563 * always be called from the right CPU.
1564 */
1451 cpuctx = __get_cpu_context(ctx); 1565 cpuctx = __get_cpu_context(ctx);
1452 /* 1566 /*
1453 * if there are no more cgroup events 1567 * If there are no more cgroup events then clear cgrp to avoid
1454 * then cler cgrp to avoid stale pointer 1568 * stale pointer in update_cgrp_time_from_cpuctx().
1455 * in update_cgrp_time_from_cpuctx()
1456 */ 1569 */
1457 if (!ctx->nr_cgroups) 1570 if (!ctx->nr_cgroups)
1458 cpuctx->cgrp = NULL; 1571 cpuctx->cgrp = NULL;
@@ -1530,45 +1643,11 @@ out:
1530 perf_event__header_size(tmp); 1643 perf_event__header_size(tmp);
1531} 1644}
1532 1645
1533/*
1534 * User event without the task.
1535 */
1536static bool is_orphaned_event(struct perf_event *event) 1646static bool is_orphaned_event(struct perf_event *event)
1537{ 1647{
1538 return event && !is_kernel_event(event) && !event->owner; 1648 return event->state == PERF_EVENT_STATE_EXIT;
1539} 1649}
1540 1650
1541/*
1542 * Event has a parent but parent's task finished and it's
1543 * alive only because of children holding refference.
1544 */
1545static bool is_orphaned_child(struct perf_event *event)
1546{
1547 return is_orphaned_event(event->parent);
1548}
1549
1550static void orphans_remove_work(struct work_struct *work);
1551
1552static void schedule_orphans_remove(struct perf_event_context *ctx)
1553{
1554 if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
1555 return;
1556
1557 if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
1558 get_ctx(ctx);
1559 ctx->orphans_remove_sched = true;
1560 }
1561}
1562
1563static int __init perf_workqueue_init(void)
1564{
1565 perf_wq = create_singlethread_workqueue("perf");
1566 WARN(!perf_wq, "failed to create perf workqueue\n");
1567 return perf_wq ? 0 : -1;
1568}
1569
1570core_initcall(perf_workqueue_init);
1571
1572static inline int pmu_filter_match(struct perf_event *event) 1651static inline int pmu_filter_match(struct perf_event *event)
1573{ 1652{
1574 struct pmu *pmu = event->pmu; 1653 struct pmu *pmu = event->pmu;
@@ -1629,9 +1708,6 @@ event_sched_out(struct perf_event *event,
1629 if (event->attr.exclusive || !cpuctx->active_oncpu) 1708 if (event->attr.exclusive || !cpuctx->active_oncpu)
1630 cpuctx->exclusive = 0; 1709 cpuctx->exclusive = 0;
1631 1710
1632 if (is_orphaned_child(event))
1633 schedule_orphans_remove(ctx);
1634
1635 perf_pmu_enable(event->pmu); 1711 perf_pmu_enable(event->pmu);
1636} 1712}
1637 1713
@@ -1655,21 +1731,8 @@ group_sched_out(struct perf_event *group_event,
1655 cpuctx->exclusive = 0; 1731 cpuctx->exclusive = 0;
1656} 1732}
1657 1733
1658struct remove_event { 1734#define DETACH_GROUP 0x01UL
1659 struct perf_event *event; 1735#define DETACH_STATE 0x02UL
1660 bool detach_group;
1661};
1662
1663static void ___perf_remove_from_context(void *info)
1664{
1665 struct remove_event *re = info;
1666 struct perf_event *event = re->event;
1667 struct perf_event_context *ctx = event->ctx;
1668
1669 if (re->detach_group)
1670 perf_group_detach(event);
1671 list_del_event(event, ctx);
1672}
1673 1736
1674/* 1737/*
1675 * Cross CPU call to remove a performance event 1738 * Cross CPU call to remove a performance event
@@ -1677,33 +1740,33 @@ static void ___perf_remove_from_context(void *info)
1677 * We disable the event on the hardware level first. After that we 1740 * We disable the event on the hardware level first. After that we
1678 * remove it from the context list. 1741 * remove it from the context list.
1679 */ 1742 */
1680static int __perf_remove_from_context(void *info) 1743static void
1744__perf_remove_from_context(struct perf_event *event,
1745 struct perf_cpu_context *cpuctx,
1746 struct perf_event_context *ctx,
1747 void *info)
1681{ 1748{
1682 struct remove_event *re = info; 1749 unsigned long flags = (unsigned long)info;
1683 struct perf_event *event = re->event;
1684 struct perf_event_context *ctx = event->ctx;
1685 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1686 1750
1687 raw_spin_lock(&ctx->lock);
1688 event_sched_out(event, cpuctx, ctx); 1751 event_sched_out(event, cpuctx, ctx);
1689 if (re->detach_group) 1752 if (flags & DETACH_GROUP)
1690 perf_group_detach(event); 1753 perf_group_detach(event);
1691 list_del_event(event, ctx); 1754 list_del_event(event, ctx);
1692 if (!ctx->nr_events && cpuctx->task_ctx == ctx) { 1755 if (flags & DETACH_STATE)
1756 event->state = PERF_EVENT_STATE_EXIT;
1757
1758 if (!ctx->nr_events && ctx->is_active) {
1693 ctx->is_active = 0; 1759 ctx->is_active = 0;
1694 cpuctx->task_ctx = NULL; 1760 if (ctx->task) {
1761 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
1762 cpuctx->task_ctx = NULL;
1763 }
1695 } 1764 }
1696 raw_spin_unlock(&ctx->lock);
1697
1698 return 0;
1699} 1765}
1700 1766
1701/* 1767/*
1702 * Remove the event from a task's (or a CPU's) list of events. 1768 * Remove the event from a task's (or a CPU's) list of events.
1703 * 1769 *
1704 * CPU events are removed with a smp call. For task events we only
1705 * call when the task is on a CPU.
1706 *
1707 * If event->ctx is a cloned context, callers must make sure that 1770 * If event->ctx is a cloned context, callers must make sure that
1708 * every task struct that event->ctx->task could possibly point to 1771 * every task struct that event->ctx->task could possibly point to
1709 * remains valid. This is OK when called from perf_release since 1772 * remains valid. This is OK when called from perf_release since
@@ -1711,73 +1774,32 @@ static int __perf_remove_from_context(void *info)
1711 * When called from perf_event_exit_task, it's OK because the 1774 * When called from perf_event_exit_task, it's OK because the
1712 * context has been detached from its task. 1775 * context has been detached from its task.
1713 */ 1776 */
1714static void perf_remove_from_context(struct perf_event *event, bool detach_group) 1777static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
1715{ 1778{
1716 struct perf_event_context *ctx = event->ctx; 1779 lockdep_assert_held(&event->ctx->mutex);
1717 struct remove_event re = {
1718 .event = event,
1719 .detach_group = detach_group,
1720 };
1721 1780
1722 lockdep_assert_held(&ctx->mutex); 1781 event_function_call(event, __perf_remove_from_context, (void *)flags);
1723
1724 event_function_call(event, __perf_remove_from_context,
1725 ___perf_remove_from_context, &re);
1726} 1782}
1727 1783
1728/* 1784/*
1729 * Cross CPU call to disable a performance event 1785 * Cross CPU call to disable a performance event
1730 */ 1786 */
1731int __perf_event_disable(void *info) 1787static void __perf_event_disable(struct perf_event *event,
1732{ 1788 struct perf_cpu_context *cpuctx,
1733 struct perf_event *event = info; 1789 struct perf_event_context *ctx,
1734 struct perf_event_context *ctx = event->ctx; 1790 void *info)
1735 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1736
1737 /*
1738 * If this is a per-task event, need to check whether this
1739 * event's task is the current task on this cpu.
1740 *
1741 * Can trigger due to concurrent perf_event_context_sched_out()
1742 * flipping contexts around.
1743 */
1744 if (ctx->task && cpuctx->task_ctx != ctx)
1745 return -EINVAL;
1746
1747 raw_spin_lock(&ctx->lock);
1748
1749 /*
1750 * If the event is on, turn it off.
1751 * If it is in error state, leave it in error state.
1752 */
1753 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
1754 update_context_time(ctx);
1755 update_cgrp_time_from_event(event);
1756 update_group_times(event);
1757 if (event == event->group_leader)
1758 group_sched_out(event, cpuctx, ctx);
1759 else
1760 event_sched_out(event, cpuctx, ctx);
1761 event->state = PERF_EVENT_STATE_OFF;
1762 }
1763
1764 raw_spin_unlock(&ctx->lock);
1765
1766 return 0;
1767}
1768
1769void ___perf_event_disable(void *info)
1770{ 1791{
1771 struct perf_event *event = info; 1792 if (event->state < PERF_EVENT_STATE_INACTIVE)
1793 return;
1772 1794
1773 /* 1795 update_context_time(ctx);
1774 * Since we have the lock this context can't be scheduled 1796 update_cgrp_time_from_event(event);
1775 * in, so we can change the state safely. 1797 update_group_times(event);
1776 */ 1798 if (event == event->group_leader)
1777 if (event->state == PERF_EVENT_STATE_INACTIVE) { 1799 group_sched_out(event, cpuctx, ctx);
1778 update_group_times(event); 1800 else
1779 event->state = PERF_EVENT_STATE_OFF; 1801 event_sched_out(event, cpuctx, ctx);
1780 } 1802 event->state = PERF_EVENT_STATE_OFF;
1781} 1803}
1782 1804
1783/* 1805/*
@@ -1788,7 +1810,8 @@ void ___perf_event_disable(void *info)
1788 * remains valid. This condition is satisifed when called through 1810 * remains valid. This condition is satisifed when called through
1789 * perf_event_for_each_child or perf_event_for_each because they 1811 * perf_event_for_each_child or perf_event_for_each because they
1790 * hold the top-level event's child_mutex, so any descendant that 1812 * hold the top-level event's child_mutex, so any descendant that
1791 * goes to exit will block in sync_child_event. 1813 * goes to exit will block in perf_event_exit_event().
1814 *
1792 * When called from perf_pending_event it's OK because event->ctx 1815 * When called from perf_pending_event it's OK because event->ctx
1793 * is the current context on this CPU and preemption is disabled, 1816 * is the current context on this CPU and preemption is disabled,
1794 * hence we can't get into perf_event_task_sched_out for this context. 1817 * hence we can't get into perf_event_task_sched_out for this context.
@@ -1804,8 +1827,12 @@ static void _perf_event_disable(struct perf_event *event)
1804 } 1827 }
1805 raw_spin_unlock_irq(&ctx->lock); 1828 raw_spin_unlock_irq(&ctx->lock);
1806 1829
1807 event_function_call(event, __perf_event_disable, 1830 event_function_call(event, __perf_event_disable, NULL);
1808 ___perf_event_disable, event); 1831}
1832
1833void perf_event_disable_local(struct perf_event *event)
1834{
1835 event_function_local(event, __perf_event_disable, NULL);
1809} 1836}
1810 1837
1811/* 1838/*
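
perf_remove_from_context() now takes a flags word rather than a bool, and event disabling gained an explicitly local variant. A hedged illustration of the new call shapes, not part of the patch; both removal calls appear verbatim later in this diff (in perf_event_release_kernel()), while the wrapper function is made up:

static void example_usage(struct perf_event *event, struct perf_event *child)
{
	/* Both calls require the event's ctx->mutex, per the lockdep assertion above. */

	/* Full teardown: detach from the group and mark the event EXIT. */
	perf_remove_from_context(event, DETACH_GROUP | DETACH_STATE);

	/* Child removal during release: detach from the group only. */
	perf_remove_from_context(child, DETACH_GROUP);
}

perf_event_disable_local(event) covers callers that already run on the right CPU with IRQs disabled, going through event_function_local() instead of an IPI.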
@@ -1918,9 +1945,6 @@ event_sched_in(struct perf_event *event,
1918 if (event->attr.exclusive) 1945 if (event->attr.exclusive)
1919 cpuctx->exclusive = 1; 1946 cpuctx->exclusive = 1;
1920 1947
1921 if (is_orphaned_child(event))
1922 schedule_orphans_remove(ctx);
1923
1924out: 1948out:
1925 perf_pmu_enable(event->pmu); 1949 perf_pmu_enable(event->pmu);
1926 1950
@@ -2039,7 +2063,8 @@ static void add_event_to_ctx(struct perf_event *event,
2039 event->tstamp_stopped = tstamp; 2063 event->tstamp_stopped = tstamp;
2040} 2064}
2041 2065
2042static void task_ctx_sched_out(struct perf_event_context *ctx); 2066static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2067 struct perf_event_context *ctx);
2043static void 2068static void
2044ctx_sched_in(struct perf_event_context *ctx, 2069ctx_sched_in(struct perf_event_context *ctx,
2045 struct perf_cpu_context *cpuctx, 2070 struct perf_cpu_context *cpuctx,
@@ -2058,16 +2083,15 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2058 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); 2083 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2059} 2084}
2060 2085
2061static void ___perf_install_in_context(void *info) 2086static void ctx_resched(struct perf_cpu_context *cpuctx,
2087 struct perf_event_context *task_ctx)
2062{ 2088{
2063 struct perf_event *event = info; 2089 perf_pmu_disable(cpuctx->ctx.pmu);
2064 struct perf_event_context *ctx = event->ctx; 2090 if (task_ctx)
2065 2091 task_ctx_sched_out(cpuctx, task_ctx);
2066 /* 2092 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
2067 * Since the task isn't running, its safe to add the event, us holding 2093 perf_event_sched_in(cpuctx, task_ctx, current);
2068 * the ctx->lock ensures the task won't get scheduled in. 2094 perf_pmu_enable(cpuctx->ctx.pmu);
2069 */
2070 add_event_to_ctx(event, ctx);
2071} 2095}
2072 2096
2073/* 2097/*
@@ -2077,55 +2101,31 @@ static void ___perf_install_in_context(void *info)
2077 */ 2101 */
2078static int __perf_install_in_context(void *info) 2102static int __perf_install_in_context(void *info)
2079{ 2103{
2080 struct perf_event *event = info; 2104 struct perf_event_context *ctx = info;
2081 struct perf_event_context *ctx = event->ctx;
2082 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 2105 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2083 struct perf_event_context *task_ctx = cpuctx->task_ctx; 2106 struct perf_event_context *task_ctx = cpuctx->task_ctx;
2084 struct task_struct *task = current;
2085
2086 perf_ctx_lock(cpuctx, task_ctx);
2087 perf_pmu_disable(cpuctx->ctx.pmu);
2088
2089 /*
2090 * If there was an active task_ctx schedule it out.
2091 */
2092 if (task_ctx)
2093 task_ctx_sched_out(task_ctx);
2094 2107
2095 /* 2108 raw_spin_lock(&cpuctx->ctx.lock);
2096 * If the context we're installing events in is not the 2109 if (ctx->task) {
2097 * active task_ctx, flip them.
2098 */
2099 if (ctx->task && task_ctx != ctx) {
2100 if (task_ctx)
2101 raw_spin_unlock(&task_ctx->lock);
2102 raw_spin_lock(&ctx->lock); 2110 raw_spin_lock(&ctx->lock);
2111 /*
2112 * If we hit the 'wrong' task, we've since scheduled and
2113 * everything should be sorted, nothing to do!
2114 */
2103 task_ctx = ctx; 2115 task_ctx = ctx;
2104 } 2116 if (ctx->task != current)
2117 goto unlock;
2105 2118
2106 if (task_ctx) { 2119 /*
2107 cpuctx->task_ctx = task_ctx; 2120 * If task_ctx is set, it had better be to us.
2108 task = task_ctx->task; 2121 */
2122 WARN_ON_ONCE(cpuctx->task_ctx != ctx && cpuctx->task_ctx);
2123 } else if (task_ctx) {
2124 raw_spin_lock(&task_ctx->lock);
2109 } 2125 }
2110 2126
2111 cpu_ctx_sched_out(cpuctx, EVENT_ALL); 2127 ctx_resched(cpuctx, task_ctx);
2112 2128unlock:
2113 update_context_time(ctx);
2114 /*
2115 * update cgrp time only if current cgrp
2116 * matches event->cgrp. Must be done before
2117 * calling add_event_to_ctx()
2118 */
2119 update_cgrp_time_from_event(event);
2120
2121 add_event_to_ctx(event, ctx);
2122
2123 /*
2124 * Schedule everything back in
2125 */
2126 perf_event_sched_in(cpuctx, task_ctx, task);
2127
2128 perf_pmu_enable(cpuctx->ctx.pmu);
2129 perf_ctx_unlock(cpuctx, task_ctx); 2129 perf_ctx_unlock(cpuctx, task_ctx);
2130 2130
2131 return 0; 2131 return 0;
@@ -2133,27 +2133,54 @@ static int __perf_install_in_context(void *info)
2133 2133
2134/* 2134/*
2135 * Attach a performance event to a context 2135 * Attach a performance event to a context
2136 *
2137 * First we add the event to the list with the hardware enable bit
2138 * in event->hw_config cleared.
2139 *
2140 * If the event is attached to a task which is on a CPU we use a smp
2141 * call to enable it in the task context. The task might have been
2142 * scheduled away, but we check this in the smp call again.
2143 */ 2136 */
2144static void 2137static void
2145perf_install_in_context(struct perf_event_context *ctx, 2138perf_install_in_context(struct perf_event_context *ctx,
2146 struct perf_event *event, 2139 struct perf_event *event,
2147 int cpu) 2140 int cpu)
2148{ 2141{
2142 struct task_struct *task = NULL;
2143
2149 lockdep_assert_held(&ctx->mutex); 2144 lockdep_assert_held(&ctx->mutex);
2150 2145
2151 event->ctx = ctx; 2146 event->ctx = ctx;
2152 if (event->cpu != -1) 2147 if (event->cpu != -1)
2153 event->cpu = cpu; 2148 event->cpu = cpu;
2154 2149
2155 event_function_call(event, __perf_install_in_context, 2150 /*
2156 ___perf_install_in_context, event); 2151 * Installing events is tricky because we cannot rely on ctx->is_active
2152 * to be set in case this is the nr_events 0 -> 1 transition.
2153 *
2154 * So what we do is we add the event to the list here, which will allow
2155 * a future context switch to DTRT and then send a racy IPI. If the IPI
2156 * fails to hit the right task, this means a context switch must have
2157 * happened and that will have taken care of business.
2158 */
2159 raw_spin_lock_irq(&ctx->lock);
2160 task = ctx->task;
2161 /*
2162 * Worse, we cannot even rely on the ctx actually existing anymore. If
2163 * between find_get_context() and perf_install_in_context() the task
2164 * went through perf_event_exit_task() its dead and we should not be
2165 * adding new events.
2166 */
2167 if (task == TASK_TOMBSTONE) {
2168 raw_spin_unlock_irq(&ctx->lock);
2169 return;
2170 }
2171 update_context_time(ctx);
2172 /*
2173 * Update cgrp time only if current cgrp matches event->cgrp.
2174 * Must be done before calling add_event_to_ctx().
2175 */
2176 update_cgrp_time_from_event(event);
2177 add_event_to_ctx(event, ctx);
2178 raw_spin_unlock_irq(&ctx->lock);
2179
2180 if (task)
2181 task_function_call(task, __perf_install_in_context, ctx);
2182 else
2183 cpu_function_call(cpu, __perf_install_in_context, ctx);
2157} 2184}
2158 2185
2159/* 2186/*
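
The install path above publishes the event on the context's lists under ctx->lock first and only then fires a deliberately racy IPI, so a context switch that wins the race schedules the event in by itself. A minimal sketch of the caller contract enforced by the lockdep assertion in perf_install_in_context(); this mirrors roughly how perf_event_open() uses it, but is illustrative rather than a specific call site:

	mutex_lock(&ctx->mutex);	/* required: stabilizes ctx against concurrent exit */
	perf_install_in_context(ctx, event, event->cpu);
	mutex_unlock(&ctx->mutex);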
@@ -2180,43 +2207,30 @@ static void __perf_event_mark_enabled(struct perf_event *event)
2180/* 2207/*
2181 * Cross CPU call to enable a performance event 2208 * Cross CPU call to enable a performance event
2182 */ 2209 */
2183static int __perf_event_enable(void *info) 2210static void __perf_event_enable(struct perf_event *event,
2211 struct perf_cpu_context *cpuctx,
2212 struct perf_event_context *ctx,
2213 void *info)
2184{ 2214{
2185 struct perf_event *event = info;
2186 struct perf_event_context *ctx = event->ctx;
2187 struct perf_event *leader = event->group_leader; 2215 struct perf_event *leader = event->group_leader;
2188 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 2216 struct perf_event_context *task_ctx;
2189 int err;
2190 2217
2191 /* 2218 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2192 * There's a time window between 'ctx->is_active' check 2219 event->state <= PERF_EVENT_STATE_ERROR)
2193 * in perf_event_enable function and this place having: 2220 return;
2194 * - IRQs on
2195 * - ctx->lock unlocked
2196 *
2197 * where the task could be killed and 'ctx' deactivated
2198 * by perf_event_exit_task.
2199 */
2200 if (!ctx->is_active)
2201 return -EINVAL;
2202 2221
2203 raw_spin_lock(&ctx->lock);
2204 update_context_time(ctx); 2222 update_context_time(ctx);
2205
2206 if (event->state >= PERF_EVENT_STATE_INACTIVE)
2207 goto unlock;
2208
2209 /*
2210 * set current task's cgroup time reference point
2211 */
2212 perf_cgroup_set_timestamp(current, ctx);
2213
2214 __perf_event_mark_enabled(event); 2223 __perf_event_mark_enabled(event);
2215 2224
2225 if (!ctx->is_active)
2226 return;
2227
2216 if (!event_filter_match(event)) { 2228 if (!event_filter_match(event)) {
2217 if (is_cgroup_event(event)) 2229 if (is_cgroup_event(event)) {
2230 perf_cgroup_set_timestamp(current, ctx); // XXX ?
2218 perf_cgroup_defer_enabled(event); 2231 perf_cgroup_defer_enabled(event);
2219 goto unlock; 2232 }
2233 return;
2220 } 2234 }
2221 2235
2222 /* 2236 /*
@@ -2224,41 +2238,13 @@ static int __perf_event_enable(void *info)
2224 * then don't put it on unless the group is on. 2238 * then don't put it on unless the group is on.
2225 */ 2239 */
2226 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) 2240 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
2227 goto unlock; 2241 return;
2228
2229 if (!group_can_go_on(event, cpuctx, 1)) {
2230 err = -EEXIST;
2231 } else {
2232 if (event == leader)
2233 err = group_sched_in(event, cpuctx, ctx);
2234 else
2235 err = event_sched_in(event, cpuctx, ctx);
2236 }
2237
2238 if (err) {
2239 /*
2240 * If this event can't go on and it's part of a
2241 * group, then the whole group has to come off.
2242 */
2243 if (leader != event) {
2244 group_sched_out(leader, cpuctx, ctx);
2245 perf_mux_hrtimer_restart(cpuctx);
2246 }
2247 if (leader->attr.pinned) {
2248 update_group_times(leader);
2249 leader->state = PERF_EVENT_STATE_ERROR;
2250 }
2251 }
2252 2242
2253unlock: 2243 task_ctx = cpuctx->task_ctx;
2254 raw_spin_unlock(&ctx->lock); 2244 if (ctx->task)
2245 WARN_ON_ONCE(task_ctx != ctx);
2255 2246
2256 return 0; 2247 ctx_resched(cpuctx, task_ctx);
2257}
2258
2259void ___perf_event_enable(void *info)
2260{
2261 __perf_event_mark_enabled((struct perf_event *)info);
2262} 2248}
2263 2249
2264/* 2250/*
@@ -2275,7 +2261,8 @@ static void _perf_event_enable(struct perf_event *event)
2275 struct perf_event_context *ctx = event->ctx; 2261 struct perf_event_context *ctx = event->ctx;
2276 2262
2277 raw_spin_lock_irq(&ctx->lock); 2263 raw_spin_lock_irq(&ctx->lock);
2278 if (event->state >= PERF_EVENT_STATE_INACTIVE) { 2264 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2265 event->state < PERF_EVENT_STATE_ERROR) {
2279 raw_spin_unlock_irq(&ctx->lock); 2266 raw_spin_unlock_irq(&ctx->lock);
2280 return; 2267 return;
2281 } 2268 }
@@ -2291,8 +2278,7 @@ static void _perf_event_enable(struct perf_event *event)
2291 event->state = PERF_EVENT_STATE_OFF; 2278 event->state = PERF_EVENT_STATE_OFF;
2292 raw_spin_unlock_irq(&ctx->lock); 2279 raw_spin_unlock_irq(&ctx->lock);
2293 2280
2294 event_function_call(event, __perf_event_enable, 2281 event_function_call(event, __perf_event_enable, NULL);
2295 ___perf_event_enable, event);
2296} 2282}
2297 2283
2298/* 2284/*
@@ -2342,12 +2328,27 @@ static void ctx_sched_out(struct perf_event_context *ctx,
2342 struct perf_cpu_context *cpuctx, 2328 struct perf_cpu_context *cpuctx,
2343 enum event_type_t event_type) 2329 enum event_type_t event_type)
2344{ 2330{
2345 struct perf_event *event;
2346 int is_active = ctx->is_active; 2331 int is_active = ctx->is_active;
2332 struct perf_event *event;
2347 2333
2348 ctx->is_active &= ~event_type; 2334 lockdep_assert_held(&ctx->lock);
2349 if (likely(!ctx->nr_events)) 2335
2336 if (likely(!ctx->nr_events)) {
2337 /*
2338 * See __perf_remove_from_context().
2339 */
2340 WARN_ON_ONCE(ctx->is_active);
2341 if (ctx->task)
2342 WARN_ON_ONCE(cpuctx->task_ctx);
2350 return; 2343 return;
2344 }
2345
2346 ctx->is_active &= ~event_type;
2347 if (ctx->task) {
2348 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2349 if (!ctx->is_active)
2350 cpuctx->task_ctx = NULL;
2351 }
2351 2352
2352 update_context_time(ctx); 2353 update_context_time(ctx);
2353 update_cgrp_time_from_cpuctx(cpuctx); 2354 update_cgrp_time_from_cpuctx(cpuctx);
@@ -2518,17 +2519,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2518 raw_spin_lock(&ctx->lock); 2519 raw_spin_lock(&ctx->lock);
2519 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); 2520 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
2520 if (context_equiv(ctx, next_ctx)) { 2521 if (context_equiv(ctx, next_ctx)) {
2521 /* 2522 WRITE_ONCE(ctx->task, next);
2522 * XXX do we need a memory barrier of sorts 2523 WRITE_ONCE(next_ctx->task, task);
2523 * wrt to rcu_dereference() of perf_event_ctxp
2524 */
2525 task->perf_event_ctxp[ctxn] = next_ctx;
2526 next->perf_event_ctxp[ctxn] = ctx;
2527 ctx->task = next;
2528 next_ctx->task = task;
2529 2524
2530 swap(ctx->task_ctx_data, next_ctx->task_ctx_data); 2525 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
2531 2526
2527 /*
2528 * RCU_INIT_POINTER here is safe because we've not
2529 * modified the ctx and the above modification of
2530 * ctx->task and ctx->task_ctx_data are immaterial
2531 * since those values are always verified under
2532 * ctx->lock which we're now holding.
2533 */
2534 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
2535 RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
2536
2532 do_switch = 0; 2537 do_switch = 0;
2533 2538
2534 perf_event_sync_stat(ctx, next_ctx); 2539 perf_event_sync_stat(ctx, next_ctx);
@@ -2541,8 +2546,7 @@ unlock:
2541 2546
2542 if (do_switch) { 2547 if (do_switch) {
2543 raw_spin_lock(&ctx->lock); 2548 raw_spin_lock(&ctx->lock);
2544 ctx_sched_out(ctx, cpuctx, EVENT_ALL); 2549 task_ctx_sched_out(cpuctx, ctx);
2545 cpuctx->task_ctx = NULL;
2546 raw_spin_unlock(&ctx->lock); 2550 raw_spin_unlock(&ctx->lock);
2547 } 2551 }
2548} 2552}
@@ -2637,10 +2641,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
2637 perf_cgroup_sched_out(task, next); 2641 perf_cgroup_sched_out(task, next);
2638} 2642}
2639 2643
2640static void task_ctx_sched_out(struct perf_event_context *ctx) 2644static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2645 struct perf_event_context *ctx)
2641{ 2646{
2642 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2643
2644 if (!cpuctx->task_ctx) 2647 if (!cpuctx->task_ctx)
2645 return; 2648 return;
2646 2649
@@ -2648,7 +2651,6 @@ static void task_ctx_sched_out(struct perf_event_context *ctx)
2648 return; 2651 return;
2649 2652
2650 ctx_sched_out(ctx, cpuctx, EVENT_ALL); 2653 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2651 cpuctx->task_ctx = NULL;
2652} 2654}
2653 2655
2654/* 2656/*
@@ -2725,13 +2727,22 @@ ctx_sched_in(struct perf_event_context *ctx,
2725 enum event_type_t event_type, 2727 enum event_type_t event_type,
2726 struct task_struct *task) 2728 struct task_struct *task)
2727{ 2729{
2728 u64 now;
2729 int is_active = ctx->is_active; 2730 int is_active = ctx->is_active;
2731 u64 now;
2732
2733 lockdep_assert_held(&ctx->lock);
2730 2734
2731 ctx->is_active |= event_type;
2732 if (likely(!ctx->nr_events)) 2735 if (likely(!ctx->nr_events))
2733 return; 2736 return;
2734 2737
2738 ctx->is_active |= event_type;
2739 if (ctx->task) {
2740 if (!is_active)
2741 cpuctx->task_ctx = ctx;
2742 else
2743 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2744 }
2745
2735 now = perf_clock(); 2746 now = perf_clock();
2736 ctx->timestamp = now; 2747 ctx->timestamp = now;
2737 perf_cgroup_set_timestamp(task, ctx); 2748 perf_cgroup_set_timestamp(task, ctx);
@@ -2773,12 +2784,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2773 * cpu flexible, task flexible. 2784 * cpu flexible, task flexible.
2774 */ 2785 */
2775 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2786 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2776 2787 perf_event_sched_in(cpuctx, ctx, task);
2777 if (ctx->nr_events)
2778 cpuctx->task_ctx = ctx;
2779
2780 perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
2781
2782 perf_pmu_enable(ctx->pmu); 2788 perf_pmu_enable(ctx->pmu);
2783 perf_ctx_unlock(cpuctx, ctx); 2789 perf_ctx_unlock(cpuctx, ctx);
2784} 2790}
@@ -2800,6 +2806,16 @@ void __perf_event_task_sched_in(struct task_struct *prev,
2800 struct perf_event_context *ctx; 2806 struct perf_event_context *ctx;
2801 int ctxn; 2807 int ctxn;
2802 2808
2809 /*
2810 * If cgroup events exist on this CPU, then we need to check if we have
2811 * to switch in PMU state; cgroup event are system-wide mode only.
2812 *
2813 * Since cgroup events are CPU events, we must schedule these in before
2814 * we schedule in the task events.
2815 */
2816 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2817 perf_cgroup_sched_in(prev, task);
2818
2803 for_each_task_context_nr(ctxn) { 2819 for_each_task_context_nr(ctxn) {
2804 ctx = task->perf_event_ctxp[ctxn]; 2820 ctx = task->perf_event_ctxp[ctxn];
2805 if (likely(!ctx)) 2821 if (likely(!ctx))
@@ -2807,13 +2823,6 @@ void __perf_event_task_sched_in(struct task_struct *prev,
2807 2823
2808 perf_event_context_sched_in(ctx, task); 2824 perf_event_context_sched_in(ctx, task);
2809 } 2825 }
2810 /*
2811 * if cgroup events exist on this CPU, then we need
2812 * to check if we have to switch in PMU state.
2813 * cgroup event are system-wide mode only
2814 */
2815 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2816 perf_cgroup_sched_in(prev, task);
2817 2826
2818 if (atomic_read(&nr_switch_events)) 2827 if (atomic_read(&nr_switch_events))
2819 perf_event_switch(task, prev, true); 2828 perf_event_switch(task, prev, true);
@@ -3099,46 +3108,30 @@ static int event_enable_on_exec(struct perf_event *event,
3099static void perf_event_enable_on_exec(int ctxn) 3108static void perf_event_enable_on_exec(int ctxn)
3100{ 3109{
3101 struct perf_event_context *ctx, *clone_ctx = NULL; 3110 struct perf_event_context *ctx, *clone_ctx = NULL;
3111 struct perf_cpu_context *cpuctx;
3102 struct perf_event *event; 3112 struct perf_event *event;
3103 unsigned long flags; 3113 unsigned long flags;
3104 int enabled = 0; 3114 int enabled = 0;
3105 int ret;
3106 3115
3107 local_irq_save(flags); 3116 local_irq_save(flags);
3108 ctx = current->perf_event_ctxp[ctxn]; 3117 ctx = current->perf_event_ctxp[ctxn];
3109 if (!ctx || !ctx->nr_events) 3118 if (!ctx || !ctx->nr_events)
3110 goto out; 3119 goto out;
3111 3120
3112 /* 3121 cpuctx = __get_cpu_context(ctx);
3113 * We must ctxsw out cgroup events to avoid conflict 3122 perf_ctx_lock(cpuctx, ctx);
3114 * when invoking perf_task_event_sched_in() later on 3123 list_for_each_entry(event, &ctx->event_list, event_entry)
3115 * in this function. Otherwise we end up trying to 3124 enabled |= event_enable_on_exec(event, ctx);
3116 * ctxswin cgroup events which are already scheduled
3117 * in.
3118 */
3119 perf_cgroup_sched_out(current, NULL);
3120
3121 raw_spin_lock(&ctx->lock);
3122 task_ctx_sched_out(ctx);
3123
3124 list_for_each_entry(event, &ctx->event_list, event_entry) {
3125 ret = event_enable_on_exec(event, ctx);
3126 if (ret)
3127 enabled = 1;
3128 }
3129 3125
3130 /* 3126 /*
3131 * Unclone this context if we enabled any event. 3127 * Unclone and reschedule this context if we enabled any event.
3132 */ 3128 */
3133 if (enabled) 3129 if (enabled) {
3134 clone_ctx = unclone_ctx(ctx); 3130 clone_ctx = unclone_ctx(ctx);
3131 ctx_resched(cpuctx, ctx);
3132 }
3133 perf_ctx_unlock(cpuctx, ctx);
3135 3134
3136 raw_spin_unlock(&ctx->lock);
3137
3138 /*
3139 * Also calls ctxswin for cgroup events, if any:
3140 */
3141 perf_event_context_sched_in(ctx, ctx->task);
3142out: 3135out:
3143 local_irq_restore(flags); 3136 local_irq_restore(flags);
3144 3137
@@ -3334,7 +3327,6 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
3334 INIT_LIST_HEAD(&ctx->flexible_groups); 3327 INIT_LIST_HEAD(&ctx->flexible_groups);
3335 INIT_LIST_HEAD(&ctx->event_list); 3328 INIT_LIST_HEAD(&ctx->event_list);
3336 atomic_set(&ctx->refcount, 1); 3329 atomic_set(&ctx->refcount, 1);
3337 INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
3338} 3330}
3339 3331
3340static struct perf_event_context * 3332static struct perf_event_context *
@@ -3521,11 +3513,13 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu)
3521 3513
3522static void unaccount_event(struct perf_event *event) 3514static void unaccount_event(struct perf_event *event)
3523{ 3515{
3516 bool dec = false;
3517
3524 if (event->parent) 3518 if (event->parent)
3525 return; 3519 return;
3526 3520
3527 if (event->attach_state & PERF_ATTACH_TASK) 3521 if (event->attach_state & PERF_ATTACH_TASK)
3528 static_key_slow_dec_deferred(&perf_sched_events); 3522 dec = true;
3529 if (event->attr.mmap || event->attr.mmap_data) 3523 if (event->attr.mmap || event->attr.mmap_data)
3530 atomic_dec(&nr_mmap_events); 3524 atomic_dec(&nr_mmap_events);
3531 if (event->attr.comm) 3525 if (event->attr.comm)
@@ -3535,12 +3529,15 @@ static void unaccount_event(struct perf_event *event)
3535 if (event->attr.freq) 3529 if (event->attr.freq)
3536 atomic_dec(&nr_freq_events); 3530 atomic_dec(&nr_freq_events);
3537 if (event->attr.context_switch) { 3531 if (event->attr.context_switch) {
3538 static_key_slow_dec_deferred(&perf_sched_events); 3532 dec = true;
3539 atomic_dec(&nr_switch_events); 3533 atomic_dec(&nr_switch_events);
3540 } 3534 }
3541 if (is_cgroup_event(event)) 3535 if (is_cgroup_event(event))
3542 static_key_slow_dec_deferred(&perf_sched_events); 3536 dec = true;
3543 if (has_branch_stack(event)) 3537 if (has_branch_stack(event))
3538 dec = true;
3539
3540 if (dec)
3544 static_key_slow_dec_deferred(&perf_sched_events); 3541 static_key_slow_dec_deferred(&perf_sched_events);
3545 3542
3546 unaccount_event_cpu(event, event->cpu); 3543 unaccount_event_cpu(event, event->cpu);
@@ -3556,7 +3553,7 @@ static void unaccount_event(struct perf_event *event)
3556 * 3) two matching events on the same context. 3553 * 3) two matching events on the same context.
3557 * 3554 *
3558 * The former two cases are handled in the allocation path (perf_event_alloc(), 3555 * The former two cases are handled in the allocation path (perf_event_alloc(),
3559 * __free_event()), the latter -- before the first perf_install_in_context(). 3556 * _free_event()), the latter -- before the first perf_install_in_context().
3560 */ 3557 */
3561static int exclusive_event_init(struct perf_event *event) 3558static int exclusive_event_init(struct perf_event *event)
3562{ 3559{
@@ -3631,29 +3628,6 @@ static bool exclusive_event_installable(struct perf_event *event,
3631 return true; 3628 return true;
3632} 3629}
3633 3630
3634static void __free_event(struct perf_event *event)
3635{
3636 if (!event->parent) {
3637 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
3638 put_callchain_buffers();
3639 }
3640
3641 perf_event_free_bpf_prog(event);
3642
3643 if (event->destroy)
3644 event->destroy(event);
3645
3646 if (event->ctx)
3647 put_ctx(event->ctx);
3648
3649 if (event->pmu) {
3650 exclusive_event_destroy(event);
3651 module_put(event->pmu->module);
3652 }
3653
3654 call_rcu(&event->rcu_head, free_event_rcu);
3655}
3656
3657static void _free_event(struct perf_event *event) 3631static void _free_event(struct perf_event *event)
3658{ 3632{
3659 irq_work_sync(&event->pending); 3633 irq_work_sync(&event->pending);
@@ -3675,7 +3649,25 @@ static void _free_event(struct perf_event *event)
3675 if (is_cgroup_event(event)) 3649 if (is_cgroup_event(event))
3676 perf_detach_cgroup(event); 3650 perf_detach_cgroup(event);
3677 3651
3678 __free_event(event); 3652 if (!event->parent) {
3653 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
3654 put_callchain_buffers();
3655 }
3656
3657 perf_event_free_bpf_prog(event);
3658
3659 if (event->destroy)
3660 event->destroy(event);
3661
3662 if (event->ctx)
3663 put_ctx(event->ctx);
3664
3665 if (event->pmu) {
3666 exclusive_event_destroy(event);
3667 module_put(event->pmu->module);
3668 }
3669
3670 call_rcu(&event->rcu_head, free_event_rcu);
3679} 3671}
3680 3672
3681/* 3673/*
@@ -3702,14 +3694,13 @@ static void perf_remove_from_owner(struct perf_event *event)
3702 struct task_struct *owner; 3694 struct task_struct *owner;
3703 3695
3704 rcu_read_lock(); 3696 rcu_read_lock();
3705 owner = ACCESS_ONCE(event->owner);
3706 /* 3697 /*
3707 * Matches the smp_wmb() in perf_event_exit_task(). If we observe 3698 * Matches the smp_store_release() in perf_event_exit_task(). If we
3708 * !owner it means the list deletion is complete and we can indeed 3699 * observe !owner it means the list deletion is complete and we can
3709 * free this event, otherwise we need to serialize on 3700 * indeed free this event, otherwise we need to serialize on
3710 * owner->perf_event_mutex. 3701 * owner->perf_event_mutex.
3711 */ 3702 */
3712 smp_read_barrier_depends(); 3703 owner = lockless_dereference(event->owner);
3713 if (owner) { 3704 if (owner) {
3714 /* 3705 /*
3715 * Since delayed_put_task_struct() also drops the last 3706 * Since delayed_put_task_struct() also drops the last
@@ -3737,8 +3728,10 @@ static void perf_remove_from_owner(struct perf_event *event)
3737 * ensured they're done, and we can proceed with freeing the 3728 * ensured they're done, and we can proceed with freeing the
3738 * event. 3729 * event.
3739 */ 3730 */
3740 if (event->owner) 3731 if (event->owner) {
3741 list_del_init(&event->owner_entry); 3732 list_del_init(&event->owner_entry);
3733 smp_store_release(&event->owner, NULL);
3734 }
3742 mutex_unlock(&owner->perf_event_mutex); 3735 mutex_unlock(&owner->perf_event_mutex);
3743 put_task_struct(owner); 3736 put_task_struct(owner);
3744 } 3737 }
@@ -3746,36 +3739,98 @@ static void perf_remove_from_owner(struct perf_event *event)
3746 3739
3747static void put_event(struct perf_event *event) 3740static void put_event(struct perf_event *event)
3748{ 3741{
3749 struct perf_event_context *ctx;
3750
3751 if (!atomic_long_dec_and_test(&event->refcount)) 3742 if (!atomic_long_dec_and_test(&event->refcount))
3752 return; 3743 return;
3753 3744
3745 _free_event(event);
3746}
3747
3748/*
3749 * Kill an event dead; while event:refcount will preserve the event
3750 * object, it will not preserve its functionality. Once the last 'user'
3751 * gives up the object, we'll destroy the thing.
3752 */
3753int perf_event_release_kernel(struct perf_event *event)
3754{
3755 struct perf_event_context *ctx;
3756 struct perf_event *child, *tmp;
3757
3754 if (!is_kernel_event(event)) 3758 if (!is_kernel_event(event))
3755 perf_remove_from_owner(event); 3759 perf_remove_from_owner(event);
3756 3760
3761 ctx = perf_event_ctx_lock(event);
3762 WARN_ON_ONCE(ctx->parent_ctx);
3763 perf_remove_from_context(event, DETACH_GROUP | DETACH_STATE);
3764 perf_event_ctx_unlock(event, ctx);
3765
3757 /* 3766 /*
3758 * There are two ways this annotation is useful: 3767 * At this point we must have event->state == PERF_EVENT_STATE_EXIT,
3768 * either from the above perf_remove_from_context() or through
3769 * perf_event_exit_event().
3759 * 3770 *
3760 * 1) there is a lock recursion from perf_event_exit_task 3771 * Therefore, anybody acquiring event->child_mutex after the below
3761 * see the comment there. 3772 * loop _must_ also see this, most importantly inherit_event() which
3773 * will avoid placing more children on the list.
3762 * 3774 *
3763 * 2) there is a lock-inversion with mmap_sem through 3775 * Thus this guarantees that we will in fact observe and kill _ALL_
3764 * perf_read_group(), which takes faults while 3776 * child events.
3765 * holding ctx->mutex, however this is called after
3766 * the last filedesc died, so there is no possibility
3767 * to trigger the AB-BA case.
3768 */ 3777 */
3769 ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); 3778 WARN_ON_ONCE(event->state != PERF_EVENT_STATE_EXIT);
3770 WARN_ON_ONCE(ctx->parent_ctx);
3771 perf_remove_from_context(event, true);
3772 perf_event_ctx_unlock(event, ctx);
3773 3779
3774 _free_event(event); 3780again:
3775} 3781 mutex_lock(&event->child_mutex);
3782 list_for_each_entry(child, &event->child_list, child_list) {
3776 3783
3777int perf_event_release_kernel(struct perf_event *event) 3784 /*
3778{ 3785 * Cannot change, child events are not migrated, see the
3786 * comment with perf_event_ctx_lock_nested().
3787 */
3788 ctx = lockless_dereference(child->ctx);
3789 /*
3790 * Since child_mutex nests inside ctx::mutex, we must jump
3791 * through hoops. We start by grabbing a reference on the ctx.
3792 *
3793 * Since the event cannot get freed while we hold the
3794 * child_mutex, the context must also exist and have a !0
3795 * reference count.
3796 */
3797 get_ctx(ctx);
3798
3799 /*
3800 * Now that we have a ctx ref, we can drop child_mutex, and
3801 * acquire ctx::mutex without fear of it going away. Then we
3802 * can re-acquire child_mutex.
3803 */
3804 mutex_unlock(&event->child_mutex);
3805 mutex_lock(&ctx->mutex);
3806 mutex_lock(&event->child_mutex);
3807
3808 /*
3809 * Now that we hold ctx::mutex and child_mutex, revalidate our
3810 * state, if child is still the first entry, it didn't get freed
3811 * and we can continue doing so.
3812 */
3813 tmp = list_first_entry_or_null(&event->child_list,
3814 struct perf_event, child_list);
3815 if (tmp == child) {
3816 perf_remove_from_context(child, DETACH_GROUP);
3817 list_del(&child->child_list);
3818 free_event(child);
3819 /*
3820 * This matches the refcount bump in inherit_event();
3821 * this can't be the last reference.
3822 */
3823 put_event(event);
3824 }
3825
3826 mutex_unlock(&event->child_mutex);
3827 mutex_unlock(&ctx->mutex);
3828 put_ctx(ctx);
3829 goto again;
3830 }
3831 mutex_unlock(&event->child_mutex);
3832
3833 /* Must be the last reference */
3779 put_event(event); 3834 put_event(event);
3780 return 0; 3835 return 0;
3781} 3836}
@@ -3786,46 +3841,10 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
3786 */ 3841 */
3787static int perf_release(struct inode *inode, struct file *file) 3842static int perf_release(struct inode *inode, struct file *file)
3788{ 3843{
3789 put_event(file->private_data); 3844 perf_event_release_kernel(file->private_data);
3790 return 0; 3845 return 0;
3791} 3846}
3792 3847
3793/*
3794 * Remove all orphanes events from the context.
3795 */
3796static void orphans_remove_work(struct work_struct *work)
3797{
3798 struct perf_event_context *ctx;
3799 struct perf_event *event, *tmp;
3800
3801 ctx = container_of(work, struct perf_event_context,
3802 orphans_remove.work);
3803
3804 mutex_lock(&ctx->mutex);
3805 list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
3806 struct perf_event *parent_event = event->parent;
3807
3808 if (!is_orphaned_child(event))
3809 continue;
3810
3811 perf_remove_from_context(event, true);
3812
3813 mutex_lock(&parent_event->child_mutex);
3814 list_del_init(&event->child_list);
3815 mutex_unlock(&parent_event->child_mutex);
3816
3817 free_event(event);
3818 put_event(parent_event);
3819 }
3820
3821 raw_spin_lock_irq(&ctx->lock);
3822 ctx->orphans_remove_sched = false;
3823 raw_spin_unlock_irq(&ctx->lock);
3824 mutex_unlock(&ctx->mutex);
3825
3826 put_ctx(ctx);
3827}
3828
3829u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) 3848u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
3830{ 3849{
3831 struct perf_event *child; 3850 struct perf_event *child;
@@ -4054,7 +4073,7 @@ static void _perf_event_reset(struct perf_event *event)
4054/* 4073/*
4055 * Holding the top-level event's child_mutex means that any 4074 * Holding the top-level event's child_mutex means that any
4056 * descendant process that has inherited this event will block 4075 * descendant process that has inherited this event will block
4057 * in sync_child_event if it goes to exit, thus satisfying the 4076 * in perf_event_exit_event() if it goes to exit, thus satisfying the
4058 * task existence requirements of perf_event_enable/disable. 4077 * task existence requirements of perf_event_enable/disable.
4059 */ 4078 */
4060static void perf_event_for_each_child(struct perf_event *event, 4079static void perf_event_for_each_child(struct perf_event *event,
@@ -4086,36 +4105,14 @@ static void perf_event_for_each(struct perf_event *event,
4086 perf_event_for_each_child(sibling, func); 4105 perf_event_for_each_child(sibling, func);
4087} 4106}
4088 4107
4089struct period_event { 4108static void __perf_event_period(struct perf_event *event,
4090 struct perf_event *event; 4109 struct perf_cpu_context *cpuctx,
4091 u64 value; 4110 struct perf_event_context *ctx,
4092}; 4111 void *info)
4093
4094static void ___perf_event_period(void *info)
4095{
4096 struct period_event *pe = info;
4097 struct perf_event *event = pe->event;
4098 u64 value = pe->value;
4099
4100 if (event->attr.freq) {
4101 event->attr.sample_freq = value;
4102 } else {
4103 event->attr.sample_period = value;
4104 event->hw.sample_period = value;
4105 }
4106
4107 local64_set(&event->hw.period_left, 0);
4108}
4109
4110static int __perf_event_period(void *info)
4111{ 4112{
4112 struct period_event *pe = info; 4113 u64 value = *((u64 *)info);
4113 struct perf_event *event = pe->event;
4114 struct perf_event_context *ctx = event->ctx;
4115 u64 value = pe->value;
4116 bool active; 4114 bool active;
4117 4115
4118 raw_spin_lock(&ctx->lock);
4119 if (event->attr.freq) { 4116 if (event->attr.freq) {
4120 event->attr.sample_freq = value; 4117 event->attr.sample_freq = value;
4121 } else { 4118 } else {
@@ -4135,14 +4132,10 @@ static int __perf_event_period(void *info)
4135 event->pmu->start(event, PERF_EF_RELOAD); 4132 event->pmu->start(event, PERF_EF_RELOAD);
4136 perf_pmu_enable(ctx->pmu); 4133 perf_pmu_enable(ctx->pmu);
4137 } 4134 }
4138 raw_spin_unlock(&ctx->lock);
4139
4140 return 0;
4141} 4135}
4142 4136
4143static int perf_event_period(struct perf_event *event, u64 __user *arg) 4137static int perf_event_period(struct perf_event *event, u64 __user *arg)
4144{ 4138{
4145 struct period_event pe = { .event = event, };
4146 u64 value; 4139 u64 value;
4147 4140
4148 if (!is_sampling_event(event)) 4141 if (!is_sampling_event(event))
@@ -4157,10 +4150,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
4157 if (event->attr.freq && value > sysctl_perf_event_sample_rate) 4150 if (event->attr.freq && value > sysctl_perf_event_sample_rate)
4158 return -EINVAL; 4151 return -EINVAL;
4159 4152
4160 pe.value = value; 4153 event_function_call(event, __perf_event_period, &value);
4161
4162 event_function_call(event, __perf_event_period,
4163 ___perf_event_period, &pe);
4164 4154
4165 return 0; 4155 return 0;
4166} 4156}
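The period update above is collapsed into one callback with the signature event_function_call() dispatches to, and the raw u64 now travels through the generic info pointer instead of the old period_event wrapper plus a second helper for the inactive case. The sketch below only shows that shape: toy_event_function_call() stands in for event_function_call() (which in the kernel runs the callback on the event's CPU, under ctx->lock, when the context is active), the toy_* types are invented for the example, and the perf_cpu_context argument of the real callback is dropped.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy stand-ins; only the fields the sketch touches. */
struct toy_ctx   { bool active; };
struct toy_event {
    struct toy_ctx *ctx;
    bool freq;
    uint64_t sample_freq;
    uint64_t sample_period;
};

typedef void (*event_func_t)(struct toy_event *event, struct toy_ctx *ctx,
                             void *info);

/* One callback serves both the active and the inactive context case. */
static void toy_event_function_call(struct toy_event *event,
                                    event_func_t func, void *info)
{
    /*
     * The kernel issues a synchronous cross-CPU call when the context is
     * active on another CPU; here both paths are a direct call.
     */
    func(event, event->ctx, info);
}

static void toy_event_period(struct toy_event *event, struct toy_ctx *ctx,
                             void *info)
{
    uint64_t value = *(uint64_t *)info;

    if (event->freq)
        event->sample_freq = value;
    else
        event->sample_period = value;
    (void)ctx;
}

int main(void)
{
    struct toy_ctx ctx = { .active = false };
    struct toy_event ev = { .ctx = &ctx, .freq = false };
    uint64_t value = 4000;

    toy_event_function_call(&ev, toy_event_period, &value);
    printf("period=%llu\n", (unsigned long long)ev.sample_period);
    return 0;
}

Passing &value straight through works because the dispatch is synchronous, so the caller's stack slot outlives the callback.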
@@ -4932,7 +4922,7 @@ static void perf_pending_event(struct irq_work *entry)
4932 4922
4933 if (event->pending_disable) { 4923 if (event->pending_disable) {
4934 event->pending_disable = 0; 4924 event->pending_disable = 0;
4935 __perf_event_disable(event); 4925 perf_event_disable_local(event);
4936 } 4926 }
4937 4927
4938 if (event->pending_wakeup) { 4928 if (event->pending_wakeup) {
@@ -7753,11 +7743,13 @@ static void account_event_cpu(struct perf_event *event, int cpu)
7753 7743
7754static void account_event(struct perf_event *event) 7744static void account_event(struct perf_event *event)
7755{ 7745{
7746 bool inc = false;
7747
7756 if (event->parent) 7748 if (event->parent)
7757 return; 7749 return;
7758 7750
7759 if (event->attach_state & PERF_ATTACH_TASK) 7751 if (event->attach_state & PERF_ATTACH_TASK)
7760 static_key_slow_inc(&perf_sched_events.key); 7752 inc = true;
7761 if (event->attr.mmap || event->attr.mmap_data) 7753 if (event->attr.mmap || event->attr.mmap_data)
7762 atomic_inc(&nr_mmap_events); 7754 atomic_inc(&nr_mmap_events);
7763 if (event->attr.comm) 7755 if (event->attr.comm)
@@ -7770,11 +7762,14 @@ static void account_event(struct perf_event *event)
7770 } 7762 }
7771 if (event->attr.context_switch) { 7763 if (event->attr.context_switch) {
7772 atomic_inc(&nr_switch_events); 7764 atomic_inc(&nr_switch_events);
7773 static_key_slow_inc(&perf_sched_events.key); 7765 inc = true;
7774 } 7766 }
7775 if (has_branch_stack(event)) 7767 if (has_branch_stack(event))
7776 static_key_slow_inc(&perf_sched_events.key); 7768 inc = true;
7777 if (is_cgroup_event(event)) 7769 if (is_cgroup_event(event))
7770 inc = true;
7771
7772 if (inc)
7778 static_key_slow_inc(&perf_sched_events.key); 7773 static_key_slow_inc(&perf_sched_events.key);
7779 7774
7780 account_event_cpu(event, event->cpu); 7775 account_event_cpu(event, event->cpu);
@@ -8422,11 +8417,11 @@ SYSCALL_DEFINE5(perf_event_open,
8422 * See perf_event_ctx_lock() for comments on the details 8417 * See perf_event_ctx_lock() for comments on the details
8423 * of swizzling perf_event::ctx. 8418 * of swizzling perf_event::ctx.
8424 */ 8419 */
8425 perf_remove_from_context(group_leader, false); 8420 perf_remove_from_context(group_leader, 0);
8426 8421
8427 list_for_each_entry(sibling, &group_leader->sibling_list, 8422 list_for_each_entry(sibling, &group_leader->sibling_list,
8428 group_entry) { 8423 group_entry) {
8429 perf_remove_from_context(sibling, false); 8424 perf_remove_from_context(sibling, 0);
8430 put_ctx(gctx); 8425 put_ctx(gctx);
8431 } 8426 }
8432 8427
@@ -8479,6 +8474,8 @@ SYSCALL_DEFINE5(perf_event_open,
8479 perf_event__header_size(event); 8474 perf_event__header_size(event);
8480 perf_event__id_header_size(event); 8475 perf_event__id_header_size(event);
8481 8476
8477 event->owner = current;
8478
8482 perf_install_in_context(ctx, event, event->cpu); 8479 perf_install_in_context(ctx, event, event->cpu);
8483 perf_unpin_context(ctx); 8480 perf_unpin_context(ctx);
8484 8481
@@ -8488,8 +8485,6 @@ SYSCALL_DEFINE5(perf_event_open,
8488 8485
8489 put_online_cpus(); 8486 put_online_cpus();
8490 8487
8491 event->owner = current;
8492
8493 mutex_lock(&current->perf_event_mutex); 8488 mutex_lock(&current->perf_event_mutex);
8494 list_add_tail(&event->owner_entry, &current->perf_event_list); 8489 list_add_tail(&event->owner_entry, &current->perf_event_list);
8495 mutex_unlock(&current->perf_event_mutex); 8490 mutex_unlock(&current->perf_event_mutex);
@@ -8556,7 +8551,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
8556 } 8551 }
8557 8552
8558 /* Mark owner so we could distinguish it from user events. */ 8553 /* Mark owner so we could distinguish it from user events. */
8559 event->owner = EVENT_OWNER_KERNEL; 8554 event->owner = TASK_TOMBSTONE;
8560 8555
8561 account_event(event); 8556 account_event(event);
8562 8557
@@ -8606,7 +8601,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
8606 mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); 8601 mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
8607 list_for_each_entry_safe(event, tmp, &src_ctx->event_list, 8602 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
8608 event_entry) { 8603 event_entry) {
8609 perf_remove_from_context(event, false); 8604 perf_remove_from_context(event, 0);
8610 unaccount_event_cpu(event, src_cpu); 8605 unaccount_event_cpu(event, src_cpu);
8611 put_ctx(src_ctx); 8606 put_ctx(src_ctx);
8612 list_add(&event->migrate_entry, &events); 8607 list_add(&event->migrate_entry, &events);
@@ -8673,33 +8668,15 @@ static void sync_child_event(struct perf_event *child_event,
8673 &parent_event->child_total_time_enabled); 8668 &parent_event->child_total_time_enabled);
8674 atomic64_add(child_event->total_time_running, 8669 atomic64_add(child_event->total_time_running,
8675 &parent_event->child_total_time_running); 8670 &parent_event->child_total_time_running);
8676
8677 /*
8678 * Remove this event from the parent's list
8679 */
8680 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
8681 mutex_lock(&parent_event->child_mutex);
8682 list_del_init(&child_event->child_list);
8683 mutex_unlock(&parent_event->child_mutex);
8684
8685 /*
8686 * Make sure user/parent get notified, that we just
8687 * lost one event.
8688 */
8689 perf_event_wakeup(parent_event);
8690
8691 /*
8692 * Release the parent event, if this was the last
8693 * reference to it.
8694 */
8695 put_event(parent_event);
8696} 8671}
8697 8672
8698static void 8673static void
8699__perf_event_exit_task(struct perf_event *child_event, 8674perf_event_exit_event(struct perf_event *child_event,
8700 struct perf_event_context *child_ctx, 8675 struct perf_event_context *child_ctx,
8701 struct task_struct *child) 8676 struct task_struct *child)
8702{ 8677{
8678 struct perf_event *parent_event = child_event->parent;
8679
8703 /* 8680 /*
8704 * Do not destroy the 'original' grouping; because of the context 8681 * Do not destroy the 'original' grouping; because of the context
8705 * switch optimization the original events could've ended up in a 8682 * switch optimization the original events could've ended up in a
@@ -8712,57 +8689,86 @@ __perf_event_exit_task(struct perf_event *child_event,
8712 * Do destroy all inherited groups, we don't care about those 8689 * Do destroy all inherited groups, we don't care about those
8713 * and being thorough is better. 8690 * and being thorough is better.
8714 */ 8691 */
8715 perf_remove_from_context(child_event, !!child_event->parent); 8692 raw_spin_lock_irq(&child_ctx->lock);
8693 WARN_ON_ONCE(child_ctx->is_active);
8694
8695 if (parent_event)
8696 perf_group_detach(child_event);
8697 list_del_event(child_event, child_ctx);
8698 child_event->state = PERF_EVENT_STATE_EXIT; /* see perf_event_release_kernel() */
8699 raw_spin_unlock_irq(&child_ctx->lock);
8716 8700
8717 /* 8701 /*
8718 * It can happen that the parent exits first, and has events 8702 * Parent events are governed by their filedesc, retain them.
8719 * that are still around due to the child reference. These
8720 * events need to be zapped.
8721 */ 8703 */
8722 if (child_event->parent) { 8704 if (!parent_event) {
8723 sync_child_event(child_event, child);
8724 free_event(child_event);
8725 } else {
8726 child_event->state = PERF_EVENT_STATE_EXIT;
8727 perf_event_wakeup(child_event); 8705 perf_event_wakeup(child_event);
8706 return;
8728 } 8707 }
8708 /*
8709 * Child events can be cleaned up.
8710 */
8711
8712 sync_child_event(child_event, child);
8713
8714 /*
8715 * Remove this event from the parent's list
8716 */
8717 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
8718 mutex_lock(&parent_event->child_mutex);
8719 list_del_init(&child_event->child_list);
8720 mutex_unlock(&parent_event->child_mutex);
8721
8722 /*
8723 * Kick perf_poll() for is_event_hup().
8724 */
8725 perf_event_wakeup(parent_event);
8726 free_event(child_event);
8727 put_event(parent_event);
8729} 8728}
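perf_event_exit_event() above first detaches the event and publishes PERF_EVENT_STATE_EXIT under ctx->lock, then splits into two cases: an event without a parent is kept (its file descriptor owns it) and only gets a wakeup, while an inherited child is synced into the parent, unlinked from the parent's child_list, the parent's pollers are kicked, and the child is freed. A rough, compile-only userspace sketch of that structure follows; the toy_* types, the pthread mutexes standing in for the kernel locks, and the stub wakeup are all assumptions of the sketch, and the reference drop on the parent (put_event()) is omitted.

#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>

enum toy_state { TOY_ACTIVE, TOY_EXIT };

struct toy_event {
    enum toy_state state;
    uint64_t count;
    struct toy_event *parent;        /* NULL for a parent (fd-owned) event */
    pthread_mutex_t child_mutex;     /* parent only: protects child_list */
    struct toy_event *child_list;    /* parent only: head of children */
    struct toy_event *child_next;    /* link in the parent's child list */
};

struct toy_ctx { pthread_mutex_t lock; };

static void toy_wakeup(struct toy_event *e) { (void)e; /* kick poll() waiters */ }

/* Called for every event left in an exiting task's context. */
static void toy_event_exit_event(struct toy_event *child, struct toy_ctx *ctx)
{
    struct toy_event *parent = child->parent;

    /*
     * Detach and publish the EXIT state under the context lock, so a
     * concurrent release of the parent sees a settled child.
     */
    pthread_mutex_lock(&ctx->lock);
    child->state = TOY_EXIT;
    pthread_mutex_unlock(&ctx->lock);

    if (!parent) {
        /* Parent events are governed by their file descriptor: keep them. */
        toy_wakeup(child);
        return;
    }

    /* Child events can be reaped: fold counts into the parent ... */
    parent->count += child->count;

    /* ... unlink from the parent's child list ... */
    pthread_mutex_lock(&parent->child_mutex);
    for (struct toy_event **pp = &parent->child_list; *pp; pp = &(*pp)->child_next) {
        if (*pp == child) {
            *pp = child->child_next;
            break;
        }
    }
    pthread_mutex_unlock(&parent->child_mutex);

    /* ... notify the parent's pollers and free the child. */
    toy_wakeup(parent);
    free(child);
}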
8730 8729
8731static void perf_event_exit_task_context(struct task_struct *child, int ctxn) 8730static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
8732{ 8731{
8733 struct perf_event *child_event, *next;
8734 struct perf_event_context *child_ctx, *clone_ctx = NULL; 8732 struct perf_event_context *child_ctx, *clone_ctx = NULL;
8735 unsigned long flags; 8733 struct perf_event *child_event, *next;
8734
8735 WARN_ON_ONCE(child != current);
8736 8736
8737 if (likely(!child->perf_event_ctxp[ctxn])) 8737 child_ctx = perf_pin_task_context(child, ctxn);
8738 if (!child_ctx)
8738 return; 8739 return;
8739 8740
8740 local_irq_save(flags);
8741 /* 8741 /*
8742 * We can't reschedule here because interrupts are disabled, 8742 * In order to reduce the amount of tricky in ctx tear-down, we hold
8743 * and either child is current or it is a task that can't be 8743 * ctx::mutex over the entire thing. This serializes against almost
8744 * scheduled, so we are now safe from rescheduling changing 8744 * everything that wants to access the ctx.
8745 * our context. 8745 *
8746 * The exception is sys_perf_event_open() /
8747 * perf_event_create_kernel_count() which does find_get_context()
8748 * without ctx::mutex (it cannot because of the move_group double mutex
8749 * lock thing). See the comments in perf_install_in_context().
8746 */ 8750 */
8747 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); 8751 mutex_lock(&child_ctx->mutex);
8748 8752
8749 /* 8753 /*
8750 * Take the context lock here so that if find_get_context is 8754 * In a single ctx::lock section, de-schedule the events and detach the
8751 * reading child->perf_event_ctxp, we wait until it has 8755 * context from the task such that we cannot ever get it scheduled back
8752 * incremented the context's refcount before we do put_ctx below. 8756 * in.
8753 */ 8757 */
8754 raw_spin_lock(&child_ctx->lock); 8758 raw_spin_lock_irq(&child_ctx->lock);
8755 task_ctx_sched_out(child_ctx); 8759 task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
8756 child->perf_event_ctxp[ctxn] = NULL;
8757 8760
8758 /* 8761 /*
8759 * If this context is a clone; unclone it so it can't get 8762 * Now that the context is inactive, destroy the task <-> ctx relation
8760 * swapped to another process while we're removing all 8763 * and mark the context dead.
8761 * the events from it.
8762 */ 8764 */
8765 RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
8766 put_ctx(child_ctx); /* cannot be last */
8767 WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
8768 put_task_struct(current); /* cannot be last */
8769
8763 clone_ctx = unclone_ctx(child_ctx); 8770 clone_ctx = unclone_ctx(child_ctx);
8764 update_context_time(child_ctx); 8771 raw_spin_unlock_irq(&child_ctx->lock);
8765 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
8766 8772
8767 if (clone_ctx) 8773 if (clone_ctx)
8768 put_ctx(clone_ctx); 8774 put_ctx(clone_ctx);
@@ -8774,20 +8780,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
8774 */ 8780 */
8775 perf_event_task(child, child_ctx, 0); 8781 perf_event_task(child, child_ctx, 0);
8776 8782
8777 /*
8778 * We can recurse on the same lock type through:
8779 *
8780 * __perf_event_exit_task()
8781 * sync_child_event()
8782 * put_event()
8783 * mutex_lock(&ctx->mutex)
8784 *
8785 * But since its the parent context it won't be the same instance.
8786 */
8787 mutex_lock(&child_ctx->mutex);
8788
8789 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) 8783 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
8790 __perf_event_exit_task(child_event, child_ctx, child); 8784 perf_event_exit_event(child_event, child_ctx, child);
8791 8785
8792 mutex_unlock(&child_ctx->mutex); 8786 mutex_unlock(&child_ctx->mutex);
8793 8787
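perf_event_exit_task_context() now holds ctx::mutex across the whole teardown and, inside a single ctx->lock section, severs the task <-> ctx pointers and writes TASK_TOMBSTONE into ctx->task so the context can never be scheduled back in. The sketch below reduces that to its skeleton: the toy_* types are invented, both kernel locks become pthread mutexes, and TOY_TASK_TOMBSTONE with its -1 cast is only a placeholder mirroring the TASK_TOMBSTONE marker used in this series.

#include <pthread.h>
#include <stddef.h>

struct toy_task;

#define TOY_TASK_TOMBSTONE ((struct toy_task *)-1L)   /* placeholder poison value */

struct toy_ctx {
    pthread_mutex_t mutex;     /* serializes heavyweight users of the ctx */
    pthread_mutex_t lock;      /* protects the task linkage / scheduling state */
    struct toy_task *task;
};

struct toy_task {
    struct toy_ctx *ctx;       /* task -> ctx pointer */
};

static void toy_exit_task_context(struct toy_task *task)
{
    struct toy_ctx *ctx = task->ctx;

    if (!ctx)
        return;

    /*
     * Hold the mutex over the entire teardown so everything else that
     * wants this ctx serializes against us.
     */
    pthread_mutex_lock(&ctx->mutex);

    /*
     * Under the inner lock, sever both directions of the task <-> ctx
     * relation and mark the ctx dead; after this it cannot be scheduled
     * back in or re-attached.
     */
    pthread_mutex_lock(&ctx->lock);
    task->ctx = NULL;
    ctx->task = TOY_TASK_TOMBSTONE;
    pthread_mutex_unlock(&ctx->lock);

    /* ... de-schedule and reap the events while still holding ctx->mutex ... */

    pthread_mutex_unlock(&ctx->mutex);
}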
@@ -8812,8 +8806,7 @@ void perf_event_exit_task(struct task_struct *child)
8812 * the owner, closes a race against perf_release() where 8806 * the owner, closes a race against perf_release() where
8813 * we need to serialize on the owner->perf_event_mutex. 8807 * we need to serialize on the owner->perf_event_mutex.
8814 */ 8808 */
8815 smp_wmb(); 8809 smp_store_release(&event->owner, NULL);
8816 event->owner = NULL;
8817 } 8810 }
8818 mutex_unlock(&child->perf_event_mutex); 8811 mutex_unlock(&child->perf_event_mutex);
8819 8812
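The owner hand-off above becomes a single smp_store_release(), pairing with the lockless read of event->owner on the release side so that a reader which sees a non-NULL owner also sees everything the owner published beforehand. In userspace C11 terms that is a release store matched by an acquire (or dependency-ordered) load; the sketch below shows the pairing with invented toy_* names rather than the kernel's READ_ONCE()/lockless_dereference() helpers.

#include <stdatomic.h>
#include <stdio.h>

struct toy_task { int pid; };

static _Atomic(struct toy_task *) owner;

/* Analogue of smp_store_release(&event->owner, NULL). */
static void clear_owner(void)
{
    atomic_store_explicit(&owner, NULL, memory_order_release);
}

static void set_owner(struct toy_task *t)
{
    atomic_store_explicit(&owner, t, memory_order_release);
}

/* Analogue of the lockless owner lookup on the release side. */
static int owner_pid_or_minus_one(void)
{
    struct toy_task *t = atomic_load_explicit(&owner, memory_order_acquire);

    return t ? t->pid : -1;
}

int main(void)
{
    struct toy_task me = { .pid = 42 };

    set_owner(&me);
    printf("owner pid: %d\n", owner_pid_or_minus_one());
    clear_owner();
    printf("owner pid: %d\n", owner_pid_or_minus_one());
    return 0;
}

The acquire load here is just a userspace stand-in; as the comment in the hunk notes, the kernel reader still serializes on owner->perf_event_mutex before acting on the owner it observed.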
@@ -8896,21 +8889,20 @@ void perf_event_delayed_put(struct task_struct *task)
8896 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); 8889 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
8897} 8890}
8898 8891
8899struct perf_event *perf_event_get(unsigned int fd) 8892struct file *perf_event_get(unsigned int fd)
8900{ 8893{
8901 int err; 8894 struct file *file;
8902 struct fd f;
8903 struct perf_event *event;
8904 8895
8905 err = perf_fget_light(fd, &f); 8896 file = fget_raw(fd);
8906 if (err) 8897 if (!file)
8907 return ERR_PTR(err); 8898 return ERR_PTR(-EBADF);
8908 8899
8909 event = f.file->private_data; 8900 if (file->f_op != &perf_fops) {
8910 atomic_long_inc(&event->refcount); 8901 fput(file);
8911 fdput(f); 8902 return ERR_PTR(-EBADF);
8903 }
8912 8904
8913 return event; 8905 return file;
8914} 8906}
8915 8907
8916const struct perf_event_attr *perf_event_attrs(struct perf_event *event) 8908const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
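perf_event_get() above now returns the struct file itself, taken with fget_raw() and validated against perf_fops, rather than a bare event with a bumped refcount; callers read event = file->private_data and release everything with fput(), so the event cannot outlive the file they hold. The sketch below is a self-contained userspace mock of that ownership pattern: toy_file, toy_perf_event_get(), toy_fput() and the ERR_PTR/IS_ERR/PTR_ERR macros are all defined locally for the example (the real helper looks the file up by fd; the mock takes the file directly) and are not kernel API.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Local ERR_PTR-style helpers so the sketch can return either a valid
 * pointer or a negative errno, mirroring the kernel calling convention. */
#define ERR_PTR(err)  ((void *)(intptr_t)(err))
#define IS_ERR(ptr)   ((uintptr_t)(ptr) >= (uintptr_t)-4095)
#define PTR_ERR(ptr)  ((long)(intptr_t)(ptr))

struct toy_event { int type; };

struct toy_file {
    int refcount;
    const void *f_op;            /* identifies what kind of file this is */
    void *private_data;          /* the event lives behind the file */
};

static const int toy_perf_fops;  /* its address is the identity check */

static void toy_fput(struct toy_file *f)
{
    if (--f->refcount == 0)
        free(f);
}

/*
 * Hand back the *file* with a reference held, not the bare event: the
 * caller keeps the event alive by keeping the file, and drops both with
 * toy_fput() instead of touching event refcounts directly.
 */
static struct toy_file *toy_perf_event_get(struct toy_file *f)
{
    if (!f || f->f_op != &toy_perf_fops)
        return ERR_PTR(-EBADF);
    f->refcount++;
    return f;
}

int main(void)
{
    struct toy_event ev = { .type = 1 };
    struct toy_file *f = malloc(sizeof(*f));

    if (!f)
        return 1;

    f->refcount = 1;
    f->f_op = &toy_perf_fops;
    f->private_data = &ev;

    struct toy_file *got = toy_perf_event_get(f);
    if (IS_ERR(got)) {
        printf("error: %ld\n", PTR_ERR(got));
    } else {
        struct toy_event *e = got->private_data;

        printf("event type %d\n", e->type);
        toy_fput(got);           /* drop the reference the lookup took */
    }
    toy_fput(f);                 /* drop the original reference */
    return 0;
}

This is the same shape the BPF hunks at the end of this diff switch to: the array slots hold files, and the event pointer is always re-derived from file->private_data.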
@@ -8953,8 +8945,16 @@ inherit_event(struct perf_event *parent_event,
8953 if (IS_ERR(child_event)) 8945 if (IS_ERR(child_event))
8954 return child_event; 8946 return child_event;
8955 8947
8948 /*
8949 * is_orphaned_event() and list_add_tail(&parent_event->child_list)
8950 * must be under the same lock in order to serialize against
8951 * perf_event_release_kernel(), such that either we must observe
8952 * is_orphaned_event() or they will observe us on the child_list.
8953 */
8954 mutex_lock(&parent_event->child_mutex);
8956 if (is_orphaned_event(parent_event) || 8955 if (is_orphaned_event(parent_event) ||
8957 !atomic_long_inc_not_zero(&parent_event->refcount)) { 8956 !atomic_long_inc_not_zero(&parent_event->refcount)) {
8957 mutex_unlock(&parent_event->child_mutex);
8958 free_event(child_event); 8958 free_event(child_event);
8959 return NULL; 8959 return NULL;
8960 } 8960 }
@@ -9002,8 +9002,6 @@ inherit_event(struct perf_event *parent_event,
9002 /* 9002 /*
9003 * Link this into the parent event's child list 9003 * Link this into the parent event's child list
9004 */ 9004 */
9005 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
9006 mutex_lock(&parent_event->child_mutex);
9007 list_add_tail(&child_event->child_list, &parent_event->child_list); 9005 list_add_tail(&child_event->child_list, &parent_event->child_list);
9008 mutex_unlock(&parent_event->child_mutex); 9006 mutex_unlock(&parent_event->child_mutex);
9009 9007
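inherit_event() above takes parent_event->child_mutex before the is_orphaned_event()/refcount check and keeps it held across the list_add_tail(), so either the child observes a dead parent and bails out, or the release path observes the child on child_list and reaps it; there is no window in which both sides miss each other. A compile-only userspace sketch of that check-then-insert-under-one-lock pattern, with invented toy_* types and a C11 compare-and-swap loop standing in for atomic_long_inc_not_zero():

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct toy_event {
    atomic_long refcount;
    bool orphaned;                  /* set by the release path */
    pthread_mutex_t child_mutex;    /* protects child_list and 'orphaned' */
    struct toy_event *child_list;   /* head of this event's children */
    struct toy_event *child_next;   /* link in the parent's list */
    struct toy_event *parent;
};

/* Take a reference only if somebody else still holds one. */
static bool get_unless_zero(atomic_long *ref)
{
    long v = atomic_load(ref);

    while (v != 0) {
        if (atomic_compare_exchange_weak(ref, &v, v + 1))
            return true;
    }
    return false;
}

/*
 * The liveness check and the list insertion sit under the same lock:
 * either we see the parent as orphaned/unreferenced, or the release
 * path will see us on child_list.
 */
static bool toy_inherit(struct toy_event *parent, struct toy_event *child)
{
    pthread_mutex_lock(&parent->child_mutex);
    if (parent->orphaned || !get_unless_zero(&parent->refcount)) {
        pthread_mutex_unlock(&parent->child_mutex);
        return false;               /* caller frees the unused child */
    }

    child->parent = parent;
    child->child_next = parent->child_list;
    parent->child_list = child;
    pthread_mutex_unlock(&parent->child_mutex);
    return true;
}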
@@ -9221,13 +9219,14 @@ static void perf_event_init_cpu(int cpu)
9221#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE 9219#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
9222static void __perf_event_exit_context(void *__info) 9220static void __perf_event_exit_context(void *__info)
9223{ 9221{
9224 struct remove_event re = { .detach_group = true };
9225 struct perf_event_context *ctx = __info; 9222 struct perf_event_context *ctx = __info;
9223 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
9224 struct perf_event *event;
9226 9225
9227 rcu_read_lock(); 9226 raw_spin_lock(&ctx->lock);
9228 list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) 9227 list_for_each_entry(event, &ctx->event_list, event_entry)
9229 __perf_remove_from_context(&re); 9228 __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
9230 rcu_read_unlock(); 9229 raw_spin_unlock(&ctx->lock);
9231} 9230}
9232 9231
9233static void perf_event_exit_cpu_context(int cpu) 9232static void perf_event_exit_cpu_context(int cpu)
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 92ce5f4ccc26..3f8cb1e14588 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -444,7 +444,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
444 * current task. 444 * current task.
445 */ 445 */
446 if (irqs_disabled() && bp->ctx && bp->ctx->task == current) 446 if (irqs_disabled() && bp->ctx && bp->ctx->task == current)
447 __perf_event_disable(bp); 447 perf_event_disable_local(bp);
448 else 448 else
449 perf_event_disable(bp); 449 perf_event_disable(bp);
450 450
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index adfdc0536117..1faad2cfdb9e 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -459,6 +459,25 @@ static void rb_free_aux_page(struct ring_buffer *rb, int idx)
459 __free_page(page); 459 __free_page(page);
460} 460}
461 461
462static void __rb_free_aux(struct ring_buffer *rb)
463{
464 int pg;
465
466 if (rb->aux_priv) {
467 rb->free_aux(rb->aux_priv);
468 rb->free_aux = NULL;
469 rb->aux_priv = NULL;
470 }
471
472 if (rb->aux_nr_pages) {
473 for (pg = 0; pg < rb->aux_nr_pages; pg++)
474 rb_free_aux_page(rb, pg);
475
476 kfree(rb->aux_pages);
477 rb->aux_nr_pages = 0;
478 }
479}
480
462int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, 481int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
463 pgoff_t pgoff, int nr_pages, long watermark, int flags) 482 pgoff_t pgoff, int nr_pages, long watermark, int flags)
464{ 483{
@@ -547,30 +566,11 @@ out:
547 if (!ret) 566 if (!ret)
548 rb->aux_pgoff = pgoff; 567 rb->aux_pgoff = pgoff;
549 else 568 else
550 rb_free_aux(rb); 569 __rb_free_aux(rb);
551 570
552 return ret; 571 return ret;
553} 572}
554 573
555static void __rb_free_aux(struct ring_buffer *rb)
556{
557 int pg;
558
559 if (rb->aux_priv) {
560 rb->free_aux(rb->aux_priv);
561 rb->free_aux = NULL;
562 rb->aux_priv = NULL;
563 }
564
565 if (rb->aux_nr_pages) {
566 for (pg = 0; pg < rb->aux_nr_pages; pg++)
567 rb_free_aux_page(rb, pg);
568
569 kfree(rb->aux_pages);
570 rb->aux_nr_pages = 0;
571 }
572}
573
574void rb_free_aux(struct ring_buffer *rb) 574void rb_free_aux(struct ring_buffer *rb)
575{ 575{
576 if (atomic_dec_and_test(&rb->aux_refcount)) 576 if (atomic_dec_and_test(&rb->aux_refcount))
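In the ring-buffer hunk above, __rb_free_aux() moves ahead of rb_alloc_aux() so the allocation error path can tear a partially built AUX buffer down directly, instead of routing the failure through the refcount-driven rb_free_aux(). The generic shape, a teardown helper shared between the error path and the normal free, is sketched below with invented toy_buf names and plain malloc/free; it is not the kernel code.

#include <stdlib.h>

struct toy_buf {
    int nr_pages;
    void **pages;
    void *priv;
};

/* Teardown used both by the normal free path and, directly, by the
 * allocation error path, so a half-built buffer is undone in place. */
static void toy_buf_teardown(struct toy_buf *b)
{
    free(b->priv);                  /* free(NULL) is a no-op */
    b->priv = NULL;

    for (int i = 0; i < b->nr_pages; i++)
        free(b->pages[i]);
    free(b->pages);
    b->pages = NULL;
    b->nr_pages = 0;
}

static int toy_buf_alloc(struct toy_buf *b, int nr_pages)
{
    b->nr_pages = 0;
    b->priv = NULL;

    b->pages = calloc(nr_pages, sizeof(*b->pages));
    if (!b->pages)
        goto err;

    for (int i = 0; i < nr_pages; i++) {
        b->pages[i] = malloc(4096);
        if (!b->pages[i])
            goto err;
        b->nr_pages = i + 1;
    }

    b->priv = malloc(64);
    if (!b->priv)
        goto err;

    return 0;
err:
    /* Undo whatever was built so far right here, synchronously. */
    toy_buf_teardown(b);
    return -1;
}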
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 45dd798bcd37..326a75e884db 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -191,14 +191,17 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
191 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; 191 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
192 struct bpf_array *array = container_of(map, struct bpf_array, map); 192 struct bpf_array *array = container_of(map, struct bpf_array, map);
193 struct perf_event *event; 193 struct perf_event *event;
194 struct file *file;
194 195
195 if (unlikely(index >= array->map.max_entries)) 196 if (unlikely(index >= array->map.max_entries))
196 return -E2BIG; 197 return -E2BIG;
197 198
198 event = (struct perf_event *)array->ptrs[index]; 199 file = (struct file *)array->ptrs[index];
199 if (!event) 200 if (unlikely(!file))
200 return -ENOENT; 201 return -ENOENT;
201 202
203 event = file->private_data;
204
202 /* make sure event is local and doesn't have pmu::count */ 205 /* make sure event is local and doesn't have pmu::count */
203 if (event->oncpu != smp_processor_id() || 206 if (event->oncpu != smp_processor_id() ||
204 event->pmu->count) 207 event->pmu->count)
@@ -228,6 +231,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
228 void *data = (void *) (long) r4; 231 void *data = (void *) (long) r4;
229 struct perf_sample_data sample_data; 232 struct perf_sample_data sample_data;
230 struct perf_event *event; 233 struct perf_event *event;
234 struct file *file;
231 struct perf_raw_record raw = { 235 struct perf_raw_record raw = {
232 .size = size, 236 .size = size,
233 .data = data, 237 .data = data,
@@ -236,10 +240,12 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
236 if (unlikely(index >= array->map.max_entries)) 240 if (unlikely(index >= array->map.max_entries))
237 return -E2BIG; 241 return -E2BIG;
238 242
239 event = (struct perf_event *)array->ptrs[index]; 243 file = (struct file *)array->ptrs[index];
240 if (unlikely(!event)) 244 if (unlikely(!file))
241 return -ENOENT; 245 return -ENOENT;
242 246
247 event = file->private_data;
248
243 if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || 249 if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
244 event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) 250 event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
245 return -EINVAL; 251 return -EINVAL;