author	Peter Zijlstra <peterz@infradead.org>	2016-01-14 10:05:37 -0500
committer	Ingo Molnar <mingo@kernel.org>	2016-01-21 12:54:25 -0500
commit	63b6da39bb38e8f1a1ef3180d32a39d6baf9da84 (patch)
tree	5ded15d43220c36564a2312310e310dca62ca2c6
parent	c97f473643a9d3e618c0f0426b926bc3a3e23944 (diff)
perf: Fix perf_event_exit_task() race
There is a race against perf_event_exit_task() vs
event_function_call(), find_get_context(), perf_install_in_context()
(iow, everyone).

Since there is no permanent marker on a context indicating that it is
dead, it is quite possible that we access (and even modify) a context
after it has passed through perf_event_exit_task().

For instance, find_get_context() might find the context still
installed, but by the time we get to perf_install_in_context() it
might already have passed through perf_event_exit_task() and be
considered dead; we will, however, still add the event to it.

Solve this by marking a ctx dead by setting its ctx->task value to -1;
it must be !0 so we still know it's a (former) task context.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--	kernel/events/core.c	151
1 file changed, 85 insertions, 66 deletions
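The commit message describes the fix as storing a non-NULL sentinel in ctx->task to mark a dead context. The standalone C sketch below (illustration only, not kernel code; the struct and helper names are made up) shows why the marker must be non-zero: a NULL task still means "CPU context", a real pointer means "live task context", and the tombstone means "former task context", so existing "is this a task context?" checks keep working.

#include <stdio.h>

#define TASK_TOMBSTONE ((void *)-1L)

struct task_struct;			/* opaque for this sketch */

/* hypothetical stand-in for the relevant part of perf_event_context */
struct ctx_sketch {
	struct task_struct *task;	/* NULL, a live task, or TASK_TOMBSTONE */
};

static const char *describe(const struct ctx_sketch *ctx)
{
	if (!ctx->task)
		return "CPU context";
	if (ctx->task == (struct task_struct *)TASK_TOMBSTONE)
		return "dead (former) task context";
	return "live task context";
}

int main(void)
{
	/* never dereferenced, only compared; stands in for a real task pointer */
	struct task_struct *fake_task = (struct task_struct *)0x1000;

	struct ctx_sketch cpu  = { NULL };
	struct ctx_sketch live = { fake_task };
	struct ctx_sketch dead = { (struct task_struct *)TASK_TOMBSTONE };

	printf("%s / %s / %s\n", describe(&cpu), describe(&live), describe(&dead));
	return 0;
}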
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3eaf91b104e9..9de4d352ba8c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -148,6 +148,13 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
 	raw_spin_unlock(&cpuctx->ctx.lock);
 }
 
+#define TASK_TOMBSTONE ((void *)-1L)
+
+static bool is_kernel_event(struct perf_event *event)
+{
+	return event->owner == TASK_TOMBSTONE;
+}
+
 /*
  * On task ctx scheduling...
  *
@@ -196,31 +203,21 @@ static int event_function(void *info)
 	struct perf_event_context *ctx = event->ctx;
 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 	struct perf_event_context *task_ctx = cpuctx->task_ctx;
+	int ret = 0;
 
 	WARN_ON_ONCE(!irqs_disabled());
 
+	perf_ctx_lock(cpuctx, task_ctx);
 	/*
 	 * Since we do the IPI call without holding ctx->lock things can have
 	 * changed, double check we hit the task we set out to hit.
-	 *
-	 * If ctx->task == current, we know things must remain valid because
-	 * we have IRQs disabled so we cannot schedule.
 	 */
 	if (ctx->task) {
-		if (ctx->task != current)
-			return -EAGAIN;
-
-		WARN_ON_ONCE(task_ctx != ctx);
-	} else {
-		WARN_ON_ONCE(&cpuctx->ctx != ctx);
-	}
+		if (ctx->task != current) {
+			ret = -EAGAIN;
+			goto unlock;
+		}
 
-	perf_ctx_lock(cpuctx, task_ctx);
-	/*
-	 * Now that we hold locks, double check state. Paranoia pays.
-	 */
-	if (task_ctx) {
-		WARN_ON_ONCE(task_ctx->task != current);
 		/*
 		 * We only use event_function_call() on established contexts,
 		 * and event_function() is only ever called when active (or
@@ -233,12 +230,16 @@ static int event_function(void *info)
 		 * And since we have ctx->is_active, cpuctx->task_ctx must
 		 * match.
 		 */
-		WARN_ON_ONCE(cpuctx->task_ctx != task_ctx);
+		WARN_ON_ONCE(task_ctx != ctx);
+	} else {
+		WARN_ON_ONCE(&cpuctx->ctx != ctx);
 	}
+
 	efs->func(event, cpuctx, ctx, efs->data);
+unlock:
 	perf_ctx_unlock(cpuctx, task_ctx);
 
-	return 0;
+	return ret;
 }
 
 static void event_function_local(struct perf_event *event, event_f func, void *data)
@@ -256,7 +257,7 @@ static void event_function_local(struct perf_event *event, event_f func, void *d
 static void event_function_call(struct perf_event *event, event_f func, void *data)
 {
 	struct perf_event_context *ctx = event->ctx;
-	struct task_struct *task = ctx->task;
+	struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
 	struct event_function_struct efs = {
 		.event = event,
 		.func = func,
@@ -278,30 +279,28 @@ static void event_function_call(struct perf_event *event, event_f func, void *da
 	}
 
 again:
+	if (task == TASK_TOMBSTONE)
+		return;
+
 	if (!task_function_call(task, event_function, &efs))
 		return;
 
 	raw_spin_lock_irq(&ctx->lock);
-	if (ctx->is_active) {
-		/*
-		 * Reload the task pointer, it might have been changed by
-		 * a concurrent perf_event_context_sched_out().
-		 */
-		task = ctx->task;
-		raw_spin_unlock_irq(&ctx->lock);
-		goto again;
+	/*
+	 * Reload the task pointer, it might have been changed by
+	 * a concurrent perf_event_context_sched_out().
+	 */
+	task = ctx->task;
+	if (task != TASK_TOMBSTONE) {
+		if (ctx->is_active) {
+			raw_spin_unlock_irq(&ctx->lock);
+			goto again;
+		}
+		func(event, NULL, ctx, data);
 	}
-	func(event, NULL, ctx, data);
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
-#define EVENT_OWNER_KERNEL ((void *) -1)
-
-static bool is_kernel_event(struct perf_event *event)
-{
-	return event->owner == EVENT_OWNER_KERNEL;
-}
-
 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
		       PERF_FLAG_FD_OUTPUT  |\
		       PERF_FLAG_PID_CGROUP |\
@@ -1025,7 +1024,7 @@ static void put_ctx(struct perf_event_context *ctx)
 	if (atomic_dec_and_test(&ctx->refcount)) {
 		if (ctx->parent_ctx)
 			put_ctx(ctx->parent_ctx);
-		if (ctx->task)
+		if (ctx->task && ctx->task != TASK_TOMBSTONE)
 			put_task_struct(ctx->task);
 		call_rcu(&ctx->rcu_head, free_ctx);
 	}
@@ -1186,6 +1185,7 @@ static u64 primary_event_id(struct perf_event *event)
 
 /*
  * Get the perf_event_context for a task and lock it.
+ *
  * This has to cope with with the fact that until it is locked,
  * the context could get moved to another task.
  */
@@ -1226,10 +1226,13 @@ retry:
 			goto retry;
 		}
 
-		if (!atomic_inc_not_zero(&ctx->refcount)) {
+		if (ctx->task == TASK_TOMBSTONE ||
+		    !atomic_inc_not_zero(&ctx->refcount)) {
 			raw_spin_unlock(&ctx->lock);
 			ctx = NULL;
 		}
+
+		WARN_ON_ONCE(ctx->task != task);
 	}
 	rcu_read_unlock();
 	if (!ctx)
@@ -2140,23 +2143,27 @@ static int __perf_install_in_context(void *info)
 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 	struct perf_event_context *task_ctx = cpuctx->task_ctx;
 
+	raw_spin_lock(&cpuctx->ctx.lock);
 	if (ctx->task) {
+		raw_spin_lock(&ctx->lock);
 		/*
 		 * If we hit the 'wrong' task, we've since scheduled and
 		 * everything should be sorted, nothing to do!
 		 */
+		task_ctx = ctx;
 		if (ctx->task != current)
-			return 0;
+			goto unlock;
 
 		/*
 		 * If task_ctx is set, it had better be to us.
 		 */
 		WARN_ON_ONCE(cpuctx->task_ctx != ctx && cpuctx->task_ctx);
-		task_ctx = ctx;
+	} else if (task_ctx) {
+		raw_spin_lock(&task_ctx->lock);
 	}
 
-	perf_ctx_lock(cpuctx, task_ctx);
 	ctx_resched(cpuctx, task_ctx);
+unlock:
 	perf_ctx_unlock(cpuctx, task_ctx);
 
 	return 0;
@@ -2188,6 +2195,17 @@ perf_install_in_context(struct perf_event_context *ctx,
 	 * happened and that will have taken care of business.
 	 */
 	raw_spin_lock_irq(&ctx->lock);
+	task = ctx->task;
+	/*
+	 * Worse, we cannot even rely on the ctx actually existing anymore. If
+	 * between find_get_context() and perf_install_in_context() the task
+	 * went through perf_event_exit_task() its dead and we should not be
+	 * adding new events.
+	 */
+	if (task == TASK_TOMBSTONE) {
+		raw_spin_unlock_irq(&ctx->lock);
+		return;
+	}
 	update_context_time(ctx);
 	/*
 	 * Update cgrp time only if current cgrp matches event->cgrp.
@@ -2195,7 +2213,6 @@ perf_install_in_context(struct perf_event_context *ctx,
 	 */
 	update_cgrp_time_from_event(event);
 	add_event_to_ctx(event, ctx);
-	task = ctx->task;
 	raw_spin_unlock_irq(&ctx->lock);
 
 	if (task)
@@ -2538,17 +2555,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 		raw_spin_lock(&ctx->lock);
 		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
 		if (context_equiv(ctx, next_ctx)) {
-			/*
-			 * XXX do we need a memory barrier of sorts
-			 * wrt to rcu_dereference() of perf_event_ctxp
-			 */
-			task->perf_event_ctxp[ctxn] = next_ctx;
-			next->perf_event_ctxp[ctxn] = ctx;
-			ctx->task = next;
-			next_ctx->task = task;
+			WRITE_ONCE(ctx->task, next);
+			WRITE_ONCE(next_ctx->task, task);
 
 			swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
 
+			/*
+			 * RCU_INIT_POINTER here is safe because we've not
+			 * modified the ctx and the above modification of
+			 * ctx->task and ctx->task_ctx_data are immaterial
+			 * since those values are always verified under
+			 * ctx->lock which we're now holding.
+			 */
+			RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
+			RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
+
 			do_switch = 0;
 
 			perf_event_sync_stat(ctx, next_ctx);
@@ -8545,7 +8566,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 	}
 
 	/* Mark owner so we could distinguish it from user events. */
-	event->owner = EVENT_OWNER_KERNEL;
+	event->owner = TASK_TOMBSTONE;
 
 	account_event(event);
 
@@ -8725,28 +8746,26 @@ __perf_event_exit_task(struct perf_event *child_event,
 
 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 {
-	struct perf_event *child_event, *next;
 	struct perf_event_context *child_ctx, *clone_ctx = NULL;
+	struct perf_event *child_event, *next;
+	unsigned long flags;
 
-	if (likely(!child->perf_event_ctxp[ctxn]))
+	WARN_ON_ONCE(child != current);
+
+	child_ctx = perf_lock_task_context(child, ctxn, &flags);
+	if (!child_ctx)
 		return;
 
-	local_irq_disable();
-	WARN_ON_ONCE(child != current);
-	/*
-	 * We can't reschedule here because interrupts are disabled,
-	 * and child must be current.
-	 */
-	child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
+	task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
 
 	/*
-	 * Take the context lock here so that if find_get_context is
-	 * reading child->perf_event_ctxp, we wait until it has
-	 * incremented the context's refcount before we do put_ctx below.
+	 * Now that the context is inactive, destroy the task <-> ctx relation
+	 * and mark the context dead.
 	 */
-	raw_spin_lock(&child_ctx->lock);
-	task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
-	child->perf_event_ctxp[ctxn] = NULL;
+	RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
+	put_ctx(child_ctx); /* cannot be last */
+	WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
+	put_task_struct(current); /* cannot be last */
 
 	/*
 	 * If this context is a clone; unclone it so it can't get
@@ -8755,7 +8774,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 	 */
 	clone_ctx = unclone_ctx(child_ctx);
 	update_context_time(child_ctx);
-	raw_spin_unlock_irq(&child_ctx->lock);
+	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
 
 	if (clone_ctx)
 		put_ctx(clone_ctx);
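The __perf_install_in_context()/perf_install_in_context() hunks above both follow the same pattern: take the context lock, re-read ctx->task, and refuse to add events once the exit path has stored the tombstone. Below is a simplified userspace sketch of that pattern (a pthread mutex stands in for the kernel spinlock; struct and function names are illustrative, not the kernel API):

#include <pthread.h>
#include <stdio.h>

#define TASK_TOMBSTONE ((void *)-1L)

/* hypothetical cut-down context: just a lock, a task pointer, and a counter */
struct ctx {
	pthread_mutex_t lock;
	void *task;		/* a task pointer while alive, TASK_TOMBSTONE once dead */
	int nr_events;
};

/* exit path: mark the context dead under the lock */
static void ctx_exit(struct ctx *c)
{
	pthread_mutex_lock(&c->lock);
	c->task = TASK_TOMBSTONE;
	pthread_mutex_unlock(&c->lock);
}

/* install path: re-check under the lock and refuse to touch a dead context */
static int ctx_install(struct ctx *c)
{
	pthread_mutex_lock(&c->lock);
	if (c->task == TASK_TOMBSTONE) {
		pthread_mutex_unlock(&c->lock);
		return -1;	/* too late, the task already exited */
	}
	c->nr_events++;
	pthread_mutex_unlock(&c->lock);
	return 0;
}

int main(void)
{
	struct ctx c;

	pthread_mutex_init(&c.lock, NULL);
	c.task = (void *)&c;	/* any non-NULL value stands in for a live task */
	c.nr_events = 0;

	printf("install before exit: %d\n", ctx_install(&c));	/* prints 0 */
	ctx_exit(&c);
	printf("install after exit:  %d\n", ctx_install(&c));	/* prints -1 */

	pthread_mutex_destroy(&c.lock);
	return 0;
}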