| author | Peter Zijlstra <peterz@infradead.org> | 2016-01-14 10:05:37 -0500 |
|---|---|---|
| committer | Ingo Molnar <mingo@kernel.org> | 2016-01-21 12:54:25 -0500 |
| commit | 63b6da39bb38e8f1a1ef3180d32a39d6baf9da84 | |
| tree | 5ded15d43220c36564a2312310e310dca62ca2c6 | |
| parent | c97f473643a9d3e618c0f0426b926bc3a3e23944 | |
perf: Fix perf_event_exit_task() race
There is a race between perf_event_exit_task() and
event_function_call(), find_get_context(), perf_install_in_context()
(iow, everyone).

Since there is no permanent marker on a context that it is dead, it is
quite possible that we access (and even modify) a context after it has
passed through perf_event_exit_task().

For instance, find_get_context() might find the context still
installed, but by the time we get to perf_install_in_context() it might
already have passed through perf_event_exit_task() and be considered
dead; we will, however, still add the event to it.

Solve this by marking a ctx as dead: set its ctx->task value to -1. It
must be !0 so we still know it is a (former) task context.
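
[Editor's illustration, not part of the patch: a minimal user-space sketch of the tombstone idiom the fix relies on. A non-NULL sentinel pointer that can never alias a real task is stored in the context, and callers check for it before touching the context. The struct and function names below are hypothetical stand-ins, not the kernel code.]

```c
#include <stdio.h>

/* Sentinel: non-NULL, so a dead task context is still distinguishable
 * from a CPU context (which uses task == NULL). */
#define TASK_TOMBSTONE ((void *)-1L)

/* Hypothetical stand-in for perf_event_context. */
struct ctx {
	void *task;	/* NULL: CPU context, TASK_TOMBSTONE: dead task context */
};

/* Refuse to install anything once the owning task has exited. */
static int install_in_ctx(struct ctx *c)
{
	if (c->task == TASK_TOMBSTONE)
		return -1;	/* context is dead, bail out */
	/* ... the real code would add the event here, under ctx->lock ... */
	return 0;
}

int main(void)
{
	struct ctx live = { .task = (void *)0x1 };	/* pretend task pointer */
	struct ctx dead = { .task = TASK_TOMBSTONE };	/* task already exited */

	printf("live: %d\n", install_in_ctx(&live));	/* prints 0  */
	printf("dead: %d\n", install_in_ctx(&dead));	/* prints -1 */
	return 0;
}
```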
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--  kernel/events/core.c  151
1 file changed, 85 insertions(+), 66 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3eaf91b104e9..9de4d352ba8c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -148,6 +148,13 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
         raw_spin_unlock(&cpuctx->ctx.lock);
 }
 
+#define TASK_TOMBSTONE ((void *)-1L)
+
+static bool is_kernel_event(struct perf_event *event)
+{
+        return event->owner == TASK_TOMBSTONE;
+}
+
 /*
  * On task ctx scheduling...
  *
@@ -196,31 +203,21 @@ static int event_function(void *info)
         struct perf_event_context *ctx = event->ctx;
         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
         struct perf_event_context *task_ctx = cpuctx->task_ctx;
+        int ret = 0;
 
         WARN_ON_ONCE(!irqs_disabled());
 
+        perf_ctx_lock(cpuctx, task_ctx);
         /*
          * Since we do the IPI call without holding ctx->lock things can have
          * changed, double check we hit the task we set out to hit.
-         *
-         * If ctx->task == current, we know things must remain valid because
-         * we have IRQs disabled so we cannot schedule.
          */
         if (ctx->task) {
-                if (ctx->task != current)
-                        return -EAGAIN;
-
-                WARN_ON_ONCE(task_ctx != ctx);
-        } else {
-                WARN_ON_ONCE(&cpuctx->ctx != ctx);
-        }
+                if (ctx->task != current) {
+                        ret = -EAGAIN;
+                        goto unlock;
+                }
 
-        perf_ctx_lock(cpuctx, task_ctx);
-        /*
-         * Now that we hold locks, double check state. Paranoia pays.
-         */
-        if (task_ctx) {
-                WARN_ON_ONCE(task_ctx->task != current);
                 /*
                  * We only use event_function_call() on established contexts,
                  * and event_function() is only ever called when active (or
@@ -233,12 +230,16 @@ static int event_function(void *info)
                  * And since we have ctx->is_active, cpuctx->task_ctx must
                  * match.
                  */
-                WARN_ON_ONCE(cpuctx->task_ctx != task_ctx);
+                WARN_ON_ONCE(task_ctx != ctx);
+        } else {
+                WARN_ON_ONCE(&cpuctx->ctx != ctx);
         }
+
         efs->func(event, cpuctx, ctx, efs->data);
+unlock:
         perf_ctx_unlock(cpuctx, task_ctx);
 
-        return 0;
+        return ret;
 }
 
 static void event_function_local(struct perf_event *event, event_f func, void *data)
@@ -256,7 +257,7 @@ static void event_function_local(struct perf_event *event, event_f func, void *data)
 static void event_function_call(struct perf_event *event, event_f func, void *data)
 {
         struct perf_event_context *ctx = event->ctx;
-        struct task_struct *task = ctx->task;
+        struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
         struct event_function_struct efs = {
                 .event = event,
                 .func = func,
@@ -278,30 +279,28 @@ static void event_function_call(struct perf_event *event, event_f func, void *data)
         }
 
 again:
+        if (task == TASK_TOMBSTONE)
+                return;
+
         if (!task_function_call(task, event_function, &efs))
                 return;
 
         raw_spin_lock_irq(&ctx->lock);
-        if (ctx->is_active) {
-                /*
-                 * Reload the task pointer, it might have been changed by
-                 * a concurrent perf_event_context_sched_out().
-                 */
-                task = ctx->task;
-                raw_spin_unlock_irq(&ctx->lock);
-                goto again;
+        /*
+         * Reload the task pointer, it might have been changed by
+         * a concurrent perf_event_context_sched_out().
+         */
+        task = ctx->task;
+        if (task != TASK_TOMBSTONE) {
+                if (ctx->is_active) {
+                        raw_spin_unlock_irq(&ctx->lock);
+                        goto again;
+                }
+                func(event, NULL, ctx, data);
         }
-        func(event, NULL, ctx, data);
         raw_spin_unlock_irq(&ctx->lock);
 }
 
-#define EVENT_OWNER_KERNEL ((void *) -1)
-
-static bool is_kernel_event(struct perf_event *event)
-{
-        return event->owner == EVENT_OWNER_KERNEL;
-}
-
 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
                        PERF_FLAG_FD_OUTPUT |\
                        PERF_FLAG_PID_CGROUP |\
@@ -1025,7 +1024,7 @@ static void put_ctx(struct perf_event_context *ctx)
         if (atomic_dec_and_test(&ctx->refcount)) {
                 if (ctx->parent_ctx)
                         put_ctx(ctx->parent_ctx);
-                if (ctx->task)
+                if (ctx->task && ctx->task != TASK_TOMBSTONE)
                         put_task_struct(ctx->task);
                 call_rcu(&ctx->rcu_head, free_ctx);
         }
@@ -1186,6 +1185,7 @@ static u64 primary_event_id(struct perf_event *event)
 
 /*
  * Get the perf_event_context for a task and lock it.
+ *
  * This has to cope with with the fact that until it is locked,
  * the context could get moved to another task.
  */
@@ -1226,10 +1226,13 @@ retry:
                         goto retry;
                 }
 
-                if (!atomic_inc_not_zero(&ctx->refcount)) {
+                if (ctx->task == TASK_TOMBSTONE ||
+                    !atomic_inc_not_zero(&ctx->refcount)) {
                         raw_spin_unlock(&ctx->lock);
                         ctx = NULL;
                 }
+
+                WARN_ON_ONCE(ctx->task != task);
         }
         rcu_read_unlock();
         if (!ctx)
@@ -2140,23 +2143,27 @@ static int __perf_install_in_context(void *info)
         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
         struct perf_event_context *task_ctx = cpuctx->task_ctx;
 
+        raw_spin_lock(&cpuctx->ctx.lock);
         if (ctx->task) {
+                raw_spin_lock(&ctx->lock);
                 /*
                  * If we hit the 'wrong' task, we've since scheduled and
                  * everything should be sorted, nothing to do!
                  */
+                task_ctx = ctx;
                 if (ctx->task != current)
-                        return 0;
+                        goto unlock;
 
                 /*
                  * If task_ctx is set, it had better be to us.
                  */
                 WARN_ON_ONCE(cpuctx->task_ctx != ctx && cpuctx->task_ctx);
-                task_ctx = ctx;
+        } else if (task_ctx) {
+                raw_spin_lock(&task_ctx->lock);
         }
 
-        perf_ctx_lock(cpuctx, task_ctx);
         ctx_resched(cpuctx, task_ctx);
+unlock:
         perf_ctx_unlock(cpuctx, task_ctx);
 
         return 0;
@@ -2188,6 +2195,17 @@ perf_install_in_context(struct perf_event_context *ctx,
          * happened and that will have taken care of business.
          */
         raw_spin_lock_irq(&ctx->lock);
+        task = ctx->task;
+        /*
+         * Worse, we cannot even rely on the ctx actually existing anymore. If
+         * between find_get_context() and perf_install_in_context() the task
+         * went through perf_event_exit_task() its dead and we should not be
+         * adding new events.
+         */
+        if (task == TASK_TOMBSTONE) {
+                raw_spin_unlock_irq(&ctx->lock);
+                return;
+        }
         update_context_time(ctx);
         /*
          * Update cgrp time only if current cgrp matches event->cgrp.
@@ -2195,7 +2213,6 @@ perf_install_in_context(struct perf_event_context *ctx,
          */
         update_cgrp_time_from_event(event);
         add_event_to_ctx(event, ctx);
-        task = ctx->task;
         raw_spin_unlock_irq(&ctx->lock);
 
         if (task)
@@ -2538,17 +2555,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
                 raw_spin_lock(&ctx->lock);
                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
                 if (context_equiv(ctx, next_ctx)) {
-                        /*
-                         * XXX do we need a memory barrier of sorts
-                         * wrt to rcu_dereference() of perf_event_ctxp
-                         */
-                        task->perf_event_ctxp[ctxn] = next_ctx;
-                        next->perf_event_ctxp[ctxn] = ctx;
-                        ctx->task = next;
-                        next_ctx->task = task;
+                        WRITE_ONCE(ctx->task, next);
+                        WRITE_ONCE(next_ctx->task, task);
 
                         swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
 
+                        /*
+                         * RCU_INIT_POINTER here is safe because we've not
+                         * modified the ctx and the above modification of
+                         * ctx->task and ctx->task_ctx_data are immaterial
+                         * since those values are always verified under
+                         * ctx->lock which we're now holding.
+                         */
+                        RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
+                        RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
+
                         do_switch = 0;
 
                         perf_event_sync_stat(ctx, next_ctx);
@@ -8545,7 +8566,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
         }
 
         /* Mark owner so we could distinguish it from user events. */
-        event->owner = EVENT_OWNER_KERNEL;
+        event->owner = TASK_TOMBSTONE;
 
         account_event(event);
 
@@ -8725,28 +8746,26 @@ __perf_event_exit_task(struct perf_event *child_event,
 
 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 {
-        struct perf_event *child_event, *next;
         struct perf_event_context *child_ctx, *clone_ctx = NULL;
+        struct perf_event *child_event, *next;
+        unsigned long flags;
 
-        if (likely(!child->perf_event_ctxp[ctxn]))
+        WARN_ON_ONCE(child != current);
+
+        child_ctx = perf_lock_task_context(child, ctxn, &flags);
+        if (!child_ctx)
                 return;
 
-        local_irq_disable();
-        WARN_ON_ONCE(child != current);
-        /*
-         * We can't reschedule here because interrupts are disabled,
-         * and child must be current.
-         */
-        child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
+        task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
 
         /*
-         * Take the context lock here so that if find_get_context is
-         * reading child->perf_event_ctxp, we wait until it has
-         * incremented the context's refcount before we do put_ctx below.
+         * Now that the context is inactive, destroy the task <-> ctx relation
+         * and mark the context dead.
          */
-        raw_spin_lock(&child_ctx->lock);
-        task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
-        child->perf_event_ctxp[ctxn] = NULL;
+        RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
+        put_ctx(child_ctx); /* cannot be last */
+        WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
+        put_task_struct(current); /* cannot be last */
 
         /*
          * If this context is a clone; unclone it so it can't get
@@ -8755,7 +8774,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
          */
         clone_ctx = unclone_ctx(child_ctx);
         update_context_time(child_ctx);
-        raw_spin_unlock_irq(&child_ctx->lock);
+        raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
 
         if (clone_ctx)
                 put_ctx(clone_ctx);