aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPaul Mackerras <paulus@samba.org>2009-05-28 08:18:17 -0400
committerIngo Molnar <mingo@elte.hu>2009-05-28 09:03:50 -0400
commitc93f7669098eb97c5376e5396e3dfb734c17df4f (patch)
tree0c17f812277320fcefd4f441e1db7a7f862752a5
parent63299f057fbce47da895e8865cba7e9c3eb01a20 (diff)
perf_counter: Fix race in attaching counters to tasks and exiting
Commit 564c2b21 ("perf_counter: Optimize context switch between identical inherited contexts") introduced a race where it is possible that a counter being attached to a task could get attached to the wrong task, if the task is one that has inherited its context from another task via fork. This happens because the optimized context switch could switch the context to another task after find_get_context has read task->perf_counter_ctxp. In fact, it's possible that the context could then get freed, if the other task then exits. This fixes the problem by protecting both the context switch and the critical code in find_get_context with spinlocks. The context switch locks the cxt->lock of both the outgoing and incoming contexts before swapping them. That means that once code such as find_get_context has obtained the spinlock for the context associated with a task, the context can't get swapped to another task. However, the context may have been swapped in the interval between reading task->perf_counter_ctxp and getting the lock, so it is necessary to check and retry. To make sure that none of the contexts being looked at in find_get_context can get freed, this changes the context freeing code to use RCU. Thus an rcu_read_lock() is sufficient to ensure that no contexts can get freed. This part of the patch is lifted from a patch posted by Peter Zijlstra. This also adds a check to make sure that we can't add a counter to a task that is exiting. There is also a race between perf_counter_exit_task and find_get_context; this solves the race by moving the get_ctx that was in perf_counter_alloc into the locked region in find_get_context, so that once find_get_context has got the context for a task, it won't get freed even if the task calls perf_counter_exit_task. It doesn't matter if new top-level (non-inherited) counters get attached to the context after perf_counter_exit_task has detached the context from the task. They will just stay there and never get scheduled in until the counters' fds get closed, and then perf_release will remove them from the context and eventually free the context. With this, we are now doing the unclone in find_get_context rather than when a counter was added to or removed from a context (actually, we were missing the unclone_ctx() call when adding a counter to a context). We don't need to unclone when removing a counter from a context because we have no way to remove a counter from a cloned context. This also takes out the smp_wmb() in find_get_context, which Peter Zijlstra pointed out was unnecessary because the cmpxchg implies a full barrier anyway. Signed-off-by: Paul Mackerras <paulus@samba.org> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com> Cc: Mike Galbraith <efault@gmx.de> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: John Kacur <jkacur@redhat.com> LKML-Reference: <18974.33033.667187.273886@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--include/linux/perf_counter.h5
-rw-r--r--kernel/perf_counter.c205
2 files changed, 148 insertions, 62 deletions
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index a65ddc580514..717bf3b59ba4 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -541,8 +541,9 @@ struct perf_counter_context {
541 * been cloned (inherited) from a common ancestor. 541 * been cloned (inherited) from a common ancestor.
542 */ 542 */
543 struct perf_counter_context *parent_ctx; 543 struct perf_counter_context *parent_ctx;
544 u32 parent_gen; 544 u64 parent_gen;
545 u32 generation; 545 u64 generation;
546 struct rcu_head rcu_head;
546}; 547};
547 548
548/** 549/**
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 367299f91aaf..52e5a15321d8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -103,12 +103,22 @@ static void get_ctx(struct perf_counter_context *ctx)
103 atomic_inc(&ctx->refcount); 103 atomic_inc(&ctx->refcount);
104} 104}
105 105
106static void free_ctx(struct rcu_head *head)
107{
108 struct perf_counter_context *ctx;
109
110 ctx = container_of(head, struct perf_counter_context, rcu_head);
111 kfree(ctx);
112}
113
106static void put_ctx(struct perf_counter_context *ctx) 114static void put_ctx(struct perf_counter_context *ctx)
107{ 115{
108 if (atomic_dec_and_test(&ctx->refcount)) { 116 if (atomic_dec_and_test(&ctx->refcount)) {
109 if (ctx->parent_ctx) 117 if (ctx->parent_ctx)
110 put_ctx(ctx->parent_ctx); 118 put_ctx(ctx->parent_ctx);
111 kfree(ctx); 119 if (ctx->task)
120 put_task_struct(ctx->task);
121 call_rcu(&ctx->rcu_head, free_ctx);
112 } 122 }
113} 123}
114 124
@@ -212,22 +222,6 @@ group_sched_out(struct perf_counter *group_counter,
212} 222}
213 223
214/* 224/*
215 * Mark this context as not being a clone of another.
216 * Called when counters are added to or removed from this context.
217 * We also increment our generation number so that anything that
218 * was cloned from this context before this will not match anything
219 * cloned from this context after this.
220 */
221static void unclone_ctx(struct perf_counter_context *ctx)
222{
223 ++ctx->generation;
224 if (!ctx->parent_ctx)
225 return;
226 put_ctx(ctx->parent_ctx);
227 ctx->parent_ctx = NULL;
228}
229
230/*
231 * Cross CPU call to remove a performance counter 225 * Cross CPU call to remove a performance counter
232 * 226 *
233 * We disable the counter on the hardware level first. After that we 227 * We disable the counter on the hardware level first. After that we
@@ -281,13 +275,19 @@ static void __perf_counter_remove_from_context(void *info)
281 * 275 *
282 * CPU counters are removed with a smp call. For task counters we only 276 * CPU counters are removed with a smp call. For task counters we only
283 * call when the task is on a CPU. 277 * call when the task is on a CPU.
278 *
279 * If counter->ctx is a cloned context, callers must make sure that
280 * every task struct that counter->ctx->task could possibly point to
281 * remains valid. This is OK when called from perf_release since
282 * that only calls us on the top-level context, which can't be a clone.
283 * When called from perf_counter_exit_task, it's OK because the
284 * context has been detached from its task.
284 */ 285 */
285static void perf_counter_remove_from_context(struct perf_counter *counter) 286static void perf_counter_remove_from_context(struct perf_counter *counter)
286{ 287{
287 struct perf_counter_context *ctx = counter->ctx; 288 struct perf_counter_context *ctx = counter->ctx;
288 struct task_struct *task = ctx->task; 289 struct task_struct *task = ctx->task;
289 290
290 unclone_ctx(ctx);
291 if (!task) { 291 if (!task) {
292 /* 292 /*
293 * Per cpu counters are removed via an smp call and 293 * Per cpu counters are removed via an smp call and
@@ -410,6 +410,16 @@ static void __perf_counter_disable(void *info)
410 410
411/* 411/*
412 * Disable a counter. 412 * Disable a counter.
413 *
414 * If counter->ctx is a cloned context, callers must make sure that
415 * every task struct that counter->ctx->task could possibly point to
416 * remains valid. This condition is satisifed when called through
417 * perf_counter_for_each_child or perf_counter_for_each because they
418 * hold the top-level counter's child_mutex, so any descendant that
419 * goes to exit will block in sync_child_counter.
420 * When called from perf_pending_counter it's OK because counter->ctx
421 * is the current context on this CPU and preemption is disabled,
422 * hence we can't get into perf_counter_task_sched_out for this context.
413 */ 423 */
414static void perf_counter_disable(struct perf_counter *counter) 424static void perf_counter_disable(struct perf_counter *counter)
415{ 425{
@@ -794,6 +804,12 @@ static void __perf_counter_enable(void *info)
794 804
795/* 805/*
796 * Enable a counter. 806 * Enable a counter.
807 *
808 * If counter->ctx is a cloned context, callers must make sure that
809 * every task struct that counter->ctx->task could possibly point to
810 * remains valid. This condition is satisfied when called through
811 * perf_counter_for_each_child or perf_counter_for_each as described
812 * for perf_counter_disable.
797 */ 813 */
798static void perf_counter_enable(struct perf_counter *counter) 814static void perf_counter_enable(struct perf_counter *counter)
799{ 815{
@@ -923,7 +939,9 @@ void perf_counter_task_sched_out(struct task_struct *task,
923 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 939 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
924 struct perf_counter_context *ctx = task->perf_counter_ctxp; 940 struct perf_counter_context *ctx = task->perf_counter_ctxp;
925 struct perf_counter_context *next_ctx; 941 struct perf_counter_context *next_ctx;
942 struct perf_counter_context *parent;
926 struct pt_regs *regs; 943 struct pt_regs *regs;
944 int do_switch = 1;
927 945
928 regs = task_pt_regs(task); 946 regs = task_pt_regs(task);
929 perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); 947 perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
@@ -932,18 +950,39 @@ void perf_counter_task_sched_out(struct task_struct *task,
932 return; 950 return;
933 951
934 update_context_time(ctx); 952 update_context_time(ctx);
953
954 rcu_read_lock();
955 parent = rcu_dereference(ctx->parent_ctx);
935 next_ctx = next->perf_counter_ctxp; 956 next_ctx = next->perf_counter_ctxp;
936 if (next_ctx && context_equiv(ctx, next_ctx)) { 957 if (parent && next_ctx &&
937 task->perf_counter_ctxp = next_ctx; 958 rcu_dereference(next_ctx->parent_ctx) == parent) {
938 next->perf_counter_ctxp = ctx; 959 /*
939 ctx->task = next; 960 * Looks like the two contexts are clones, so we might be
940 next_ctx->task = task; 961 * able to optimize the context switch. We lock both
941 return; 962 * contexts and check that they are clones under the
963 * lock (including re-checking that neither has been
964 * uncloned in the meantime). It doesn't matter which
965 * order we take the locks because no other cpu could
966 * be trying to lock both of these tasks.
967 */
968 spin_lock(&ctx->lock);
969 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
970 if (context_equiv(ctx, next_ctx)) {
971 task->perf_counter_ctxp = next_ctx;
972 next->perf_counter_ctxp = ctx;
973 ctx->task = next;
974 next_ctx->task = task;
975 do_switch = 0;
976 }
977 spin_unlock(&next_ctx->lock);
978 spin_unlock(&ctx->lock);
942 } 979 }
980 rcu_read_unlock();
943 981
944 __perf_counter_sched_out(ctx, cpuctx); 982 if (do_switch) {
945 983 __perf_counter_sched_out(ctx, cpuctx);
946 cpuctx->task_ctx = NULL; 984 cpuctx->task_ctx = NULL;
985 }
947} 986}
948 987
949static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) 988static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
@@ -1215,18 +1254,13 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
1215 ctx->task = task; 1254 ctx->task = task;
1216} 1255}
1217 1256
1218static void put_context(struct perf_counter_context *ctx)
1219{
1220 if (ctx->task)
1221 put_task_struct(ctx->task);
1222}
1223
1224static struct perf_counter_context *find_get_context(pid_t pid, int cpu) 1257static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1225{ 1258{
1226 struct perf_cpu_context *cpuctx; 1259 struct perf_cpu_context *cpuctx;
1227 struct perf_counter_context *ctx; 1260 struct perf_counter_context *ctx;
1228 struct perf_counter_context *tctx; 1261 struct perf_counter_context *parent_ctx;
1229 struct task_struct *task; 1262 struct task_struct *task;
1263 int err;
1230 1264
1231 /* 1265 /*
1232 * If cpu is not a wildcard then this is a percpu counter: 1266 * If cpu is not a wildcard then this is a percpu counter:
@@ -1249,6 +1283,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1249 1283
1250 cpuctx = &per_cpu(perf_cpu_context, cpu); 1284 cpuctx = &per_cpu(perf_cpu_context, cpu);
1251 ctx = &cpuctx->ctx; 1285 ctx = &cpuctx->ctx;
1286 get_ctx(ctx);
1252 1287
1253 return ctx; 1288 return ctx;
1254 } 1289 }
@@ -1265,37 +1300,79 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1265 if (!task) 1300 if (!task)
1266 return ERR_PTR(-ESRCH); 1301 return ERR_PTR(-ESRCH);
1267 1302
1303 /*
1304 * Can't attach counters to a dying task.
1305 */
1306 err = -ESRCH;
1307 if (task->flags & PF_EXITING)
1308 goto errout;
1309
1268 /* Reuse ptrace permission checks for now. */ 1310 /* Reuse ptrace permission checks for now. */
1269 if (!ptrace_may_access(task, PTRACE_MODE_READ)) { 1311 err = -EACCES;
1270 put_task_struct(task); 1312 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1271 return ERR_PTR(-EACCES); 1313 goto errout;
1314
1315 retry_lock:
1316 rcu_read_lock();
1317 retry:
1318 ctx = rcu_dereference(task->perf_counter_ctxp);
1319 if (ctx) {
1320 /*
1321 * If this context is a clone of another, it might
1322 * get swapped for another underneath us by
1323 * perf_counter_task_sched_out, though the
1324 * rcu_read_lock() protects us from any context
1325 * getting freed. Lock the context and check if it
1326 * got swapped before we could get the lock, and retry
1327 * if so. If we locked the right context, then it
1328 * can't get swapped on us any more and we can
1329 * unclone it if necessary.
1330 * Once it's not a clone things will be stable.
1331 */
1332 spin_lock_irq(&ctx->lock);
1333 if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
1334 spin_unlock_irq(&ctx->lock);
1335 goto retry;
1336 }
1337 parent_ctx = ctx->parent_ctx;
1338 if (parent_ctx) {
1339 put_ctx(parent_ctx);
1340 ctx->parent_ctx = NULL; /* no longer a clone */
1341 }
1342 ++ctx->generation;
1343 /*
1344 * Get an extra reference before dropping the lock so that
1345 * this context won't get freed if the task exits.
1346 */
1347 get_ctx(ctx);
1348 spin_unlock_irq(&ctx->lock);
1272 } 1349 }
1350 rcu_read_unlock();
1273 1351
1274 ctx = task->perf_counter_ctxp;
1275 if (!ctx) { 1352 if (!ctx) {
1276 ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); 1353 ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
1277 if (!ctx) { 1354 err = -ENOMEM;
1278 put_task_struct(task); 1355 if (!ctx)
1279 return ERR_PTR(-ENOMEM); 1356 goto errout;
1280 }
1281 __perf_counter_init_context(ctx, task); 1357 __perf_counter_init_context(ctx, task);
1282 /* 1358 get_ctx(ctx);
1283 * Make sure other cpus see correct values for *ctx 1359 if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
1284 * once task->perf_counter_ctxp is visible to them.
1285 */
1286 smp_wmb();
1287 tctx = cmpxchg(&task->perf_counter_ctxp, NULL, ctx);
1288 if (tctx) {
1289 /* 1360 /*
1290 * We raced with some other task; use 1361 * We raced with some other task; use
1291 * the context they set. 1362 * the context they set.
1292 */ 1363 */
1293 kfree(ctx); 1364 kfree(ctx);
1294 ctx = tctx; 1365 goto retry_lock;
1295 } 1366 }
1367 get_task_struct(task);
1296 } 1368 }
1297 1369
1370 put_task_struct(task);
1298 return ctx; 1371 return ctx;
1372
1373 errout:
1374 put_task_struct(task);
1375 return ERR_PTR(err);
1299} 1376}
1300 1377
1301static void free_counter_rcu(struct rcu_head *head) 1378static void free_counter_rcu(struct rcu_head *head)
@@ -1303,7 +1380,6 @@ static void free_counter_rcu(struct rcu_head *head)
1303 struct perf_counter *counter; 1380 struct perf_counter *counter;
1304 1381
1305 counter = container_of(head, struct perf_counter, rcu_head); 1382 counter = container_of(head, struct perf_counter, rcu_head);
1306 put_ctx(counter->ctx);
1307 kfree(counter); 1383 kfree(counter);
1308} 1384}
1309 1385
@@ -1324,6 +1400,7 @@ static void free_counter(struct perf_counter *counter)
1324 if (counter->destroy) 1400 if (counter->destroy)
1325 counter->destroy(counter); 1401 counter->destroy(counter);
1326 1402
1403 put_ctx(counter->ctx);
1327 call_rcu(&counter->rcu_head, free_counter_rcu); 1404 call_rcu(&counter->rcu_head, free_counter_rcu);
1328} 1405}
1329 1406
@@ -1347,7 +1424,6 @@ static int perf_release(struct inode *inode, struct file *file)
1347 put_task_struct(counter->owner); 1424 put_task_struct(counter->owner);
1348 1425
1349 free_counter(counter); 1426 free_counter(counter);
1350 put_context(ctx);
1351 1427
1352 return 0; 1428 return 0;
1353} 1429}
@@ -1437,6 +1513,12 @@ static void perf_counter_for_each_sibling(struct perf_counter *counter,
1437 mutex_unlock(&ctx->mutex); 1513 mutex_unlock(&ctx->mutex);
1438} 1514}
1439 1515
1516/*
1517 * Holding the top-level counter's child_mutex means that any
1518 * descendant process that has inherited this counter will block
1519 * in sync_child_counter if it goes to exit, thus satisfying the
1520 * task existence requirements of perf_counter_enable/disable.
1521 */
1440static void perf_counter_for_each_child(struct perf_counter *counter, 1522static void perf_counter_for_each_child(struct perf_counter *counter,
1441 void (*func)(struct perf_counter *)) 1523 void (*func)(struct perf_counter *))
1442{ 1524{
@@ -3124,8 +3206,6 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
3124 counter->ctx = ctx; 3206 counter->ctx = ctx;
3125 counter->oncpu = -1; 3207 counter->oncpu = -1;
3126 3208
3127 get_ctx(ctx);
3128
3129 counter->state = PERF_COUNTER_STATE_INACTIVE; 3209 counter->state = PERF_COUNTER_STATE_INACTIVE;
3130 if (hw_event->disabled) 3210 if (hw_event->disabled)
3131 counter->state = PERF_COUNTER_STATE_OFF; 3211 counter->state = PERF_COUNTER_STATE_OFF;
@@ -3290,7 +3370,7 @@ err_free_put_context:
3290 kfree(counter); 3370 kfree(counter);
3291 3371
3292err_put_context: 3372err_put_context:
3293 put_context(ctx); 3373 put_ctx(ctx);
3294 3374
3295 goto out_fput; 3375 goto out_fput;
3296} 3376}
@@ -3322,6 +3402,7 @@ inherit_counter(struct perf_counter *parent_counter,
3322 group_leader, GFP_KERNEL); 3402 group_leader, GFP_KERNEL);
3323 if (IS_ERR(child_counter)) 3403 if (IS_ERR(child_counter))
3324 return child_counter; 3404 return child_counter;
3405 get_ctx(child_ctx);
3325 3406
3326 /* 3407 /*
3327 * Make the child state follow the state of the parent counter, 3408 * Make the child state follow the state of the parent counter,
@@ -3439,11 +3520,6 @@ __perf_counter_exit_task(struct task_struct *child,
3439 3520
3440/* 3521/*
3441 * When a child task exits, feed back counter values to parent counters. 3522 * When a child task exits, feed back counter values to parent counters.
3442 *
3443 * Note: we may be running in child context, but the PID is not hashed
3444 * anymore so new counters will not be added.
3445 * (XXX not sure that is true when we get called from flush_old_exec.
3446 * -- paulus)
3447 */ 3523 */
3448void perf_counter_exit_task(struct task_struct *child) 3524void perf_counter_exit_task(struct task_struct *child)
3449{ 3525{
@@ -3458,7 +3534,15 @@ void perf_counter_exit_task(struct task_struct *child)
3458 3534
3459 local_irq_save(flags); 3535 local_irq_save(flags);
3460 __perf_counter_task_sched_out(child_ctx); 3536 __perf_counter_task_sched_out(child_ctx);
3537
3538 /*
3539 * Take the context lock here so that if find_get_context is
3540 * reading child->perf_counter_ctxp, we wait until it has
3541 * incremented the context's refcount before we do put_ctx below.
3542 */
3543 spin_lock(&child_ctx->lock);
3461 child->perf_counter_ctxp = NULL; 3544 child->perf_counter_ctxp = NULL;
3545 spin_unlock(&child_ctx->lock);
3462 local_irq_restore(flags); 3546 local_irq_restore(flags);
3463 3547
3464 mutex_lock(&child_ctx->mutex); 3548 mutex_lock(&child_ctx->mutex);
@@ -3513,6 +3597,7 @@ int perf_counter_init_task(struct task_struct *child)
3513 3597
3514 __perf_counter_init_context(child_ctx, child); 3598 __perf_counter_init_context(child_ctx, child);
3515 child->perf_counter_ctxp = child_ctx; 3599 child->perf_counter_ctxp = child_ctx;
3600 get_task_struct(child);
3516 3601
3517 /* 3602 /*
3518 * Lock the parent list. No need to lock the child - not PID 3603 * Lock the parent list. No need to lock the child - not PID