aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/perf_counter.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/perf_counter.c')
-rw-r--r--kernel/perf_counter.c205
1 files changed, 145 insertions, 60 deletions
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 367299f91aaf..52e5a15321d8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -103,12 +103,22 @@ static void get_ctx(struct perf_counter_context *ctx)
103 atomic_inc(&ctx->refcount); 103 atomic_inc(&ctx->refcount);
104} 104}
105 105
106static void free_ctx(struct rcu_head *head)
107{
108 struct perf_counter_context *ctx;
109
110 ctx = container_of(head, struct perf_counter_context, rcu_head);
111 kfree(ctx);
112}
113
106static void put_ctx(struct perf_counter_context *ctx) 114static void put_ctx(struct perf_counter_context *ctx)
107{ 115{
108 if (atomic_dec_and_test(&ctx->refcount)) { 116 if (atomic_dec_and_test(&ctx->refcount)) {
109 if (ctx->parent_ctx) 117 if (ctx->parent_ctx)
110 put_ctx(ctx->parent_ctx); 118 put_ctx(ctx->parent_ctx);
111 kfree(ctx); 119 if (ctx->task)
120 put_task_struct(ctx->task);
121 call_rcu(&ctx->rcu_head, free_ctx);
112 } 122 }
113} 123}
114 124
@@ -212,22 +222,6 @@ group_sched_out(struct perf_counter *group_counter,
212} 222}
213 223
214/* 224/*
215 * Mark this context as not being a clone of another.
216 * Called when counters are added to or removed from this context.
217 * We also increment our generation number so that anything that
218 * was cloned from this context before this will not match anything
219 * cloned from this context after this.
220 */
221static void unclone_ctx(struct perf_counter_context *ctx)
222{
223 ++ctx->generation;
224 if (!ctx->parent_ctx)
225 return;
226 put_ctx(ctx->parent_ctx);
227 ctx->parent_ctx = NULL;
228}
229
230/*
231 * Cross CPU call to remove a performance counter 225 * Cross CPU call to remove a performance counter
232 * 226 *
233 * We disable the counter on the hardware level first. After that we 227 * We disable the counter on the hardware level first. After that we
@@ -281,13 +275,19 @@ static void __perf_counter_remove_from_context(void *info)
281 * 275 *
282 * CPU counters are removed with a smp call. For task counters we only 276 * CPU counters are removed with a smp call. For task counters we only
283 * call when the task is on a CPU. 277 * call when the task is on a CPU.
278 *
279 * If counter->ctx is a cloned context, callers must make sure that
280 * every task struct that counter->ctx->task could possibly point to
281 * remains valid. This is OK when called from perf_release since
282 * that only calls us on the top-level context, which can't be a clone.
283 * When called from perf_counter_exit_task, it's OK because the
284 * context has been detached from its task.
284 */ 285 */
285static void perf_counter_remove_from_context(struct perf_counter *counter) 286static void perf_counter_remove_from_context(struct perf_counter *counter)
286{ 287{
287 struct perf_counter_context *ctx = counter->ctx; 288 struct perf_counter_context *ctx = counter->ctx;
288 struct task_struct *task = ctx->task; 289 struct task_struct *task = ctx->task;
289 290
290 unclone_ctx(ctx);
291 if (!task) { 291 if (!task) {
292 /* 292 /*
293 * Per cpu counters are removed via an smp call and 293 * Per cpu counters are removed via an smp call and
@@ -410,6 +410,16 @@ static void __perf_counter_disable(void *info)
410 410
411/* 411/*
412 * Disable a counter. 412 * Disable a counter.
413 *
414 * If counter->ctx is a cloned context, callers must make sure that
415 * every task struct that counter->ctx->task could possibly point to
416 * remains valid. This condition is satisifed when called through
417 * perf_counter_for_each_child or perf_counter_for_each because they
418 * hold the top-level counter's child_mutex, so any descendant that
419 * goes to exit will block in sync_child_counter.
420 * When called from perf_pending_counter it's OK because counter->ctx
421 * is the current context on this CPU and preemption is disabled,
422 * hence we can't get into perf_counter_task_sched_out for this context.
413 */ 423 */
414static void perf_counter_disable(struct perf_counter *counter) 424static void perf_counter_disable(struct perf_counter *counter)
415{ 425{
@@ -794,6 +804,12 @@ static void __perf_counter_enable(void *info)
794 804
795/* 805/*
796 * Enable a counter. 806 * Enable a counter.
807 *
808 * If counter->ctx is a cloned context, callers must make sure that
809 * every task struct that counter->ctx->task could possibly point to
810 * remains valid. This condition is satisfied when called through
811 * perf_counter_for_each_child or perf_counter_for_each as described
812 * for perf_counter_disable.
797 */ 813 */
798static void perf_counter_enable(struct perf_counter *counter) 814static void perf_counter_enable(struct perf_counter *counter)
799{ 815{
@@ -923,7 +939,9 @@ void perf_counter_task_sched_out(struct task_struct *task,
923 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 939 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
924 struct perf_counter_context *ctx = task->perf_counter_ctxp; 940 struct perf_counter_context *ctx = task->perf_counter_ctxp;
925 struct perf_counter_context *next_ctx; 941 struct perf_counter_context *next_ctx;
942 struct perf_counter_context *parent;
926 struct pt_regs *regs; 943 struct pt_regs *regs;
944 int do_switch = 1;
927 945
928 regs = task_pt_regs(task); 946 regs = task_pt_regs(task);
929 perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); 947 perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
@@ -932,18 +950,39 @@ void perf_counter_task_sched_out(struct task_struct *task,
932 return; 950 return;
933 951
934 update_context_time(ctx); 952 update_context_time(ctx);
953
954 rcu_read_lock();
955 parent = rcu_dereference(ctx->parent_ctx);
935 next_ctx = next->perf_counter_ctxp; 956 next_ctx = next->perf_counter_ctxp;
936 if (next_ctx && context_equiv(ctx, next_ctx)) { 957 if (parent && next_ctx &&
937 task->perf_counter_ctxp = next_ctx; 958 rcu_dereference(next_ctx->parent_ctx) == parent) {
938 next->perf_counter_ctxp = ctx; 959 /*
939 ctx->task = next; 960 * Looks like the two contexts are clones, so we might be
940 next_ctx->task = task; 961 * able to optimize the context switch. We lock both
941 return; 962 * contexts and check that they are clones under the
963 * lock (including re-checking that neither has been
964 * uncloned in the meantime). It doesn't matter which
965 * order we take the locks because no other cpu could
966 * be trying to lock both of these tasks.
967 */
968 spin_lock(&ctx->lock);
969 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
970 if (context_equiv(ctx, next_ctx)) {
971 task->perf_counter_ctxp = next_ctx;
972 next->perf_counter_ctxp = ctx;
973 ctx->task = next;
974 next_ctx->task = task;
975 do_switch = 0;
976 }
977 spin_unlock(&next_ctx->lock);
978 spin_unlock(&ctx->lock);
942 } 979 }
980 rcu_read_unlock();
943 981
944 __perf_counter_sched_out(ctx, cpuctx); 982 if (do_switch) {
945 983 __perf_counter_sched_out(ctx, cpuctx);
946 cpuctx->task_ctx = NULL; 984 cpuctx->task_ctx = NULL;
985 }
947} 986}
948 987
949static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) 988static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
@@ -1215,18 +1254,13 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
1215 ctx->task = task; 1254 ctx->task = task;
1216} 1255}
1217 1256
1218static void put_context(struct perf_counter_context *ctx)
1219{
1220 if (ctx->task)
1221 put_task_struct(ctx->task);
1222}
1223
1224static struct perf_counter_context *find_get_context(pid_t pid, int cpu) 1257static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1225{ 1258{
1226 struct perf_cpu_context *cpuctx; 1259 struct perf_cpu_context *cpuctx;
1227 struct perf_counter_context *ctx; 1260 struct perf_counter_context *ctx;
1228 struct perf_counter_context *tctx; 1261 struct perf_counter_context *parent_ctx;
1229 struct task_struct *task; 1262 struct task_struct *task;
1263 int err;
1230 1264
1231 /* 1265 /*
1232 * If cpu is not a wildcard then this is a percpu counter: 1266 * If cpu is not a wildcard then this is a percpu counter:
@@ -1249,6 +1283,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1249 1283
1250 cpuctx = &per_cpu(perf_cpu_context, cpu); 1284 cpuctx = &per_cpu(perf_cpu_context, cpu);
1251 ctx = &cpuctx->ctx; 1285 ctx = &cpuctx->ctx;
1286 get_ctx(ctx);
1252 1287
1253 return ctx; 1288 return ctx;
1254 } 1289 }
@@ -1265,37 +1300,79 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1265 if (!task) 1300 if (!task)
1266 return ERR_PTR(-ESRCH); 1301 return ERR_PTR(-ESRCH);
1267 1302
1303 /*
1304 * Can't attach counters to a dying task.
1305 */
1306 err = -ESRCH;
1307 if (task->flags & PF_EXITING)
1308 goto errout;
1309
1268 /* Reuse ptrace permission checks for now. */ 1310 /* Reuse ptrace permission checks for now. */
1269 if (!ptrace_may_access(task, PTRACE_MODE_READ)) { 1311 err = -EACCES;
1270 put_task_struct(task); 1312 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1271 return ERR_PTR(-EACCES); 1313 goto errout;
1314
1315 retry_lock:
1316 rcu_read_lock();
1317 retry:
1318 ctx = rcu_dereference(task->perf_counter_ctxp);
1319 if (ctx) {
1320 /*
1321 * If this context is a clone of another, it might
1322 * get swapped for another underneath us by
1323 * perf_counter_task_sched_out, though the
1324 * rcu_read_lock() protects us from any context
1325 * getting freed. Lock the context and check if it
1326 * got swapped before we could get the lock, and retry
1327 * if so. If we locked the right context, then it
1328 * can't get swapped on us any more and we can
1329 * unclone it if necessary.
1330 * Once it's not a clone things will be stable.
1331 */
1332 spin_lock_irq(&ctx->lock);
1333 if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
1334 spin_unlock_irq(&ctx->lock);
1335 goto retry;
1336 }
1337 parent_ctx = ctx->parent_ctx;
1338 if (parent_ctx) {
1339 put_ctx(parent_ctx);
1340 ctx->parent_ctx = NULL; /* no longer a clone */
1341 }
1342 ++ctx->generation;
1343 /*
1344 * Get an extra reference before dropping the lock so that
1345 * this context won't get freed if the task exits.
1346 */
1347 get_ctx(ctx);
1348 spin_unlock_irq(&ctx->lock);
1272 } 1349 }
1350 rcu_read_unlock();
1273 1351
1274 ctx = task->perf_counter_ctxp;
1275 if (!ctx) { 1352 if (!ctx) {
1276 ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); 1353 ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
1277 if (!ctx) { 1354 err = -ENOMEM;
1278 put_task_struct(task); 1355 if (!ctx)
1279 return ERR_PTR(-ENOMEM); 1356 goto errout;
1280 }
1281 __perf_counter_init_context(ctx, task); 1357 __perf_counter_init_context(ctx, task);
1282 /* 1358 get_ctx(ctx);
1283 * Make sure other cpus see correct values for *ctx 1359 if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
1284 * once task->perf_counter_ctxp is visible to them.
1285 */
1286 smp_wmb();
1287 tctx = cmpxchg(&task->perf_counter_ctxp, NULL, ctx);
1288 if (tctx) {
1289 /* 1360 /*
1290 * We raced with some other task; use 1361 * We raced with some other task; use
1291 * the context they set. 1362 * the context they set.
1292 */ 1363 */
1293 kfree(ctx); 1364 kfree(ctx);
1294 ctx = tctx; 1365 goto retry_lock;
1295 } 1366 }
1367 get_task_struct(task);
1296 } 1368 }
1297 1369
1370 put_task_struct(task);
1298 return ctx; 1371 return ctx;
1372
1373 errout:
1374 put_task_struct(task);
1375 return ERR_PTR(err);
1299} 1376}
1300 1377
1301static void free_counter_rcu(struct rcu_head *head) 1378static void free_counter_rcu(struct rcu_head *head)
@@ -1303,7 +1380,6 @@ static void free_counter_rcu(struct rcu_head *head)
1303 struct perf_counter *counter; 1380 struct perf_counter *counter;
1304 1381
1305 counter = container_of(head, struct perf_counter, rcu_head); 1382 counter = container_of(head, struct perf_counter, rcu_head);
1306 put_ctx(counter->ctx);
1307 kfree(counter); 1383 kfree(counter);
1308} 1384}
1309 1385
@@ -1324,6 +1400,7 @@ static void free_counter(struct perf_counter *counter)
1324 if (counter->destroy) 1400 if (counter->destroy)
1325 counter->destroy(counter); 1401 counter->destroy(counter);
1326 1402
1403 put_ctx(counter->ctx);
1327 call_rcu(&counter->rcu_head, free_counter_rcu); 1404 call_rcu(&counter->rcu_head, free_counter_rcu);
1328} 1405}
1329 1406
@@ -1347,7 +1424,6 @@ static int perf_release(struct inode *inode, struct file *file)
1347 put_task_struct(counter->owner); 1424 put_task_struct(counter->owner);
1348 1425
1349 free_counter(counter); 1426 free_counter(counter);
1350 put_context(ctx);
1351 1427
1352 return 0; 1428 return 0;
1353} 1429}
@@ -1437,6 +1513,12 @@ static void perf_counter_for_each_sibling(struct perf_counter *counter,
1437 mutex_unlock(&ctx->mutex); 1513 mutex_unlock(&ctx->mutex);
1438} 1514}
1439 1515
1516/*
1517 * Holding the top-level counter's child_mutex means that any
1518 * descendant process that has inherited this counter will block
1519 * in sync_child_counter if it goes to exit, thus satisfying the
1520 * task existence requirements of perf_counter_enable/disable.
1521 */
1440static void perf_counter_for_each_child(struct perf_counter *counter, 1522static void perf_counter_for_each_child(struct perf_counter *counter,
1441 void (*func)(struct perf_counter *)) 1523 void (*func)(struct perf_counter *))
1442{ 1524{
@@ -3124,8 +3206,6 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
3124 counter->ctx = ctx; 3206 counter->ctx = ctx;
3125 counter->oncpu = -1; 3207 counter->oncpu = -1;
3126 3208
3127 get_ctx(ctx);
3128
3129 counter->state = PERF_COUNTER_STATE_INACTIVE; 3209 counter->state = PERF_COUNTER_STATE_INACTIVE;
3130 if (hw_event->disabled) 3210 if (hw_event->disabled)
3131 counter->state = PERF_COUNTER_STATE_OFF; 3211 counter->state = PERF_COUNTER_STATE_OFF;
@@ -3290,7 +3370,7 @@ err_free_put_context:
3290 kfree(counter); 3370 kfree(counter);
3291 3371
3292err_put_context: 3372err_put_context:
3293 put_context(ctx); 3373 put_ctx(ctx);
3294 3374
3295 goto out_fput; 3375 goto out_fput;
3296} 3376}
@@ -3322,6 +3402,7 @@ inherit_counter(struct perf_counter *parent_counter,
3322 group_leader, GFP_KERNEL); 3402 group_leader, GFP_KERNEL);
3323 if (IS_ERR(child_counter)) 3403 if (IS_ERR(child_counter))
3324 return child_counter; 3404 return child_counter;
3405 get_ctx(child_ctx);
3325 3406
3326 /* 3407 /*
3327 * Make the child state follow the state of the parent counter, 3408 * Make the child state follow the state of the parent counter,
@@ -3439,11 +3520,6 @@ __perf_counter_exit_task(struct task_struct *child,
3439 3520
3440/* 3521/*
3441 * When a child task exits, feed back counter values to parent counters. 3522 * When a child task exits, feed back counter values to parent counters.
3442 *
3443 * Note: we may be running in child context, but the PID is not hashed
3444 * anymore so new counters will not be added.
3445 * (XXX not sure that is true when we get called from flush_old_exec.
3446 * -- paulus)
3447 */ 3523 */
3448void perf_counter_exit_task(struct task_struct *child) 3524void perf_counter_exit_task(struct task_struct *child)
3449{ 3525{
@@ -3458,7 +3534,15 @@ void perf_counter_exit_task(struct task_struct *child)
3458 3534
3459 local_irq_save(flags); 3535 local_irq_save(flags);
3460 __perf_counter_task_sched_out(child_ctx); 3536 __perf_counter_task_sched_out(child_ctx);
3537
3538 /*
3539 * Take the context lock here so that if find_get_context is
3540 * reading child->perf_counter_ctxp, we wait until it has
3541 * incremented the context's refcount before we do put_ctx below.
3542 */
3543 spin_lock(&child_ctx->lock);
3461 child->perf_counter_ctxp = NULL; 3544 child->perf_counter_ctxp = NULL;
3545 spin_unlock(&child_ctx->lock);
3462 local_irq_restore(flags); 3546 local_irq_restore(flags);
3463 3547
3464 mutex_lock(&child_ctx->mutex); 3548 mutex_lock(&child_ctx->mutex);
@@ -3513,6 +3597,7 @@ int perf_counter_init_task(struct task_struct *child)
3513 3597
3514 __perf_counter_init_context(child_ctx, child); 3598 __perf_counter_init_context(child_ctx, child);
3515 child->perf_counter_ctxp = child_ctx; 3599 child->perf_counter_ctxp = child_ctx;
3600 get_task_struct(child);
3516 3601
3517 /* 3602 /*
3518 * Lock the parent list. No need to lock the child - not PID 3603 * Lock the parent list. No need to lock the child - not PID