Diffstat (limited to 'kernel/perf_counter.c')
-rw-r--r--   kernel/perf_counter.c | 205
1 files changed, 145 insertions, 60 deletions
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 367299f91aaf..52e5a15321d8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -103,12 +103,22 @@ static void get_ctx(struct perf_counter_context *ctx)
         atomic_inc(&ctx->refcount);
 }
 
+static void free_ctx(struct rcu_head *head)
+{
+        struct perf_counter_context *ctx;
+
+        ctx = container_of(head, struct perf_counter_context, rcu_head);
+        kfree(ctx);
+}
+
 static void put_ctx(struct perf_counter_context *ctx)
 {
         if (atomic_dec_and_test(&ctx->refcount)) {
                 if (ctx->parent_ctx)
                         put_ctx(ctx->parent_ctx);
-                kfree(ctx);
+                if (ctx->task)
+                        put_task_struct(ctx->task);
+                call_rcu(&ctx->rcu_head, free_ctx);
         }
 }
 
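The hunk above changes how a counter context dies: the final put_ctx() now also drops the reference on ctx->task and hands the actual kfree() to call_rcu(), so a reader that looked the context up under rcu_read_lock() can never touch freed memory. The standalone C sketch below shows the same refcount-plus-deferred-free shape; the struct layout, the defer_free() stub (which frees immediately instead of waiting for a grace period) and the main() driver are assumptions for illustration, not kernel code.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for the kernel context; the rcu_head is omitted. */
struct ctx {
        atomic_int refcount;
        struct ctx *parent;     /* counted reference to the cloned-from context */
};

/* Stand-in for call_rcu(): a real implementation would defer the free
 * until every pre-existing reader has left its read-side section. */
static void defer_free(struct ctx *c)
{
        free(c);                /* assumption: immediate free, for the sketch only */
}

static void get_ctx(struct ctx *c)
{
        atomic_fetch_add(&c->refcount, 1);
}

static void put_ctx(struct ctx *c)
{
        /* The last reference drops the parent's reference too, then frees. */
        if (atomic_fetch_sub(&c->refcount, 1) == 1) {
                if (c->parent)
                        put_ctx(c->parent);
                defer_free(c);
        }
}

int main(void)
{
        struct ctx *parent = calloc(1, sizeof(*parent));
        struct ctx *child = calloc(1, sizeof(*child));

        atomic_init(&parent->refcount, 1);
        atomic_init(&child->refcount, 1);

        get_ctx(parent);        /* the clone holds a reference on its parent */
        child->parent = parent;

        put_ctx(child);         /* drops child, which drops its parent reference */
        put_ctx(parent);        /* last reference: parent is freed */
        puts("done");
        return 0;
}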
@@ -212,22 +222,6 @@ group_sched_out(struct perf_counter *group_counter,
 }
 
 /*
- * Mark this context as not being a clone of another.
- * Called when counters are added to or removed from this context.
- * We also increment our generation number so that anything that
- * was cloned from this context before this will not match anything
- * cloned from this context after this.
- */
-static void unclone_ctx(struct perf_counter_context *ctx)
-{
-        ++ctx->generation;
-        if (!ctx->parent_ctx)
-                return;
-        put_ctx(ctx->parent_ctx);
-        ctx->parent_ctx = NULL;
-}
-
-/*
  * Cross CPU call to remove a performance counter
  *
  * We disable the counter on the hardware level first. After that we
@@ -281,13 +275,19 @@ static void __perf_counter_remove_from_context(void *info)
  *
  * CPU counters are removed with a smp call. For task counters we only
  * call when the task is on a CPU.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid. This is OK when called from perf_release since
+ * that only calls us on the top-level context, which can't be a clone.
+ * When called from perf_counter_exit_task, it's OK because the
+ * context has been detached from its task.
  */
 static void perf_counter_remove_from_context(struct perf_counter *counter)
 {
         struct perf_counter_context *ctx = counter->ctx;
         struct task_struct *task = ctx->task;
 
-        unclone_ctx(ctx);
         if (!task) {
                 /*
                  * Per cpu counters are removed via an smp call and
@@ -410,6 +410,16 @@ static void __perf_counter_disable(void *info)
 
 /*
  * Disable a counter.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid. This condition is satisfied when called through
+ * perf_counter_for_each_child or perf_counter_for_each because they
+ * hold the top-level counter's child_mutex, so any descendant that
+ * goes to exit will block in sync_child_counter.
+ * When called from perf_pending_counter it's OK because counter->ctx
+ * is the current context on this CPU and preemption is disabled,
+ * hence we can't get into perf_counter_task_sched_out for this context.
  */
 static void perf_counter_disable(struct perf_counter *counter)
 {
@@ -794,6 +804,12 @@ static void __perf_counter_enable(void *info)
 
 /*
  * Enable a counter.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid. This condition is satisfied when called through
+ * perf_counter_for_each_child or perf_counter_for_each as described
+ * for perf_counter_disable.
  */
 static void perf_counter_enable(struct perf_counter *counter)
 {
@@ -923,7 +939,9 @@ void perf_counter_task_sched_out(struct task_struct *task,
         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
         struct perf_counter_context *ctx = task->perf_counter_ctxp;
         struct perf_counter_context *next_ctx;
+        struct perf_counter_context *parent;
         struct pt_regs *regs;
+        int do_switch = 1;
 
         regs = task_pt_regs(task);
         perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
@@ -932,18 +950,39 @@ void perf_counter_task_sched_out(struct task_struct *task,
                 return;
 
         update_context_time(ctx);
+
+        rcu_read_lock();
+        parent = rcu_dereference(ctx->parent_ctx);
         next_ctx = next->perf_counter_ctxp;
-        if (next_ctx && context_equiv(ctx, next_ctx)) {
-                task->perf_counter_ctxp = next_ctx;
-                next->perf_counter_ctxp = ctx;
-                ctx->task = next;
-                next_ctx->task = task;
-                return;
+        if (parent && next_ctx &&
+            rcu_dereference(next_ctx->parent_ctx) == parent) {
+                /*
+                 * Looks like the two contexts are clones, so we might be
+                 * able to optimize the context switch. We lock both
+                 * contexts and check that they are clones under the
+                 * lock (including re-checking that neither has been
+                 * uncloned in the meantime). It doesn't matter which
+                 * order we take the locks because no other cpu could
+                 * be trying to lock both of these tasks.
+                 */
+                spin_lock(&ctx->lock);
+                spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
+                if (context_equiv(ctx, next_ctx)) {
+                        task->perf_counter_ctxp = next_ctx;
+                        next->perf_counter_ctxp = ctx;
+                        ctx->task = next;
+                        next_ctx->task = task;
+                        do_switch = 0;
+                }
+                spin_unlock(&next_ctx->lock);
+                spin_unlock(&ctx->lock);
         }
+        rcu_read_unlock();
 
-        __perf_counter_sched_out(ctx, cpuctx);
-
-        cpuctx->task_ctx = NULL;
+        if (do_switch) {
+                __perf_counter_sched_out(ctx, cpuctx);
+                cpuctx->task_ctx = NULL;
+        }
 }
 
 static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
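The rewritten perf_counter_task_sched_out() above implements an optimistic clone swap: under rcu_read_lock() it checks that the outgoing and incoming contexts appear to be clones of the same parent, then takes both context locks and re-checks with context_equiv() before exchanging them, falling back to a full __perf_counter_sched_out() when the re-check fails. A minimal userspace analogue of that lock-both-and-recheck step is sketched below using pthread mutexes; struct ctx, ctx_equiv() and try_swap() are invented names, and the sketch assumes the caller serializes lock acquisition rather than relying on the kernel's argument that no other CPU can be locking both tasks at once.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for perf_counter_context. */
struct ctx {
        pthread_mutex_t lock;
        struct ctx *parent;             /* which context this one was cloned from */
        unsigned long generation;
};

/* Analogue of context_equiv(): clones of the same parent, same generation,
 * and neither has been uncloned in the meantime. */
static bool ctx_equiv(struct ctx *a, struct ctx *b)
{
        return a->parent && b->parent &&
               a->parent == b->parent &&
               a->generation == b->generation;
}

/*
 * Analogue of the optimized switch: the caller has already observed,
 * without holding any lock, that the two contexts look like clones.
 * Take both locks and re-check before swapping; if the re-check fails,
 * tell the caller to fall back to the slow path.
 */
static bool try_swap(struct ctx *a, struct ctx *b)
{
        bool swapped = false;

        pthread_mutex_lock(&a->lock);
        pthread_mutex_lock(&b->lock);
        if (ctx_equiv(a, b)) {
                /* ... exchange the owners' context pointers here ... */
                swapped = true;
        }
        pthread_mutex_unlock(&b->lock);
        pthread_mutex_unlock(&a->lock);
        return swapped;
}

int main(void)
{
        struct ctx parent = { .lock = PTHREAD_MUTEX_INITIALIZER };
        struct ctx a = { .lock = PTHREAD_MUTEX_INITIALIZER, .parent = &parent };
        struct ctx b = { .lock = PTHREAD_MUTEX_INITIALIZER, .parent = &parent };

        printf("swapped: %d\n", try_swap(&a, &b));      /* 1: still clones */
        b.parent = NULL;                                /* b gets uncloned */
        printf("swapped: %d\n", try_swap(&a, &b));      /* 0: fall back */
        return 0;
}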
@@ -1215,18 +1254,13 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
         ctx->task = task;
 }
 
-static void put_context(struct perf_counter_context *ctx)
-{
-        if (ctx->task)
-                put_task_struct(ctx->task);
-}
-
 static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 {
         struct perf_cpu_context *cpuctx;
         struct perf_counter_context *ctx;
-        struct perf_counter_context *tctx;
+        struct perf_counter_context *parent_ctx;
         struct task_struct *task;
+        int err;
 
         /*
          * If cpu is not a wildcard then this is a percpu counter:
@@ -1249,6 +1283,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 
         cpuctx = &per_cpu(perf_cpu_context, cpu);
         ctx = &cpuctx->ctx;
+        get_ctx(ctx);
 
         return ctx;
 }
@@ -1265,37 +1300,79 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
         if (!task)
                 return ERR_PTR(-ESRCH);
 
+        /*
+         * Can't attach counters to a dying task.
+         */
+        err = -ESRCH;
+        if (task->flags & PF_EXITING)
+                goto errout;
+
         /* Reuse ptrace permission checks for now. */
-        if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
-                put_task_struct(task);
-                return ERR_PTR(-EACCES);
+        err = -EACCES;
+        if (!ptrace_may_access(task, PTRACE_MODE_READ))
+                goto errout;
+
+retry_lock:
+        rcu_read_lock();
+retry:
+        ctx = rcu_dereference(task->perf_counter_ctxp);
+        if (ctx) {
+                /*
+                 * If this context is a clone of another, it might
+                 * get swapped for another underneath us by
+                 * perf_counter_task_sched_out, though the
+                 * rcu_read_lock() protects us from any context
+                 * getting freed. Lock the context and check if it
+                 * got swapped before we could get the lock, and retry
+                 * if so. If we locked the right context, then it
+                 * can't get swapped on us any more and we can
+                 * unclone it if necessary.
+                 * Once it's not a clone things will be stable.
+                 */
+                spin_lock_irq(&ctx->lock);
+                if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
+                        spin_unlock_irq(&ctx->lock);
+                        goto retry;
+                }
+                parent_ctx = ctx->parent_ctx;
+                if (parent_ctx) {
+                        put_ctx(parent_ctx);
+                        ctx->parent_ctx = NULL;         /* no longer a clone */
+                }
+                ++ctx->generation;
+                /*
+                 * Get an extra reference before dropping the lock so that
+                 * this context won't get freed if the task exits.
+                 */
+                get_ctx(ctx);
+                spin_unlock_irq(&ctx->lock);
         }
+        rcu_read_unlock();
 
-        ctx = task->perf_counter_ctxp;
         if (!ctx) {
                 ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
-                if (!ctx) {
-                        put_task_struct(task);
-                        return ERR_PTR(-ENOMEM);
-                }
+                err = -ENOMEM;
+                if (!ctx)
+                        goto errout;
                 __perf_counter_init_context(ctx, task);
-                /*
-                 * Make sure other cpus see correct values for *ctx
-                 * once task->perf_counter_ctxp is visible to them.
-                 */
-                smp_wmb();
-                tctx = cmpxchg(&task->perf_counter_ctxp, NULL, ctx);
-                if (tctx) {
+                get_ctx(ctx);
+                if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
                         /*
                          * We raced with some other task; use
                          * the context they set.
                          */
                         kfree(ctx);
-                        ctx = tctx;
+                        goto retry_lock;
                 }
+                get_task_struct(task);
         }
 
+        put_task_struct(task);
         return ctx;
+
+errout:
+        put_task_struct(task);
+        return ERR_PTR(err);
 }
 
 static void free_counter_rcu(struct rcu_head *head)
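find_get_context() now has two cooperating paths: a lookup path that finds an existing context under rcu_read_lock(), locks it, re-checks that it was not swapped, unclones it and takes a reference; and an allocation path that publishes a new context with a single cmpxchg() and retries the lookup if it loses the race. The sketch below reproduces only the allocate-and-publish race using C11 atomics; task_ctx and get_or_install_ctx() are hypothetical stand-ins, and the locking, uncloning and reference counting of the real function are omitted.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct ctx {
        int dummy;
};

/* Hypothetical analogue of task->perf_counter_ctxp: an atomically
 * published pointer that starts out NULL. */
static _Atomic(struct ctx *) task_ctx;

/*
 * If no context is installed yet, allocate one and try to install it with
 * a single compare-and-swap. A loser frees its copy and goes back to the
 * lookup (the kernel's "goto retry_lock").
 */
static struct ctx *get_or_install_ctx(void)
{
        struct ctx *ctx, *expected;

        for (;;) {
                ctx = atomic_load(&task_ctx);
                if (ctx)
                        return ctx;             /* someone already installed one */

                ctx = calloc(1, sizeof(*ctx));
                if (!ctx)
                        return NULL;            /* analogue of ERR_PTR(-ENOMEM) */

                expected = NULL;
                if (atomic_compare_exchange_strong(&task_ctx, &expected, ctx))
                        return ctx;             /* we won the race */

                free(ctx);                      /* we lost: retry the lookup */
        }
}

int main(void)
{
        struct ctx *a = get_or_install_ctx();
        struct ctx *b = get_or_install_ctx();

        printf("same context: %d\n", a == b);   /* prints 1 */
        free(a);
        return 0;
}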
@@ -1303,7 +1380,6 @@ static void free_counter_rcu(struct rcu_head *head)
         struct perf_counter *counter;
 
         counter = container_of(head, struct perf_counter, rcu_head);
-        put_ctx(counter->ctx);
         kfree(counter);
 }
 
@@ -1324,6 +1400,7 @@ static void free_counter(struct perf_counter *counter)
         if (counter->destroy)
                 counter->destroy(counter);
 
+        put_ctx(counter->ctx);
         call_rcu(&counter->rcu_head, free_counter_rcu);
 }
 
@@ -1347,7 +1424,6 @@ static int perf_release(struct inode *inode, struct file *file)
         put_task_struct(counter->owner);
 
         free_counter(counter);
-        put_context(ctx);
 
         return 0;
 }
@@ -1437,6 +1513,12 @@ static void perf_counter_for_each_sibling(struct perf_counter *counter,
         mutex_unlock(&ctx->mutex);
 }
 
+/*
+ * Holding the top-level counter's child_mutex means that any
+ * descendant process that has inherited this counter will block
+ * in sync_child_counter if it goes to exit, thus satisfying the
+ * task existence requirements of perf_counter_enable/disable.
+ */
 static void perf_counter_for_each_child(struct perf_counter *counter,
                                         void (*func)(struct perf_counter *))
 {
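The comment added above records the invariant that perf_counter_enable/disable rely on: while the top-level counter's child_mutex is held, an inheriting child that reaches its exit path blocks in sync_child_counter, so every task struct reachable through the child counters stays valid. A small pthread sketch of that "traverse under the mutex the exit path must take" pattern follows; child_mutex, for_each_child() and child_exit() here are illustrative names, not the kernel's data structures.

#include <pthread.h>
#include <stdio.h>

/* Hypothetical parent counter state: a lock and one inherited child. */
static pthread_mutex_t child_mutex = PTHREAD_MUTEX_INITIALIZER;
static int child_alive = 1;

/* Analogue of perf_counter_for_each_child(): while child_mutex is held,
 * no child can complete the detach step below, so every child we visit
 * is guaranteed to still exist. */
static void for_each_child(void (*func)(void))
{
        pthread_mutex_lock(&child_mutex);
        if (child_alive)
                func();                 /* safe: the child can't detach here */
        pthread_mutex_unlock(&child_mutex);
}

/* Analogue of the exit path (sync_child_counter): the exiting child
 * blocks on child_mutex until any traversal in progress has finished. */
static void *child_exit(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&child_mutex);
        child_alive = 0;
        pthread_mutex_unlock(&child_mutex);
        return NULL;
}

static void poke(void)
{
        puts("visiting a live child");
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, child_exit, NULL);
        for_each_child(poke);           /* runs either before or after the exit */
        pthread_join(t, NULL);
        return 0;
}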
@@ -3124,8 +3206,6 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
         counter->ctx = ctx;
         counter->oncpu = -1;
 
-        get_ctx(ctx);
-
         counter->state = PERF_COUNTER_STATE_INACTIVE;
         if (hw_event->disabled)
                 counter->state = PERF_COUNTER_STATE_OFF;
@@ -3290,7 +3370,7 @@ err_free_put_context:
         kfree(counter);
 
 err_put_context:
-        put_context(ctx);
+        put_ctx(ctx);
 
         goto out_fput;
 }
@@ -3322,6 +3402,7 @@ inherit_counter(struct perf_counter *parent_counter,
                                            group_leader, GFP_KERNEL);
         if (IS_ERR(child_counter))
                 return child_counter;
+        get_ctx(child_ctx);
 
         /*
          * Make the child state follow the state of the parent counter,
@@ -3439,11 +3520,6 @@ __perf_counter_exit_task(struct task_struct *child,
 
 /*
  * When a child task exits, feed back counter values to parent counters.
- *
- * Note: we may be running in child context, but the PID is not hashed
- * anymore so new counters will not be added.
- * (XXX not sure that is true when we get called from flush_old_exec.
- *  -- paulus)
  */
 void perf_counter_exit_task(struct task_struct *child)
 {
@@ -3458,7 +3534,15 @@ void perf_counter_exit_task(struct task_struct *child)
 
         local_irq_save(flags);
         __perf_counter_task_sched_out(child_ctx);
+
+        /*
+         * Take the context lock here so that if find_get_context is
+         * reading child->perf_counter_ctxp, we wait until it has
+         * incremented the context's refcount before we do put_ctx below.
+         */
+        spin_lock(&child_ctx->lock);
         child->perf_counter_ctxp = NULL;
+        spin_unlock(&child_ctx->lock);
         local_irq_restore(flags);
 
         mutex_lock(&child_ctx->mutex);
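The locking added to perf_counter_exit_task() pairs with the lookup in find_get_context(): clearing child->perf_counter_ctxp under ctx->lock guarantees that a concurrent lookup which already found the context either finishes bumping its refcount before the pointer is cleared, or sees the cleared pointer and backs off, so the put_ctx() that follows can never free a context a lookup is about to use. The single-threaded sketch below walks through that hand-off with invented names (slot, lookup(), teardown()); in the kernel an RCU read-side section additionally keeps the context alive between reading the pointer and taking its lock, which the sketch does not model.

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

/* Hypothetical analogue of a task's context slot plus the context lock. */
struct ctx {
        pthread_mutex_t lock;
        atomic_int refcount;
};

static struct ctx *slot;        /* analogue of child->perf_counter_ctxp */

static void put_ctx(struct ctx *c)
{
        if (atomic_fetch_sub(&c->refcount, 1) == 1) {
                pthread_mutex_destroy(&c->lock);
                free(c);
        }
}

/* Lookup side (find_get_context analogue): take the context lock and
 * re-check that the slot still points at this context before taking a
 * reference; otherwise the context is being torn down and we back off. */
static struct ctx *lookup(void)
{
        struct ctx *c = slot;   /* in the kernel, RCU keeps *c alive here */

        if (!c)
                return NULL;
        pthread_mutex_lock(&c->lock);
        if (slot == c)
                atomic_fetch_add(&c->refcount, 1);
        else
                c = NULL;
        pthread_mutex_unlock(&c->lock);
        return c;
}

/* Exit side (perf_counter_exit_task analogue): clear the slot under the
 * same lock, so a lookup that already found the context finishes its
 * refcount increment before we drop the base reference below. */
static void teardown(struct ctx *c)
{
        pthread_mutex_lock(&c->lock);
        slot = NULL;
        pthread_mutex_unlock(&c->lock);
        put_ctx(c);             /* drop the reference the slot held */
}

int main(void)
{
        struct ctx *c = calloc(1, sizeof(*c));

        pthread_mutex_init(&c->lock, NULL);
        atomic_init(&c->refcount, 1);
        slot = c;

        struct ctx *ref = lookup();     /* takes its own reference */
        teardown(c);                    /* slot cleared, base reference dropped */
        if (ref)
                put_ctx(ref);           /* last reference frees the context */
        return 0;
}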
@@ -3513,6 +3597,7 @@ int perf_counter_init_task(struct task_struct *child)
 
         __perf_counter_init_context(child_ctx, child);
         child->perf_counter_ctxp = child_ctx;
+        get_task_struct(child);
 
         /*
          * Lock the parent list. No need to lock the child - not PID