Diffstat (limited to 'kernel/perf_counter.c')
-rw-r--r--	kernel/perf_counter.c | 194
1 file changed, 156 insertions(+), 38 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index b0bdb36ccfc8..8cb94a52d1bb 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -46,11 +46,17 @@ static atomic_t nr_task_counters __read_mostly;
 
 /*
  * perf counter paranoia level:
- *  0 - not paranoid
- *  1 - disallow cpu counters to unpriv
- *  2 - disallow kernel profiling to unpriv
+ * -1 - not paranoid at all
+ *  0 - disallow raw tracepoint access for unpriv
+ *  1 - disallow cpu counters for unpriv
+ *  2 - disallow kernel profiling for unpriv
  */
-int sysctl_perf_counter_paranoid __read_mostly;
+int sysctl_perf_counter_paranoid __read_mostly = 1;
+
+static inline bool perf_paranoid_tracepoint_raw(void)
+{
+	return sysctl_perf_counter_paranoid > -1;
+}
 
 static inline bool perf_paranoid_cpu(void)
 {
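The new -1 level relaxes every check, including the raw-tracepoint restriction that perf_paranoid_tracepoint_raw() now guards. As a minimal editorial sketch (not part of the patch), and assuming the knob is exposed as /proc/sys/kernel/perf_counter_paranoid as in this tree's kernel/sysctl.c, an administrator could select the least-paranoid level like so:

/*
 * Editorial sketch, not part of the patch.  Assumes the paranoia knob is
 * exported as /proc/sys/kernel/perf_counter_paranoid.  Writing -1 selects
 * the "not paranoid at all" level, so unprivileged users may request
 * PERF_SAMPLE_RAW tracepoint data.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/perf_counter_paranoid", "w");

	if (!f) {
		perror("perf_counter_paranoid");
		return 1;
	}
	fprintf(f, "-1\n");
	return fclose(f) ? 1 : 0;
}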
@@ -469,7 +475,8 @@ static void update_counter_times(struct perf_counter *counter)
 	struct perf_counter_context *ctx = counter->ctx;
 	u64 run_end;
 
-	if (counter->state < PERF_COUNTER_STATE_INACTIVE)
+	if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
+	    counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE)
 		return;
 
 	counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
@@ -518,7 +525,7 @@ static void __perf_counter_disable(void *info)
 	 */
 	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
 		update_context_time(ctx);
-		update_counter_times(counter);
+		update_group_times(counter);
 		if (counter == counter->group_leader)
 			group_sched_out(counter, cpuctx, ctx);
 		else
@@ -573,7 +580,7 @@ static void perf_counter_disable(struct perf_counter *counter)
 	 * in, so we can change the state safely.
 	 */
 	if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
-		update_counter_times(counter);
+		update_group_times(counter);
 		counter->state = PERF_COUNTER_STATE_OFF;
 	}
 
@@ -851,6 +858,27 @@ retry:
 }
 
 /*
+ * Put a counter into inactive state and update time fields.
+ * Enabling the leader of a group effectively enables all
+ * the group members that aren't explicitly disabled, so we
+ * have to update their ->tstamp_enabled also.
+ * Note: this works for group members as well as group leaders
+ * since the non-leader members' sibling_lists will be empty.
+ */
+static void __perf_counter_mark_enabled(struct perf_counter *counter,
+					struct perf_counter_context *ctx)
+{
+	struct perf_counter *sub;
+
+	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
+	list_for_each_entry(sub, &counter->sibling_list, list_entry)
+		if (sub->state >= PERF_COUNTER_STATE_INACTIVE)
+			sub->tstamp_enabled =
+				ctx->time - sub->total_time_enabled;
+}
+
+/*
  * Cross CPU call to enable a performance counter
  */
 static void __perf_counter_enable(void *info)
@@ -877,8 +905,7 @@ static void __perf_counter_enable(void *info)
 
 	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 		goto unlock;
-	counter->state = PERF_COUNTER_STATE_INACTIVE;
-	counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
+	__perf_counter_mark_enabled(counter, ctx);
 
 	/*
 	 * If the counter is in a group and isn't the group leader,
@@ -971,11 +998,9 @@ static void perf_counter_enable(struct perf_counter *counter)
 	 * Since we have the lock this context can't be scheduled
 	 * in, so we can change the state safely.
 	 */
-	if (counter->state == PERF_COUNTER_STATE_OFF) {
-		counter->state = PERF_COUNTER_STATE_INACTIVE;
-		counter->tstamp_enabled =
-			ctx->time - counter->total_time_enabled;
-	}
+	if (counter->state == PERF_COUNTER_STATE_OFF)
+		__perf_counter_mark_enabled(counter, ctx);
+
  out:
 	spin_unlock_irq(&ctx->lock);
 }
@@ -1479,9 +1504,7 @@ static void perf_counter_enable_on_exec(struct task_struct *task)
 		counter->attr.enable_on_exec = 0;
 		if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 			continue;
-		counter->state = PERF_COUNTER_STATE_INACTIVE;
-		counter->tstamp_enabled =
-			ctx->time - counter->total_time_enabled;
+		__perf_counter_mark_enabled(counter, ctx);
 		enabled = 1;
 	}
 
@@ -1503,10 +1526,21 @@ static void perf_counter_enable_on_exec(struct task_struct *task)
  */
 static void __perf_counter_read(void *info)
 {
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_counter *counter = info;
 	struct perf_counter_context *ctx = counter->ctx;
 	unsigned long flags;
 
+	/*
+	 * If this is a task context, we need to check whether it is
+	 * the current task context of this cpu. If not it has been
+	 * scheduled out before the smp call arrived. In that case
+	 * counter->count would have been updated to a recent sample
+	 * when the counter was scheduled out.
+	 */
+	if (ctx->task && cpuctx->task_ctx != ctx)
+		return;
+
 	local_irq_save(flags);
 	if (ctx->is_active)
 		update_context_time(ctx);
@@ -1664,6 +1698,11 @@ static void free_counter(struct perf_counter *counter)
 			atomic_dec(&nr_task_counters);
 	}
 
+	if (counter->output) {
+		fput(counter->output->filp);
+		counter->output = NULL;
+	}
+
 	if (counter->destroy)
 		counter->destroy(counter);
 
@@ -1780,7 +1819,7 @@ static int perf_counter_read_group(struct perf_counter *counter,
 	size += err;
 
 	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
-		err = perf_counter_read_entry(counter, read_format,
+		err = perf_counter_read_entry(sub, read_format,
 				buf + size);
 		if (err < 0)
 			return err;
@@ -1949,6 +1988,8 @@ unlock:
 	return ret;
 }
 
+int perf_counter_set_output(struct perf_counter *counter, int output_fd);
+
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct perf_counter *counter = file->private_data;
@@ -1972,6 +2013,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case PERF_COUNTER_IOC_PERIOD:
 		return perf_counter_period(counter, (u64 __user *)arg);
 
+	case PERF_COUNTER_IOC_SET_OUTPUT:
+		return perf_counter_set_output(counter, arg);
+
 	default:
 		return -ENOTTY;
 	}
@@ -2008,6 +2052,10 @@ int perf_counter_task_disable(void)
 	return 0;
 }
 
+#ifndef PERF_COUNTER_INDEX_OFFSET
+# define PERF_COUNTER_INDEX_OFFSET 0
+#endif
+
 static int perf_counter_index(struct perf_counter *counter)
 {
 	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
@@ -2238,6 +2286,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 
 	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->mmap_mutex);
+	if (counter->output) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
 	if (atomic_inc_not_zero(&counter->mmap_count)) {
 		if (nr_pages != counter->data->nr_pages)
 			ret = -EINVAL;
@@ -2623,6 +2676,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
 			     struct perf_counter *counter, unsigned int size,
 			     int nmi, int sample)
 {
+	struct perf_counter *output_counter;
 	struct perf_mmap_data *data;
 	unsigned int offset, head;
 	int have_lost;
@@ -2632,13 +2686,17 @@ static int perf_output_begin(struct perf_output_handle *handle,
 		u64			 lost;
 	} lost_event;
 
+	rcu_read_lock();
 	/*
 	 * For inherited counters we send all the output towards the parent.
 	 */
 	if (counter->parent)
 		counter = counter->parent;
 
-	rcu_read_lock();
+	output_counter = rcu_dereference(counter->output);
+	if (output_counter)
+		counter = output_counter;
+
 	data = rcu_dereference(counter->data);
 	if (!data)
 		goto out;
@@ -3919,6 +3977,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 	 * have these.
 	 */
 	if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
+	    perf_paranoid_tracepoint_raw() &&
 	    !capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
@@ -4051,6 +4110,7 @@ perf_counter_alloc(struct perf_counter_attr *attr,
 	hwc->sample_period = attr->sample_period;
 	if (attr->freq && attr->sample_freq)
 		hwc->sample_period = 1;
+	hwc->last_period = hwc->sample_period;
 
 	atomic64_set(&hwc->period_left, hwc->sample_period);
 
@@ -4155,6 +4215,7 @@ static int perf_copy_attr(struct perf_counter_attr __user *uattr,
 			if (val)
 				goto err_size;
 		}
+		size = sizeof(*attr);
 	}
 
 	ret = copy_from_user(attr, uattr, size);
@@ -4186,6 +4247,57 @@ err_size:
 	goto out;
 }
 
+int perf_counter_set_output(struct perf_counter *counter, int output_fd)
+{
+	struct perf_counter *output_counter = NULL;
+	struct file *output_file = NULL;
+	struct perf_counter *old_output;
+	int fput_needed = 0;
+	int ret = -EINVAL;
+
+	if (!output_fd)
+		goto set;
+
+	output_file = fget_light(output_fd, &fput_needed);
+	if (!output_file)
+		return -EBADF;
+
+	if (output_file->f_op != &perf_fops)
+		goto out;
+
+	output_counter = output_file->private_data;
+
+	/* Don't chain output fds */
+	if (output_counter->output)
+		goto out;
+
+	/* Don't set an output fd when we already have an output channel */
+	if (counter->data)
+		goto out;
+
+	atomic_long_inc(&output_file->f_count);
+
+set:
+	mutex_lock(&counter->mmap_mutex);
+	old_output = counter->output;
+	rcu_assign_pointer(counter->output, output_counter);
+	mutex_unlock(&counter->mmap_mutex);
+
+	if (old_output) {
+		/*
+		 * we need to make sure no existing perf_output_*()
+		 * is still referencing this counter.
+		 */
+		synchronize_rcu();
+		fput(old_output->filp);
+	}
+
+	ret = 0;
+out:
+	fput_light(output_file, fput_needed);
+	return ret;
+}
+
 /**
  * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
  *
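perf_counter_set_output() above backs the new PERF_COUNTER_IOC_SET_OUTPUT ioctl wired up earlier in perf_ioctl(). What follows is a hedged userspace sketch, not part of the patch, showing how a caller might redirect one counter's records into another counter's ring buffer. The syscall wrapper, __NR_perf_counter_open, and the attr constants are assumed to come from this tree's <linux/perf_counter.h> and the architecture's unistd.h.

/*
 * Editorial sketch: redirect one counter's sampled output into another
 * counter's mmap buffer with PERF_COUNTER_IOC_SET_OUTPUT.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/perf_counter.h>

static int perf_counter_open(struct perf_counter_attr *attr, pid_t pid,
			     int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_counter_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_counter_attr attr;
	long page_size = sysconf(_SC_PAGESIZE);
	int target, redirected;
	void *buf;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP;

	/* The counter that owns the ring buffer. */
	target = perf_counter_open(&attr, 0, -1, -1, 0);
	if (target < 0) {
		perror("perf_counter_open");
		return 1;
	}

	/* 1 + 2^n pages: one control page plus the data area. */
	buf = mmap(NULL, (1 + 8) * page_size, PROT_READ | PROT_WRITE,
		   MAP_SHARED, target, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* A second counter whose records should land in the same buffer. */
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	redirected = perf_counter_open(&attr, 0, -1, -1, 0);
	if (redirected < 0) {
		perror("perf_counter_open");
		return 1;
	}

	/*
	 * Must happen before the redirected counter is mmap()ed itself;
	 * perf_mmap() now refuses counters that have ->output set.
	 */
	if (ioctl(redirected, PERF_COUNTER_IOC_SET_OUTPUT, target) < 0)
		perror("PERF_COUNTER_IOC_SET_OUTPUT");

	/* ... consume events from buf as usual ... */

	close(redirected);
	close(target);
	return 0;
}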
@@ -4205,15 +4317,15 @@ SYSCALL_DEFINE5(perf_counter_open,
 	struct file *group_file = NULL;
 	int fput_needed = 0;
 	int fput_needed2 = 0;
-	int ret;
+	int err;
 
 	/* for future expandability... */
-	if (flags)
+	if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
 		return -EINVAL;
 
-	ret = perf_copy_attr(attr_uptr, &attr);
-	if (ret)
-		return ret;
+	err = perf_copy_attr(attr_uptr, &attr);
+	if (err)
+		return err;
 
 	if (!attr.exclude_kernel) {
 		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
@@ -4236,8 +4348,8 @@ SYSCALL_DEFINE5(perf_counter_open,
 	 * Look up the group leader (we will attach this counter to it):
 	 */
 	group_leader = NULL;
-	if (group_fd != -1) {
-		ret = -EINVAL;
+	if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
+		err = -EINVAL;
 		group_file = fget_light(group_fd, &fput_needed);
 		if (!group_file)
 			goto err_put_context;
@@ -4266,18 +4378,24 @@ SYSCALL_DEFINE5(perf_counter_open,
 
 	counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
 				     NULL, GFP_KERNEL);
-	ret = PTR_ERR(counter);
+	err = PTR_ERR(counter);
 	if (IS_ERR(counter))
 		goto err_put_context;
 
-	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
-	if (ret < 0)
+	err = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
+	if (err < 0)
 		goto err_free_put_context;
 
-	counter_file = fget_light(ret, &fput_needed2);
+	counter_file = fget_light(err, &fput_needed2);
 	if (!counter_file)
 		goto err_free_put_context;
 
+	if (flags & PERF_FLAG_FD_OUTPUT) {
+		err = perf_counter_set_output(counter, group_fd);
+		if (err)
+			goto err_fput_free_put_context;
+	}
+
 	counter->filp = counter_file;
 	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
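The hunk above also lets sys_perf_counter_open() perform the redirection at open time: with PERF_FLAG_FD_OUTPUT the group_fd argument doubles as the output target, and PERF_FLAG_FD_NO_GROUP suppresses its use as a group leader. A hedged fragment, reusing the perf_counter_open() wrapper and the target/redirected descriptors from the sketch after perf_counter_set_output() above, and assuming both flag macros come from the matching <linux/perf_counter.h> update:

/*
 * Editorial fragment: open the second counter already redirected at
 * "target", without making "target" its group leader.
 */
redirected = perf_counter_open(&attr, 0, -1, target,
			       PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT);
if (redirected < 0)
	perror("perf_counter_open");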
@@ -4291,20 +4409,20 @@ SYSCALL_DEFINE5(perf_counter_open,
 	list_add_tail(&counter->owner_entry, &current->perf_counter_list);
 	mutex_unlock(&current->perf_counter_mutex);
 
+err_fput_free_put_context:
 	fput_light(counter_file, fput_needed2);
 
-out_fput:
-	fput_light(group_file, fput_needed);
-
-	return ret;
-
 err_free_put_context:
-	kfree(counter);
+	if (err < 0)
+		kfree(counter);
 
 err_put_context:
-	put_ctx(ctx);
+	if (err < 0)
+		put_ctx(ctx);
+
+	fput_light(group_file, fput_needed);
 
-	goto out_fput;
+	return err;
 }
 
 /*