Diffstat (limited to 'kernel/perf_counter.c')
-rw-r--r--  kernel/perf_counter.c | 194
 1 file changed, 156 insertions(+), 38 deletions(-)
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index b0bdb36ccfc8..8cb94a52d1bb 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -46,11 +46,17 @@ static atomic_t nr_task_counters __read_mostly;
 
 /*
  * perf counter paranoia level:
- *  0 - not paranoid
- *  1 - disallow cpu counters to unpriv
- *  2 - disallow kernel profiling to unpriv
+ *  -1 - not paranoid at all
+ *   0 - disallow raw tracepoint access for unpriv
+ *   1 - disallow cpu counters for unpriv
+ *   2 - disallow kernel profiling for unpriv
  */
-int sysctl_perf_counter_paranoid __read_mostly;
+int sysctl_perf_counter_paranoid __read_mostly = 1;
+
+static inline bool perf_paranoid_tracepoint_raw(void)
+{
+	return sysctl_perf_counter_paranoid > -1;
+}
 
 static inline bool perf_paranoid_cpu(void)
 {
@@ -469,7 +475,8 @@ static void update_counter_times(struct perf_counter *counter)
 	struct perf_counter_context *ctx = counter->ctx;
 	u64 run_end;
 
-	if (counter->state < PERF_COUNTER_STATE_INACTIVE)
+	if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
+	    counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE)
 		return;
 
 	counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
@@ -518,7 +525,7 @@ static void __perf_counter_disable(void *info)
 	 */
 	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
 		update_context_time(ctx);
-		update_counter_times(counter);
+		update_group_times(counter);
 		if (counter == counter->group_leader)
 			group_sched_out(counter, cpuctx, ctx);
 		else
@@ -573,7 +580,7 @@ static void perf_counter_disable(struct perf_counter *counter)
 	 * in, so we can change the state safely.
 	 */
 	if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
-		update_counter_times(counter);
+		update_group_times(counter);
 		counter->state = PERF_COUNTER_STATE_OFF;
 	}
 
@@ -851,6 +858,27 @@ retry:
 }
 
 /*
+ * Put a counter into inactive state and update time fields.
+ * Enabling the leader of a group effectively enables all
+ * the group members that aren't explicitly disabled, so we
+ * have to update their ->tstamp_enabled also.
+ * Note: this works for group members as well as group leaders
+ * since the non-leader members' sibling_lists will be empty.
+ */
+static void __perf_counter_mark_enabled(struct perf_counter *counter,
+					struct perf_counter_context *ctx)
+{
+	struct perf_counter *sub;
+
+	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
+	list_for_each_entry(sub, &counter->sibling_list, list_entry)
+		if (sub->state >= PERF_COUNTER_STATE_INACTIVE)
+			sub->tstamp_enabled =
+				ctx->time - sub->total_time_enabled;
+}
+
+/*
  * Cross CPU call to enable a performance counter
  */
 static void __perf_counter_enable(void *info)
@@ -877,8 +905,7 @@ static void __perf_counter_enable(void *info)
 
 	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 		goto unlock;
-	counter->state = PERF_COUNTER_STATE_INACTIVE;
-	counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
+	__perf_counter_mark_enabled(counter, ctx);
 
 	/*
 	 * If the counter is in a group and isn't the group leader,
@@ -971,11 +998,9 @@ static void perf_counter_enable(struct perf_counter *counter)
 	 * Since we have the lock this context can't be scheduled
 	 * in, so we can change the state safely.
 	 */
-	if (counter->state == PERF_COUNTER_STATE_OFF) {
-		counter->state = PERF_COUNTER_STATE_INACTIVE;
-		counter->tstamp_enabled =
-			ctx->time - counter->total_time_enabled;
-	}
+	if (counter->state == PERF_COUNTER_STATE_OFF)
+		__perf_counter_mark_enabled(counter, ctx);
+
 out:
 	spin_unlock_irq(&ctx->lock);
 }
@@ -1479,9 +1504,7 @@ static void perf_counter_enable_on_exec(struct task_struct *task)
 		counter->attr.enable_on_exec = 0;
 		if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 			continue;
-		counter->state = PERF_COUNTER_STATE_INACTIVE;
-		counter->tstamp_enabled =
-			ctx->time - counter->total_time_enabled;
+		__perf_counter_mark_enabled(counter, ctx);
 		enabled = 1;
 	}
 
@@ -1503,10 +1526,21 @@ static void perf_counter_enable_on_exec(struct task_struct *task)
  */
 static void __perf_counter_read(void *info)
 {
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_counter *counter = info;
 	struct perf_counter_context *ctx = counter->ctx;
 	unsigned long flags;
 
+	/*
+	 * If this is a task context, we need to check whether it is
+	 * the current task context of this cpu. If not it has been
+	 * scheduled out before the smp call arrived. In that case
+	 * counter->count would have been updated to a recent sample
+	 * when the counter was scheduled out.
+	 */
+	if (ctx->task && cpuctx->task_ctx != ctx)
+		return;
+
 	local_irq_save(flags);
 	if (ctx->is_active)
 		update_context_time(ctx);
@@ -1664,6 +1698,11 @@ static void free_counter(struct perf_counter *counter)
 		atomic_dec(&nr_task_counters);
 	}
 
+	if (counter->output) {
+		fput(counter->output->filp);
+		counter->output = NULL;
+	}
+
 	if (counter->destroy)
 		counter->destroy(counter);
 
@@ -1780,7 +1819,7 @@ static int perf_counter_read_group(struct perf_counter *counter,
 	size += err;
 
 	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
-		err = perf_counter_read_entry(counter, read_format,
+		err = perf_counter_read_entry(sub, read_format,
 				buf + size);
 		if (err < 0)
 			return err;
@@ -1949,6 +1988,8 @@ unlock:
 	return ret;
 }
 
+int perf_counter_set_output(struct perf_counter *counter, int output_fd);
+
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct perf_counter *counter = file->private_data;
@@ -1972,6 +2013,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case PERF_COUNTER_IOC_PERIOD:
 		return perf_counter_period(counter, (u64 __user *)arg);
 
+	case PERF_COUNTER_IOC_SET_OUTPUT:
+		return perf_counter_set_output(counter, arg);
+
 	default:
 		return -ENOTTY;
 	}
@@ -2008,6 +2052,10 @@ int perf_counter_task_disable(void)
 	return 0;
 }
 
+#ifndef PERF_COUNTER_INDEX_OFFSET
+# define PERF_COUNTER_INDEX_OFFSET 0
+#endif
+
 static int perf_counter_index(struct perf_counter *counter)
 {
 	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
@@ -2238,6 +2286,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 
 	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->mmap_mutex);
+	if (counter->output) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
 	if (atomic_inc_not_zero(&counter->mmap_count)) {
 		if (nr_pages != counter->data->nr_pages)
 			ret = -EINVAL;
@@ -2623,6 +2676,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
 			     struct perf_counter *counter, unsigned int size,
 			     int nmi, int sample)
 {
+	struct perf_counter *output_counter;
 	struct perf_mmap_data *data;
 	unsigned int offset, head;
 	int have_lost;
@@ -2632,13 +2686,17 @@ static int perf_output_begin(struct perf_output_handle *handle,
 		u64			 lost;
 	} lost_event;
 
+	rcu_read_lock();
 	/*
 	 * For inherited counters we send all the output towards the parent.
 	 */
 	if (counter->parent)
 		counter = counter->parent;
 
-	rcu_read_lock();
+	output_counter = rcu_dereference(counter->output);
+	if (output_counter)
+		counter = output_counter;
+
 	data = rcu_dereference(counter->data);
 	if (!data)
 		goto out;
@@ -3919,6 +3977,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 	 * have these.
 	 */
 	if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
+			perf_paranoid_tracepoint_raw() &&
 			!capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
@@ -4051,6 +4110,7 @@ perf_counter_alloc(struct perf_counter_attr *attr,
 	hwc->sample_period = attr->sample_period;
 	if (attr->freq && attr->sample_freq)
 		hwc->sample_period = 1;
+	hwc->last_period = hwc->sample_period;
 
 	atomic64_set(&hwc->period_left, hwc->sample_period);
 
@@ -4155,6 +4215,7 @@ static int perf_copy_attr(struct perf_counter_attr __user *uattr,
 			if (val)
 				goto err_size;
 		}
+		size = sizeof(*attr);
 	}
 
 	ret = copy_from_user(attr, uattr, size);
@@ -4186,6 +4247,57 @@ err_size:
 	goto out;
 }
 
+int perf_counter_set_output(struct perf_counter *counter, int output_fd)
+{
+	struct perf_counter *output_counter = NULL;
+	struct file *output_file = NULL;
+	struct perf_counter *old_output;
+	int fput_needed = 0;
+	int ret = -EINVAL;
+
+	if (!output_fd)
+		goto set;
+
+	output_file = fget_light(output_fd, &fput_needed);
+	if (!output_file)
+		return -EBADF;
+
+	if (output_file->f_op != &perf_fops)
+		goto out;
+
+	output_counter = output_file->private_data;
+
+	/* Don't chain output fds */
+	if (output_counter->output)
+		goto out;
+
+	/* Don't set an output fd when we already have an output channel */
+	if (counter->data)
+		goto out;
+
+	atomic_long_inc(&output_file->f_count);
+
+set:
+	mutex_lock(&counter->mmap_mutex);
+	old_output = counter->output;
+	rcu_assign_pointer(counter->output, output_counter);
+	mutex_unlock(&counter->mmap_mutex);
+
+	if (old_output) {
+		/*
+		 * we need to make sure no existing perf_output_*()
+		 * is still referencing this counter.
+		 */
+		synchronize_rcu();
+		fput(old_output->filp);
+	}
+
+	ret = 0;
+out:
+	fput_light(output_file, fput_needed);
+	return ret;
+}
+
 /**
  * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
  *
@@ -4205,15 +4317,15 @@ SYSCALL_DEFINE5(perf_counter_open,
 	struct file *group_file = NULL;
 	int fput_needed = 0;
 	int fput_needed2 = 0;
-	int ret;
+	int err;
 
 	/* for future expandability... */
-	if (flags)
+	if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
 		return -EINVAL;
 
-	ret = perf_copy_attr(attr_uptr, &attr);
-	if (ret)
-		return ret;
+	err = perf_copy_attr(attr_uptr, &attr);
+	if (err)
+		return err;
 
 	if (!attr.exclude_kernel) {
 		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
@@ -4236,8 +4348,8 @@ SYSCALL_DEFINE5(perf_counter_open,
 	 * Look up the group leader (we will attach this counter to it):
 	 */
 	group_leader = NULL;
-	if (group_fd != -1) {
-		ret = -EINVAL;
+	if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
+		err = -EINVAL;
 		group_file = fget_light(group_fd, &fput_needed);
 		if (!group_file)
 			goto err_put_context;
@@ -4266,18 +4378,24 @@ SYSCALL_DEFINE5(perf_counter_open,
 
 	counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
 				     NULL, GFP_KERNEL);
-	ret = PTR_ERR(counter);
+	err = PTR_ERR(counter);
 	if (IS_ERR(counter))
 		goto err_put_context;
 
-	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
-	if (ret < 0)
+	err = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
+	if (err < 0)
 		goto err_free_put_context;
 
-	counter_file = fget_light(ret, &fput_needed2);
+	counter_file = fget_light(err, &fput_needed2);
 	if (!counter_file)
 		goto err_free_put_context;
 
+	if (flags & PERF_FLAG_FD_OUTPUT) {
+		err = perf_counter_set_output(counter, group_fd);
+		if (err)
+			goto err_fput_free_put_context;
+	}
+
 	counter->filp = counter_file;
 	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
@@ -4291,20 +4409,20 @@ SYSCALL_DEFINE5(perf_counter_open,
 	list_add_tail(&counter->owner_entry, &current->perf_counter_list);
 	mutex_unlock(&current->perf_counter_mutex);
 
+err_fput_free_put_context:
 	fput_light(counter_file, fput_needed2);
 
-out_fput:
-	fput_light(group_file, fput_needed);
-
-	return ret;
-
 err_free_put_context:
-	kfree(counter);
+	if (err < 0)
+		kfree(counter);
 
 err_put_context:
-	put_ctx(ctx);
+	if (err < 0)
+		put_ctx(ctx);
+
+	fput_light(group_file, fput_needed);
 
-	goto out_fput;
+	return err;
 }
 
 /*
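
Note on usage (not part of the patch): the new PERF_COUNTER_IOC_SET_OUTPUT ioctl and PERF_FLAG_FD_OUTPUT open flag let one counter emit its samples into another counter's ring buffer, while perf_counter_set_output() and perf_mmap() refuse combinations where a counter would have both its own buffer and a redirection target. The userspace sketch below is illustrative only; it assumes the perf_counter ABI of this kernel series (struct perf_counter_attr and the PERF_* constants from linux/perf_counter.h, an architecture-specific __NR_perf_counter_open syscall number), and perf_counter_open() here is a local wrapper, not a libc function.

/*
 * Illustrative only -- not part of the patch. Redirect a second
 * counter's samples into the first counter's mmap buffer so a single
 * reader loop can consume both. Error handling is omitted.
 */
#include <linux/perf_counter.h>	/* attr layout, PERF_* constants, ioctls */
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

/* local wrapper: there is no libc stub for this syscall */
static int perf_counter_open(struct perf_counter_attr *attr, pid_t pid,
			     int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_counter_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_counter_attr attr;
	int cycles_fd, insns_fd;
	void *buf;

	memset(&attr, 0, sizeof(attr));
	attr.size          = sizeof(attr);
	attr.type          = PERF_TYPE_HARDWARE;
	attr.config        = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type   = PERF_SAMPLE_IP;

	/* the counter that owns the ring buffer */
	cycles_fd = perf_counter_open(&attr, 0, -1, -1, 0);
	buf = mmap(NULL, (1 + 8) * 4096, PROT_READ | PROT_WRITE,
		   MAP_SHARED, cycles_fd, 0);

	/*
	 * Second counter; it must not be mmap()ed itself, since
	 * perf_counter_set_output() rejects counters that already have
	 * their own buffer, and perf_mmap() rejects counters whose
	 * output has been redirected.
	 */
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	insns_fd = perf_counter_open(&attr, 0, -1, -1, 0);

	/* route insns_fd's samples into cycles_fd's buffer */
	ioctl(insns_fd, PERF_COUNTER_IOC_SET_OUTPUT, cycles_fd);

	/*
	 * Equivalent at open time, reusing the group_fd argument as the
	 * output target without creating a group:
	 *
	 *	insns_fd = perf_counter_open(&attr, 0, -1, cycles_fd,
	 *			PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT);
	 */

	/* ... parse sample records out of buf here ... */
	(void)buf;
	return 0;
}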
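
The paranoia changes at the top of the diff raise the default of sysctl_perf_counter_paranoid to 1, add a -1 "not paranoid at all" level, and gate unprivileged PERF_SAMPLE_RAW on tracepoint counters behind the new perf_paranoid_tracepoint_raw() check. A minimal userspace sketch of probing that setting follows; it assumes the sysctl is exposed as /proc/sys/kernel/perf_counter_paranoid, which is outside the scope of this diff.

/*
 * Illustrative only -- not part of the patch. Mirror the kernel's
 * perf_paranoid_tracepoint_raw() check from userspace to predict
 * whether an unprivileged PERF_SAMPLE_RAW tracepoint counter will be
 * refused with EPERM. The sysctl path is an assumption.
 */
#include <stdio.h>

static int perf_counter_paranoid(void)
{
	FILE *f = fopen("/proc/sys/kernel/perf_counter_paranoid", "r");
	int level = 2;	/* if unreadable, assume the most restrictive level */

	if (f) {
		if (fscanf(f, "%d", &level) != 1)
			level = 2;
		fclose(f);
	}
	return level;
}

int main(void)
{
	/* kernel side: raw tracepoint data needs level <= -1 or CAP_SYS_ADMIN */
	if (perf_counter_paranoid() > -1)
		printf("PERF_SAMPLE_RAW on tracepoints needs CAP_SYS_ADMIN here\n");
	else
		printf("PERF_SAMPLE_RAW on tracepoints is open to unprivileged users\n");
	return 0;
}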