diff options
author | Peter Zijlstra <a.p.zijlstra@chello.nl> | 2009-03-23 13:22:10 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2009-04-06 03:30:27 -0400 |
commit | 7b732a75047738e4f85438ed2f9cd34bf5f2a19a (patch) | |
tree | bae36de785ac819ceef6fa5e1b7884a4a421cc3c | |
parent | b09d2501ed3d294619cbfbcf828ad39324d0e548 (diff) |
perf_counter: new output ABI - part 1
Impact: Rework the perfcounter output ABI
Use sys_read() only for instant data and provide mmap() output for all
async overflow data.
The first mmap() determines the size of the output buffer. The mmap()
size must be (1 + pages) * PAGE_SIZE, where pages must be 0 or a
power of 2. Further mmap()s of the same fd must have the same
size. Once all maps are gone, you can again mmap() with a new size.
In case of 0 extra pages there is no data output and the first page
only contains meta data.
When there are data pages, a poll() event will be generated for each
full page of data. Furthermore, the output is circular. This means
that although 1 page is a valid configuration, it's useless, since
we'll start overwriting it the instant we report a full page.
Future work will focus on the output format (currently maintained)
where we'll likely want each entry denoted by a header which includes a
type and length.
Further future work will make it possible to splice() the fd, also containing the
async overflow data -- splice() would be mutually exclusive with
mmap() of the data.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090323172417.470536358@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r-- | arch/powerpc/kernel/perf_counter.c | 9 | ||||
-rw-r--r-- | include/linux/perf_counter.h | 36 | ||||
-rw-r--r-- | kernel/perf_counter.c | 464 |
3 files changed, 263 insertions, 246 deletions
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index e4349281b07d..d48596ab6557 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c | |||
@@ -417,8 +417,7 @@ void hw_perf_restore(u64 disable) | |||
417 | atomic64_set(&counter->hw.prev_count, val); | 417 | atomic64_set(&counter->hw.prev_count, val); |
418 | counter->hw.idx = hwc_index[i] + 1; | 418 | counter->hw.idx = hwc_index[i] + 1; |
419 | write_pmc(counter->hw.idx, val); | 419 | write_pmc(counter->hw.idx, val); |
420 | if (counter->user_page) | 420 | perf_counter_update_userpage(counter); |
421 | perf_counter_update_userpage(counter); | ||
422 | } | 421 | } |
423 | mb(); | 422 | mb(); |
424 | cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; | 423 | cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; |
@@ -574,8 +573,7 @@ static void power_perf_disable(struct perf_counter *counter) | |||
574 | ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr); | 573 | ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr); |
575 | write_pmc(counter->hw.idx, 0); | 574 | write_pmc(counter->hw.idx, 0); |
576 | counter->hw.idx = 0; | 575 | counter->hw.idx = 0; |
577 | if (counter->user_page) | 576 | perf_counter_update_userpage(counter); |
578 | perf_counter_update_userpage(counter); | ||
579 | break; | 577 | break; |
580 | } | 578 | } |
581 | } | 579 | } |
@@ -702,8 +700,7 @@ static void record_and_restart(struct perf_counter *counter, long val, | |||
702 | write_pmc(counter->hw.idx, val); | 700 | write_pmc(counter->hw.idx, val); |
703 | atomic64_set(&counter->hw.prev_count, val); | 701 | atomic64_set(&counter->hw.prev_count, val); |
704 | atomic64_set(&counter->hw.period_left, left); | 702 | atomic64_set(&counter->hw.period_left, left); |
705 | if (counter->user_page) | 703 | perf_counter_update_userpage(counter); |
706 | perf_counter_update_userpage(counter); | ||
707 | 704 | ||
708 | /* | 705 | /* |
709 | * Finally record data if requested. | 706 | * Finally record data if requested. |
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 40b324e91bf6..2b5e66d5ebdf 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h | |||
@@ -152,6 +152,8 @@ struct perf_counter_mmap_page { | |||
152 | __u32 lock; /* seqlock for synchronization */ | 152 | __u32 lock; /* seqlock for synchronization */ |
153 | __u32 index; /* hardware counter identifier */ | 153 | __u32 index; /* hardware counter identifier */ |
154 | __s64 offset; /* add to hardware counter value */ | 154 | __s64 offset; /* add to hardware counter value */ |
155 | |||
156 | __u32 data_head; /* head in the data section */ | ||
155 | }; | 157 | }; |
156 | 158 | ||
157 | #ifdef __KERNEL__ | 159 | #ifdef __KERNEL__ |
@@ -218,21 +220,6 @@ struct hw_perf_counter { | |||
218 | #endif | 220 | #endif |
219 | }; | 221 | }; |
220 | 222 | ||
221 | /* | ||
222 | * Hardcoded buffer length limit for now, for IRQ-fed events: | ||
223 | */ | ||
224 | #define PERF_DATA_BUFLEN 2048 | ||
225 | |||
226 | /** | ||
227 | * struct perf_data - performance counter IRQ data sampling ... | ||
228 | */ | ||
229 | struct perf_data { | ||
230 | int len; | ||
231 | int rd_idx; | ||
232 | int overrun; | ||
233 | u8 data[PERF_DATA_BUFLEN]; | ||
234 | }; | ||
235 | |||
236 | struct perf_counter; | 223 | struct perf_counter; |
237 | 224 | ||
238 | /** | 225 | /** |
@@ -256,6 +243,14 @@ enum perf_counter_active_state { | |||
256 | 243 | ||
257 | struct file; | 244 | struct file; |
258 | 245 | ||
246 | struct perf_mmap_data { | ||
247 | struct rcu_head rcu_head; | ||
248 | int nr_pages; | ||
249 | atomic_t head; | ||
250 | struct perf_counter_mmap_page *user_page; | ||
251 | void *data_pages[0]; | ||
252 | }; | ||
253 | |||
259 | /** | 254 | /** |
260 | * struct perf_counter - performance counter kernel representation: | 255 | * struct perf_counter - performance counter kernel representation: |
261 | */ | 256 | */ |
@@ -289,16 +284,15 @@ struct perf_counter { | |||
289 | int oncpu; | 284 | int oncpu; |
290 | int cpu; | 285 | int cpu; |
291 | 286 | ||
292 | /* pointer to page shared with userspace via mmap */ | 287 | /* mmap bits */ |
293 | unsigned long user_page; | 288 | struct mutex mmap_mutex; |
289 | atomic_t mmap_count; | ||
290 | struct perf_mmap_data *data; | ||
294 | 291 | ||
295 | /* read() / irq related data */ | 292 | /* poll related */ |
296 | wait_queue_head_t waitq; | 293 | wait_queue_head_t waitq; |
297 | /* optional: for NMIs */ | 294 | /* optional: for NMIs */ |
298 | int wakeup_pending; | 295 | int wakeup_pending; |
299 | struct perf_data *irqdata; | ||
300 | struct perf_data *usrdata; | ||
301 | struct perf_data data[2]; | ||
302 | 296 | ||
303 | void (*destroy)(struct perf_counter *); | 297 | void (*destroy)(struct perf_counter *); |
304 | struct rcu_head rcu_head; | 298 | struct rcu_head rcu_head; |
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d9cfd902140e..0dfe91094fd1 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c | |||
@@ -4,7 +4,8 @@ | |||
4 | * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de> | 4 | * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de> |
5 | * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar | 5 | * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar |
6 | * | 6 | * |
7 | * For licencing details see kernel-base/COPYING | 7 | * |
8 | * For licensing details see kernel-base/COPYING | ||
8 | */ | 9 | */ |
9 | 10 | ||
10 | #include <linux/fs.h> | 11 | #include <linux/fs.h> |
@@ -1022,66 +1023,6 @@ static u64 perf_counter_read(struct perf_counter *counter) | |||
1022 | return atomic64_read(&counter->count); | 1023 | return atomic64_read(&counter->count); |
1023 | } | 1024 | } |
1024 | 1025 | ||
1025 | /* | ||
1026 | * Cross CPU call to switch performance data pointers | ||
1027 | */ | ||
1028 | static void __perf_switch_irq_data(void *info) | ||
1029 | { | ||
1030 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
1031 | struct perf_counter *counter = info; | ||
1032 | struct perf_counter_context *ctx = counter->ctx; | ||
1033 | struct perf_data *oldirqdata = counter->irqdata; | ||
1034 | |||
1035 | /* | ||
1036 | * If this is a task context, we need to check whether it is | ||
1037 | * the current task context of this cpu. If not it has been | ||
1038 | * scheduled out before the smp call arrived. | ||
1039 | */ | ||
1040 | if (ctx->task) { | ||
1041 | if (cpuctx->task_ctx != ctx) | ||
1042 | return; | ||
1043 | spin_lock(&ctx->lock); | ||
1044 | } | ||
1045 | |||
1046 | /* Change the pointer NMI safe */ | ||
1047 | atomic_long_set((atomic_long_t *)&counter->irqdata, | ||
1048 | (unsigned long) counter->usrdata); | ||
1049 | counter->usrdata = oldirqdata; | ||
1050 | |||
1051 | if (ctx->task) | ||
1052 | spin_unlock(&ctx->lock); | ||
1053 | } | ||
1054 | |||
1055 | static struct perf_data *perf_switch_irq_data(struct perf_counter *counter) | ||
1056 | { | ||
1057 | struct perf_counter_context *ctx = counter->ctx; | ||
1058 | struct perf_data *oldirqdata = counter->irqdata; | ||
1059 | struct task_struct *task = ctx->task; | ||
1060 | |||
1061 | if (!task) { | ||
1062 | smp_call_function_single(counter->cpu, | ||
1063 | __perf_switch_irq_data, | ||
1064 | counter, 1); | ||
1065 | return counter->usrdata; | ||
1066 | } | ||
1067 | |||
1068 | retry: | ||
1069 | spin_lock_irq(&ctx->lock); | ||
1070 | if (counter->state != PERF_COUNTER_STATE_ACTIVE) { | ||
1071 | counter->irqdata = counter->usrdata; | ||
1072 | counter->usrdata = oldirqdata; | ||
1073 | spin_unlock_irq(&ctx->lock); | ||
1074 | return oldirqdata; | ||
1075 | } | ||
1076 | spin_unlock_irq(&ctx->lock); | ||
1077 | task_oncpu_function_call(task, __perf_switch_irq_data, counter); | ||
1078 | /* Might have failed, because task was scheduled out */ | ||
1079 | if (counter->irqdata == oldirqdata) | ||
1080 | goto retry; | ||
1081 | |||
1082 | return counter->usrdata; | ||
1083 | } | ||
1084 | |||
1085 | static void put_context(struct perf_counter_context *ctx) | 1026 | static void put_context(struct perf_counter_context *ctx) |
1086 | { | 1027 | { |
1087 | if (ctx->task) | 1028 | if (ctx->task) |
@@ -1177,7 +1118,6 @@ static int perf_release(struct inode *inode, struct file *file) | |||
1177 | mutex_unlock(&counter->mutex); | 1118 | mutex_unlock(&counter->mutex); |
1178 | mutex_unlock(&ctx->mutex); | 1119 | mutex_unlock(&ctx->mutex); |
1179 | 1120 | ||
1180 | free_page(counter->user_page); | ||
1181 | free_counter(counter); | 1121 | free_counter(counter); |
1182 | put_context(ctx); | 1122 | put_context(ctx); |
1183 | 1123 | ||
@@ -1192,7 +1132,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) | |||
1192 | { | 1132 | { |
1193 | u64 cntval; | 1133 | u64 cntval; |
1194 | 1134 | ||
1195 | if (count != sizeof(cntval)) | 1135 | if (count < sizeof(cntval)) |
1196 | return -EINVAL; | 1136 | return -EINVAL; |
1197 | 1137 | ||
1198 | /* | 1138 | /* |
@@ -1211,121 +1151,20 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) | |||
1211 | } | 1151 | } |
1212 | 1152 | ||
1213 | static ssize_t | 1153 | static ssize_t |
1214 | perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count) | ||
1215 | { | ||
1216 | if (!usrdata->len) | ||
1217 | return 0; | ||
1218 | |||
1219 | count = min(count, (size_t)usrdata->len); | ||
1220 | if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count)) | ||
1221 | return -EFAULT; | ||
1222 | |||
1223 | /* Adjust the counters */ | ||
1224 | usrdata->len -= count; | ||
1225 | if (!usrdata->len) | ||
1226 | usrdata->rd_idx = 0; | ||
1227 | else | ||
1228 | usrdata->rd_idx += count; | ||
1229 | |||
1230 | return count; | ||
1231 | } | ||
1232 | |||
1233 | static ssize_t | ||
1234 | perf_read_irq_data(struct perf_counter *counter, | ||
1235 | char __user *buf, | ||
1236 | size_t count, | ||
1237 | int nonblocking) | ||
1238 | { | ||
1239 | struct perf_data *irqdata, *usrdata; | ||
1240 | DECLARE_WAITQUEUE(wait, current); | ||
1241 | ssize_t res, res2; | ||
1242 | |||
1243 | irqdata = counter->irqdata; | ||
1244 | usrdata = counter->usrdata; | ||
1245 | |||
1246 | if (usrdata->len + irqdata->len >= count) | ||
1247 | goto read_pending; | ||
1248 | |||
1249 | if (nonblocking) | ||
1250 | return -EAGAIN; | ||
1251 | |||
1252 | spin_lock_irq(&counter->waitq.lock); | ||
1253 | __add_wait_queue(&counter->waitq, &wait); | ||
1254 | for (;;) { | ||
1255 | set_current_state(TASK_INTERRUPTIBLE); | ||
1256 | if (usrdata->len + irqdata->len >= count) | ||
1257 | break; | ||
1258 | |||
1259 | if (signal_pending(current)) | ||
1260 | break; | ||
1261 | |||
1262 | if (counter->state == PERF_COUNTER_STATE_ERROR) | ||
1263 | break; | ||
1264 | |||
1265 | spin_unlock_irq(&counter->waitq.lock); | ||
1266 | schedule(); | ||
1267 | spin_lock_irq(&counter->waitq.lock); | ||
1268 | } | ||
1269 | __remove_wait_queue(&counter->waitq, &wait); | ||
1270 | __set_current_state(TASK_RUNNING); | ||
1271 | spin_unlock_irq(&counter->waitq.lock); | ||
1272 | |||
1273 | if (usrdata->len + irqdata->len < count && | ||
1274 | counter->state != PERF_COUNTER_STATE_ERROR) | ||
1275 | return -ERESTARTSYS; | ||
1276 | read_pending: | ||
1277 | mutex_lock(&counter->mutex); | ||
1278 | |||
1279 | /* Drain pending data first: */ | ||
1280 | res = perf_copy_usrdata(usrdata, buf, count); | ||
1281 | if (res < 0 || res == count) | ||
1282 | goto out; | ||
1283 | |||
1284 | /* Switch irq buffer: */ | ||
1285 | usrdata = perf_switch_irq_data(counter); | ||
1286 | res2 = perf_copy_usrdata(usrdata, buf + res, count - res); | ||
1287 | if (res2 < 0) { | ||
1288 | if (!res) | ||
1289 | res = -EFAULT; | ||
1290 | } else { | ||
1291 | res += res2; | ||
1292 | } | ||
1293 | out: | ||
1294 | mutex_unlock(&counter->mutex); | ||
1295 | |||
1296 | return res; | ||
1297 | } | ||
1298 | |||
1299 | static ssize_t | ||
1300 | perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) | 1154 | perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) |
1301 | { | 1155 | { |
1302 | struct perf_counter *counter = file->private_data; | 1156 | struct perf_counter *counter = file->private_data; |
1303 | 1157 | ||
1304 | switch (counter->hw_event.record_type) { | 1158 | return perf_read_hw(counter, buf, count); |
1305 | case PERF_RECORD_SIMPLE: | ||
1306 | return perf_read_hw(counter, buf, count); | ||
1307 | |||
1308 | case PERF_RECORD_IRQ: | ||
1309 | case PERF_RECORD_GROUP: | ||
1310 | return perf_read_irq_data(counter, buf, count, | ||
1311 | file->f_flags & O_NONBLOCK); | ||
1312 | } | ||
1313 | return -EINVAL; | ||
1314 | } | 1159 | } |
1315 | 1160 | ||
1316 | static unsigned int perf_poll(struct file *file, poll_table *wait) | 1161 | static unsigned int perf_poll(struct file *file, poll_table *wait) |
1317 | { | 1162 | { |
1318 | struct perf_counter *counter = file->private_data; | 1163 | struct perf_counter *counter = file->private_data; |
1319 | unsigned int events = 0; | 1164 | unsigned int events = POLLIN; |
1320 | unsigned long flags; | ||
1321 | 1165 | ||
1322 | poll_wait(file, &counter->waitq, wait); | 1166 | poll_wait(file, &counter->waitq, wait); |
1323 | 1167 | ||
1324 | spin_lock_irqsave(&counter->waitq.lock, flags); | ||
1325 | if (counter->usrdata->len || counter->irqdata->len) | ||
1326 | events |= POLLIN; | ||
1327 | spin_unlock_irqrestore(&counter->waitq.lock, flags); | ||
1328 | |||
1329 | return events; | 1168 | return events; |
1330 | } | 1169 | } |
1331 | 1170 | ||
@@ -1347,78 +1186,207 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | |||
1347 | return err; | 1186 | return err; |
1348 | } | 1187 | } |
1349 | 1188 | ||
1350 | void perf_counter_update_userpage(struct perf_counter *counter) | 1189 | static void __perf_counter_update_userpage(struct perf_counter *counter, |
1190 | struct perf_mmap_data *data) | ||
1351 | { | 1191 | { |
1352 | struct perf_counter_mmap_page *userpg; | 1192 | struct perf_counter_mmap_page *userpg = data->user_page; |
1353 | |||
1354 | if (!counter->user_page) | ||
1355 | return; | ||
1356 | userpg = (struct perf_counter_mmap_page *) counter->user_page; | ||
1357 | 1193 | ||
1194 | /* | ||
1195 | * Disable preemption so as to not let the corresponding user-space | ||
1196 | * spin too long if we get preempted. | ||
1197 | */ | ||
1198 | preempt_disable(); | ||
1358 | ++userpg->lock; | 1199 | ++userpg->lock; |
1359 | smp_wmb(); | 1200 | smp_wmb(); |
1360 | userpg->index = counter->hw.idx; | 1201 | userpg->index = counter->hw.idx; |
1361 | userpg->offset = atomic64_read(&counter->count); | 1202 | userpg->offset = atomic64_read(&counter->count); |
1362 | if (counter->state == PERF_COUNTER_STATE_ACTIVE) | 1203 | if (counter->state == PERF_COUNTER_STATE_ACTIVE) |
1363 | userpg->offset -= atomic64_read(&counter->hw.prev_count); | 1204 | userpg->offset -= atomic64_read(&counter->hw.prev_count); |
1205 | |||
1206 | userpg->data_head = atomic_read(&data->head); | ||
1364 | smp_wmb(); | 1207 | smp_wmb(); |
1365 | ++userpg->lock; | 1208 | ++userpg->lock; |
1209 | preempt_enable(); | ||
1210 | } | ||
1211 | |||
1212 | void perf_counter_update_userpage(struct perf_counter *counter) | ||
1213 | { | ||
1214 | struct perf_mmap_data *data; | ||
1215 | |||
1216 | rcu_read_lock(); | ||
1217 | data = rcu_dereference(counter->data); | ||
1218 | if (data) | ||
1219 | __perf_counter_update_userpage(counter, data); | ||
1220 | rcu_read_unlock(); | ||
1366 | } | 1221 | } |
1367 | 1222 | ||
1368 | static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | 1223 | static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) |
1369 | { | 1224 | { |
1370 | struct perf_counter *counter = vma->vm_file->private_data; | 1225 | struct perf_counter *counter = vma->vm_file->private_data; |
1226 | struct perf_mmap_data *data; | ||
1227 | int ret = VM_FAULT_SIGBUS; | ||
1371 | 1228 | ||
1372 | if (!counter->user_page) | 1229 | rcu_read_lock(); |
1373 | return VM_FAULT_SIGBUS; | 1230 | data = rcu_dereference(counter->data); |
1231 | if (!data) | ||
1232 | goto unlock; | ||
1233 | |||
1234 | if (vmf->pgoff == 0) { | ||
1235 | vmf->page = virt_to_page(data->user_page); | ||
1236 | } else { | ||
1237 | int nr = vmf->pgoff - 1; | ||
1374 | 1238 | ||
1375 | vmf->page = virt_to_page(counter->user_page); | 1239 | if ((unsigned)nr > data->nr_pages) |
1240 | goto unlock; | ||
1241 | |||
1242 | vmf->page = virt_to_page(data->data_pages[nr]); | ||
1243 | } | ||
1376 | get_page(vmf->page); | 1244 | get_page(vmf->page); |
1245 | ret = 0; | ||
1246 | unlock: | ||
1247 | rcu_read_unlock(); | ||
1248 | |||
1249 | return ret; | ||
1250 | } | ||
1251 | |||
1252 | static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages) | ||
1253 | { | ||
1254 | struct perf_mmap_data *data; | ||
1255 | unsigned long size; | ||
1256 | int i; | ||
1257 | |||
1258 | WARN_ON(atomic_read(&counter->mmap_count)); | ||
1259 | |||
1260 | size = sizeof(struct perf_mmap_data); | ||
1261 | size += nr_pages * sizeof(void *); | ||
1262 | |||
1263 | data = kzalloc(size, GFP_KERNEL); | ||
1264 | if (!data) | ||
1265 | goto fail; | ||
1266 | |||
1267 | data->user_page = (void *)get_zeroed_page(GFP_KERNEL); | ||
1268 | if (!data->user_page) | ||
1269 | goto fail_user_page; | ||
1270 | |||
1271 | for (i = 0; i < nr_pages; i++) { | ||
1272 | data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); | ||
1273 | if (!data->data_pages[i]) | ||
1274 | goto fail_data_pages; | ||
1275 | } | ||
1276 | |||
1277 | data->nr_pages = nr_pages; | ||
1278 | |||
1279 | rcu_assign_pointer(counter->data, data); | ||
1280 | |||
1377 | return 0; | 1281 | return 0; |
1282 | |||
1283 | fail_data_pages: | ||
1284 | for (i--; i >= 0; i--) | ||
1285 | free_page((unsigned long)data->data_pages[i]); | ||
1286 | |||
1287 | free_page((unsigned long)data->user_page); | ||
1288 | |||
1289 | fail_user_page: | ||
1290 | kfree(data); | ||
1291 | |||
1292 | fail: | ||
1293 | return -ENOMEM; | ||
1294 | } | ||
1295 | |||
1296 | static void __perf_mmap_data_free(struct rcu_head *rcu_head) | ||
1297 | { | ||
1298 | struct perf_mmap_data *data = container_of(rcu_head, | ||
1299 | struct perf_mmap_data, rcu_head); | ||
1300 | int i; | ||
1301 | |||
1302 | free_page((unsigned long)data->user_page); | ||
1303 | for (i = 0; i < data->nr_pages; i++) | ||
1304 | free_page((unsigned long)data->data_pages[i]); | ||
1305 | kfree(data); | ||
1306 | } | ||
1307 | |||
1308 | static void perf_mmap_data_free(struct perf_counter *counter) | ||
1309 | { | ||
1310 | struct perf_mmap_data *data = counter->data; | ||
1311 | |||
1312 | WARN_ON(atomic_read(&counter->mmap_count)); | ||
1313 | |||
1314 | rcu_assign_pointer(counter->data, NULL); | ||
1315 | call_rcu(&data->rcu_head, __perf_mmap_data_free); | ||
1316 | } | ||
1317 | |||
1318 | static void perf_mmap_open(struct vm_area_struct *vma) | ||
1319 | { | ||
1320 | struct perf_counter *counter = vma->vm_file->private_data; | ||
1321 | |||
1322 | atomic_inc(&counter->mmap_count); | ||
1323 | } | ||
1324 | |||
1325 | static void perf_mmap_close(struct vm_area_struct *vma) | ||
1326 | { | ||
1327 | struct perf_counter *counter = vma->vm_file->private_data; | ||
1328 | |||
1329 | if (atomic_dec_and_mutex_lock(&counter->mmap_count, | ||
1330 | &counter->mmap_mutex)) { | ||
1331 | perf_mmap_data_free(counter); | ||
1332 | mutex_unlock(&counter->mmap_mutex); | ||
1333 | } | ||
1378 | } | 1334 | } |
1379 | 1335 | ||
1380 | static struct vm_operations_struct perf_mmap_vmops = { | 1336 | static struct vm_operations_struct perf_mmap_vmops = { |
1337 | .open = perf_mmap_open, | ||
1338 | .close = perf_mmap_close, | ||
1381 | .fault = perf_mmap_fault, | 1339 | .fault = perf_mmap_fault, |
1382 | }; | 1340 | }; |
1383 | 1341 | ||
1384 | static int perf_mmap(struct file *file, struct vm_area_struct *vma) | 1342 | static int perf_mmap(struct file *file, struct vm_area_struct *vma) |
1385 | { | 1343 | { |
1386 | struct perf_counter *counter = file->private_data; | 1344 | struct perf_counter *counter = file->private_data; |
1387 | unsigned long userpg; | 1345 | unsigned long vma_size; |
1346 | unsigned long nr_pages; | ||
1347 | unsigned long locked, lock_limit; | ||
1348 | int ret = 0; | ||
1388 | 1349 | ||
1389 | if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) | 1350 | if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) |
1390 | return -EINVAL; | 1351 | return -EINVAL; |
1391 | if (vma->vm_end - vma->vm_start != PAGE_SIZE) | 1352 | |
1353 | vma_size = vma->vm_end - vma->vm_start; | ||
1354 | nr_pages = (vma_size / PAGE_SIZE) - 1; | ||
1355 | |||
1356 | if (nr_pages == 0 || !is_power_of_2(nr_pages)) | ||
1392 | return -EINVAL; | 1357 | return -EINVAL; |
1393 | 1358 | ||
1394 | /* | 1359 | if (vma_size != PAGE_SIZE * (1 + nr_pages)) |
1395 | * For now, restrict to the case of a hardware counter | ||
1396 | * on the current task. | ||
1397 | */ | ||
1398 | if (is_software_counter(counter) || counter->task != current) | ||
1399 | return -EINVAL; | 1360 | return -EINVAL; |
1400 | 1361 | ||
1401 | userpg = counter->user_page; | 1362 | if (vma->vm_pgoff != 0) |
1402 | if (!userpg) { | 1363 | return -EINVAL; |
1403 | userpg = get_zeroed_page(GFP_KERNEL); | 1364 | |
1404 | mutex_lock(&counter->mutex); | 1365 | locked = vma_size >> PAGE_SHIFT; |
1405 | if (counter->user_page) { | 1366 | locked += vma->vm_mm->locked_vm; |
1406 | free_page(userpg); | ||
1407 | userpg = counter->user_page; | ||
1408 | } else { | ||
1409 | counter->user_page = userpg; | ||
1410 | } | ||
1411 | mutex_unlock(&counter->mutex); | ||
1412 | if (!userpg) | ||
1413 | return -ENOMEM; | ||
1414 | } | ||
1415 | 1367 | ||
1416 | perf_counter_update_userpage(counter); | 1368 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; |
1369 | lock_limit >>= PAGE_SHIFT; | ||
1370 | |||
1371 | if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) | ||
1372 | return -EPERM; | ||
1373 | |||
1374 | mutex_lock(&counter->mmap_mutex); | ||
1375 | if (atomic_inc_not_zero(&counter->mmap_count)) | ||
1376 | goto out; | ||
1377 | |||
1378 | WARN_ON(counter->data); | ||
1379 | ret = perf_mmap_data_alloc(counter, nr_pages); | ||
1380 | if (!ret) | ||
1381 | atomic_set(&counter->mmap_count, 1); | ||
1382 | out: | ||
1383 | mutex_unlock(&counter->mmap_mutex); | ||
1417 | 1384 | ||
1418 | vma->vm_flags &= ~VM_MAYWRITE; | 1385 | vma->vm_flags &= ~VM_MAYWRITE; |
1419 | vma->vm_flags |= VM_RESERVED; | 1386 | vma->vm_flags |= VM_RESERVED; |
1420 | vma->vm_ops = &perf_mmap_vmops; | 1387 | vma->vm_ops = &perf_mmap_vmops; |
1421 | return 0; | 1388 | |
1389 | return ret; | ||
1422 | } | 1390 | } |
1423 | 1391 | ||
1424 | static const struct file_operations perf_fops = { | 1392 | static const struct file_operations perf_fops = { |
@@ -1434,30 +1402,94 @@ static const struct file_operations perf_fops = { | |||
1434 | * Output | 1402 | * Output |
1435 | */ | 1403 | */ |
1436 | 1404 | ||
1437 | static void perf_counter_store_irq(struct perf_counter *counter, u64 data) | 1405 | static int perf_output_write(struct perf_counter *counter, int nmi, |
1406 | void *buf, ssize_t size) | ||
1438 | { | 1407 | { |
1439 | struct perf_data *irqdata = counter->irqdata; | 1408 | struct perf_mmap_data *data; |
1409 | unsigned int offset, head, nr; | ||
1410 | unsigned int len; | ||
1411 | int ret, wakeup; | ||
1440 | 1412 | ||
1441 | if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { | 1413 | rcu_read_lock(); |
1442 | irqdata->overrun++; | 1414 | ret = -ENOSPC; |
1443 | } else { | 1415 | data = rcu_dereference(counter->data); |
1444 | u64 *p = (u64 *) &irqdata->data[irqdata->len]; | 1416 | if (!data) |
1417 | goto out; | ||
1418 | |||
1419 | if (!data->nr_pages) | ||
1420 | goto out; | ||
1421 | |||
1422 | ret = -EINVAL; | ||
1423 | if (size > PAGE_SIZE) | ||
1424 | goto out; | ||
1425 | |||
1426 | do { | ||
1427 | offset = head = atomic_read(&data->head); | ||
1428 | head += sizeof(u64); | ||
1429 | } while (atomic_cmpxchg(&data->head, offset, head) != offset); | ||
1430 | |||
1431 | wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); | ||
1445 | 1432 | ||
1446 | *p = data; | 1433 | nr = (offset >> PAGE_SHIFT) & (data->nr_pages - 1); |
1447 | irqdata->len += sizeof(u64); | 1434 | offset &= PAGE_SIZE - 1; |
1435 | |||
1436 | len = min_t(unsigned int, PAGE_SIZE - offset, size); | ||
1437 | memcpy(data->data_pages[nr] + offset, buf, len); | ||
1438 | size -= len; | ||
1439 | |||
1440 | if (size) { | ||
1441 | nr = (nr + 1) & (data->nr_pages - 1); | ||
1442 | memcpy(data->data_pages[nr], buf + len, size); | ||
1443 | } | ||
1444 | |||
1445 | /* | ||
1446 | * generate a poll() wakeup for every page boundary crossed | ||
1447 | */ | ||
1448 | if (wakeup) { | ||
1449 | __perf_counter_update_userpage(counter, data); | ||
1450 | if (nmi) { | ||
1451 | counter->wakeup_pending = 1; | ||
1452 | set_perf_counter_pending(); | ||
1453 | } else | ||
1454 | wake_up(&counter->waitq); | ||
1448 | } | 1455 | } |
1456 | ret = 0; | ||
1457 | out: | ||
1458 | rcu_read_unlock(); | ||
1459 | |||
1460 | return ret; | ||
1449 | } | 1461 | } |
1450 | 1462 | ||
1451 | static void perf_counter_handle_group(struct perf_counter *counter) | 1463 | static void perf_output_simple(struct perf_counter *counter, |
1464 | int nmi, struct pt_regs *regs) | ||
1465 | { | ||
1466 | u64 entry; | ||
1467 | |||
1468 | entry = instruction_pointer(regs); | ||
1469 | |||
1470 | perf_output_write(counter, nmi, &entry, sizeof(entry)); | ||
1471 | } | ||
1472 | |||
1473 | struct group_entry { | ||
1474 | u64 event; | ||
1475 | u64 counter; | ||
1476 | }; | ||
1477 | |||
1478 | static void perf_output_group(struct perf_counter *counter, int nmi) | ||
1452 | { | 1479 | { |
1453 | struct perf_counter *leader, *sub; | 1480 | struct perf_counter *leader, *sub; |
1454 | 1481 | ||
1455 | leader = counter->group_leader; | 1482 | leader = counter->group_leader; |
1456 | list_for_each_entry(sub, &leader->sibling_list, list_entry) { | 1483 | list_for_each_entry(sub, &leader->sibling_list, list_entry) { |
1484 | struct group_entry entry; | ||
1485 | |||
1457 | if (sub != counter) | 1486 | if (sub != counter) |
1458 | sub->hw_ops->read(sub); | 1487 | sub->hw_ops->read(sub); |
1459 | perf_counter_store_irq(counter, sub->hw_event.config); | 1488 | |
1460 | perf_counter_store_irq(counter, atomic64_read(&sub->count)); | 1489 | entry.event = sub->hw_event.config; |
1490 | entry.counter = atomic64_read(&sub->count); | ||
1491 | |||
1492 | perf_output_write(counter, nmi, &entry, sizeof(entry)); | ||
1461 | } | 1493 | } |
1462 | } | 1494 | } |
1463 | 1495 | ||
@@ -1469,19 +1501,13 @@ void perf_counter_output(struct perf_counter *counter, | |||
1469 | return; | 1501 | return; |
1470 | 1502 | ||
1471 | case PERF_RECORD_IRQ: | 1503 | case PERF_RECORD_IRQ: |
1472 | perf_counter_store_irq(counter, instruction_pointer(regs)); | 1504 | perf_output_simple(counter, nmi, regs); |
1473 | break; | 1505 | break; |
1474 | 1506 | ||
1475 | case PERF_RECORD_GROUP: | 1507 | case PERF_RECORD_GROUP: |
1476 | perf_counter_handle_group(counter); | 1508 | perf_output_group(counter, nmi); |
1477 | break; | 1509 | break; |
1478 | } | 1510 | } |
1479 | |||
1480 | if (nmi) { | ||
1481 | counter->wakeup_pending = 1; | ||
1482 | set_perf_counter_pending(); | ||
1483 | } else | ||
1484 | wake_up(&counter->waitq); | ||
1485 | } | 1511 | } |
1486 | 1512 | ||
1487 | /* | 1513 | /* |
@@ -1967,10 +1993,10 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, | |||
1967 | INIT_LIST_HEAD(&counter->sibling_list); | 1993 | INIT_LIST_HEAD(&counter->sibling_list); |
1968 | init_waitqueue_head(&counter->waitq); | 1994 | init_waitqueue_head(&counter->waitq); |
1969 | 1995 | ||
1996 | mutex_init(&counter->mmap_mutex); | ||
1997 | |||
1970 | INIT_LIST_HEAD(&counter->child_list); | 1998 | INIT_LIST_HEAD(&counter->child_list); |
1971 | 1999 | ||
1972 | counter->irqdata = &counter->data[0]; | ||
1973 | counter->usrdata = &counter->data[1]; | ||
1974 | counter->cpu = cpu; | 2000 | counter->cpu = cpu; |
1975 | counter->hw_event = *hw_event; | 2001 | counter->hw_event = *hw_event; |
1976 | counter->wakeup_pending = 0; | 2002 | counter->wakeup_pending = 0; |