Diffstat (limited to 'kernel/perf_counter.c')
-rw-r--r--  kernel/perf_counter.c  464
1 file changed, 245 insertions, 219 deletions
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d9cfd902140e..0dfe91094fd1 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -4,7 +4,8 @@
  * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
  * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
  *
- * For licencing details see kernel-base/COPYING
+ *
+ * For licensing details see kernel-base/COPYING
  */

 #include <linux/fs.h>
@@ -1022,66 +1023,6 @@ static u64 perf_counter_read(struct perf_counter *counter)
 	return atomic64_read(&counter->count);
 }

-/*
- * Cross CPU call to switch performance data pointers
- */
-static void __perf_switch_irq_data(void *info)
-{
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-	struct perf_counter *counter = info;
-	struct perf_counter_context *ctx = counter->ctx;
-	struct perf_data *oldirqdata = counter->irqdata;
-
-	/*
-	 * If this is a task context, we need to check whether it is
-	 * the current task context of this cpu. If not it has been
-	 * scheduled out before the smp call arrived.
-	 */
-	if (ctx->task) {
-		if (cpuctx->task_ctx != ctx)
-			return;
-		spin_lock(&ctx->lock);
-	}
-
-	/* Change the pointer NMI safe */
-	atomic_long_set((atomic_long_t *)&counter->irqdata,
-			(unsigned long) counter->usrdata);
-	counter->usrdata = oldirqdata;
-
-	if (ctx->task)
-		spin_unlock(&ctx->lock);
-}
-
-static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
-{
-	struct perf_counter_context *ctx = counter->ctx;
-	struct perf_data *oldirqdata = counter->irqdata;
-	struct task_struct *task = ctx->task;
-
-	if (!task) {
-		smp_call_function_single(counter->cpu,
-					 __perf_switch_irq_data,
-					 counter, 1);
-		return counter->usrdata;
-	}
-
-retry:
-	spin_lock_irq(&ctx->lock);
-	if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
-		counter->irqdata = counter->usrdata;
-		counter->usrdata = oldirqdata;
-		spin_unlock_irq(&ctx->lock);
-		return oldirqdata;
-	}
-	spin_unlock_irq(&ctx->lock);
-	task_oncpu_function_call(task, __perf_switch_irq_data, counter);
-	/* Might have failed, because task was scheduled out */
-	if (counter->irqdata == oldirqdata)
-		goto retry;
-
-	return counter->usrdata;
-}
-
 static void put_context(struct perf_counter_context *ctx)
 {
 	if (ctx->task)
@@ -1177,7 +1118,6 @@ static int perf_release(struct inode *inode, struct file *file)
 	mutex_unlock(&counter->mutex);
 	mutex_unlock(&ctx->mutex);

-	free_page(counter->user_page);
 	free_counter(counter);
 	put_context(ctx);

@@ -1192,7 +1132,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 {
 	u64 cntval;

-	if (count != sizeof(cntval))
+	if (count < sizeof(cntval))
 		return -EINVAL;

 	/*
@@ -1211,121 +1151,20 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 }

 static ssize_t
-perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
-{
-	if (!usrdata->len)
-		return 0;
-
-	count = min(count, (size_t)usrdata->len);
-	if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
-		return -EFAULT;
-
-	/* Adjust the counters */
-	usrdata->len -= count;
-	if (!usrdata->len)
-		usrdata->rd_idx = 0;
-	else
-		usrdata->rd_idx += count;
-
-	return count;
-}
-
-static ssize_t
-perf_read_irq_data(struct perf_counter *counter,
-		   char __user *buf,
-		   size_t count,
-		   int nonblocking)
-{
-	struct perf_data *irqdata, *usrdata;
-	DECLARE_WAITQUEUE(wait, current);
-	ssize_t res, res2;
-
-	irqdata = counter->irqdata;
-	usrdata = counter->usrdata;
-
-	if (usrdata->len + irqdata->len >= count)
-		goto read_pending;
-
-	if (nonblocking)
-		return -EAGAIN;
-
-	spin_lock_irq(&counter->waitq.lock);
-	__add_wait_queue(&counter->waitq, &wait);
-	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (usrdata->len + irqdata->len >= count)
-			break;
-
-		if (signal_pending(current))
-			break;
-
-		if (counter->state == PERF_COUNTER_STATE_ERROR)
-			break;
-
-		spin_unlock_irq(&counter->waitq.lock);
-		schedule();
-		spin_lock_irq(&counter->waitq.lock);
-	}
-	__remove_wait_queue(&counter->waitq, &wait);
-	__set_current_state(TASK_RUNNING);
-	spin_unlock_irq(&counter->waitq.lock);
-
-	if (usrdata->len + irqdata->len < count &&
-	    counter->state != PERF_COUNTER_STATE_ERROR)
-		return -ERESTARTSYS;
-read_pending:
-	mutex_lock(&counter->mutex);
-
-	/* Drain pending data first: */
-	res = perf_copy_usrdata(usrdata, buf, count);
-	if (res < 0 || res == count)
-		goto out;
-
-	/* Switch irq buffer: */
-	usrdata = perf_switch_irq_data(counter);
-	res2 = perf_copy_usrdata(usrdata, buf + res, count - res);
-	if (res2 < 0) {
-		if (!res)
-			res = -EFAULT;
-	} else {
-		res += res2;
-	}
-out:
-	mutex_unlock(&counter->mutex);
-
-	return res;
-}
-
-static ssize_t
 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 {
 	struct perf_counter *counter = file->private_data;

-	switch (counter->hw_event.record_type) {
-	case PERF_RECORD_SIMPLE:
-		return perf_read_hw(counter, buf, count);
-
-	case PERF_RECORD_IRQ:
-	case PERF_RECORD_GROUP:
-		return perf_read_irq_data(counter, buf, count,
-					   file->f_flags & O_NONBLOCK);
-	}
-	return -EINVAL;
+	return perf_read_hw(counter, buf, count);
 }

 static unsigned int perf_poll(struct file *file, poll_table *wait)
 {
 	struct perf_counter *counter = file->private_data;
-	unsigned int events = 0;
-	unsigned long flags;
+	unsigned int events = POLLIN;

 	poll_wait(file, &counter->waitq, wait);

-	spin_lock_irqsave(&counter->waitq.lock, flags);
-	if (counter->usrdata->len || counter->irqdata->len)
-		events |= POLLIN;
-	spin_unlock_irqrestore(&counter->waitq.lock, flags);
-
 	return events;
 }

@@ -1347,78 +1186,207 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	return err;
 }

-void perf_counter_update_userpage(struct perf_counter *counter)
+static void __perf_counter_update_userpage(struct perf_counter *counter,
+					   struct perf_mmap_data *data)
 {
-	struct perf_counter_mmap_page *userpg;
-
-	if (!counter->user_page)
-		return;
-	userpg = (struct perf_counter_mmap_page *) counter->user_page;
+	struct perf_counter_mmap_page *userpg = data->user_page;

+	/*
+	 * Disable preemption so as to not let the corresponding user-space
+	 * spin too long if we get preempted.
+	 */
+	preempt_disable();
 	++userpg->lock;
 	smp_wmb();
 	userpg->index = counter->hw.idx;
 	userpg->offset = atomic64_read(&counter->count);
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
 		userpg->offset -= atomic64_read(&counter->hw.prev_count);
+
+	userpg->data_head = atomic_read(&data->head);
 	smp_wmb();
 	++userpg->lock;
+	preempt_enable();
+}
+
+void perf_counter_update_userpage(struct perf_counter *counter)
+{
+	struct perf_mmap_data *data;
+
+	rcu_read_lock();
+	data = rcu_dereference(counter->data);
+	if (data)
+		__perf_counter_update_userpage(counter, data);
+	rcu_read_unlock();
 }

 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct perf_counter *counter = vma->vm_file->private_data;
+	struct perf_mmap_data *data;
+	int ret = VM_FAULT_SIGBUS;

-	if (!counter->user_page)
-		return VM_FAULT_SIGBUS;
+	rcu_read_lock();
+	data = rcu_dereference(counter->data);
+	if (!data)
+		goto unlock;
+
+	if (vmf->pgoff == 0) {
+		vmf->page = virt_to_page(data->user_page);
+	} else {
+		int nr = vmf->pgoff - 1;

-	vmf->page = virt_to_page(counter->user_page);
+		if ((unsigned)nr > data->nr_pages)
+			goto unlock;
+
+		vmf->page = virt_to_page(data->data_pages[nr]);
+	}
 	get_page(vmf->page);
+	ret = 0;
+unlock:
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
+{
+	struct perf_mmap_data *data;
+	unsigned long size;
+	int i;
+
+	WARN_ON(atomic_read(&counter->mmap_count));
+
+	size = sizeof(struct perf_mmap_data);
+	size += nr_pages * sizeof(void *);
+
+	data = kzalloc(size, GFP_KERNEL);
+	if (!data)
+		goto fail;
+
+	data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!data->user_page)
+		goto fail_user_page;
+
+	for (i = 0; i < nr_pages; i++) {
+		data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+		if (!data->data_pages[i])
+			goto fail_data_pages;
+	}
+
+	data->nr_pages = nr_pages;
+
+	rcu_assign_pointer(counter->data, data);
+
 	return 0;
+
+fail_data_pages:
+	for (i--; i >= 0; i--)
+		free_page((unsigned long)data->data_pages[i]);
+
+	free_page((unsigned long)data->user_page);
+
+fail_user_page:
+	kfree(data);
+
+fail:
+	return -ENOMEM;
+}
+
+static void __perf_mmap_data_free(struct rcu_head *rcu_head)
+{
+	struct perf_mmap_data *data = container_of(rcu_head,
+			struct perf_mmap_data, rcu_head);
+	int i;
+
+	free_page((unsigned long)data->user_page);
+	for (i = 0; i < data->nr_pages; i++)
+		free_page((unsigned long)data->data_pages[i]);
+	kfree(data);
+}
+
+static void perf_mmap_data_free(struct perf_counter *counter)
+{
+	struct perf_mmap_data *data = counter->data;
+
+	WARN_ON(atomic_read(&counter->mmap_count));
+
+	rcu_assign_pointer(counter->data, NULL);
+	call_rcu(&data->rcu_head, __perf_mmap_data_free);
+}
+
+static void perf_mmap_open(struct vm_area_struct *vma)
+{
+	struct perf_counter *counter = vma->vm_file->private_data;
+
+	atomic_inc(&counter->mmap_count);
+}
+
+static void perf_mmap_close(struct vm_area_struct *vma)
+{
+	struct perf_counter *counter = vma->vm_file->private_data;
+
+	if (atomic_dec_and_mutex_lock(&counter->mmap_count,
+				      &counter->mmap_mutex)) {
+		perf_mmap_data_free(counter);
+		mutex_unlock(&counter->mmap_mutex);
+	}
 }

 static struct vm_operations_struct perf_mmap_vmops = {
+	.open = perf_mmap_open,
+	.close = perf_mmap_close,
 	.fault = perf_mmap_fault,
 };

 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct perf_counter *counter = file->private_data;
-	unsigned long userpg;
+	unsigned long vma_size;
+	unsigned long nr_pages;
+	unsigned long locked, lock_limit;
+	int ret = 0;

 	if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
 		return -EINVAL;
-	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+
+	vma_size = vma->vm_end - vma->vm_start;
+	nr_pages = (vma_size / PAGE_SIZE) - 1;
+
+	if (nr_pages == 0 || !is_power_of_2(nr_pages))
 		return -EINVAL;

-	/*
-	 * For now, restrict to the case of a hardware counter
-	 * on the current task.
-	 */
-	if (is_software_counter(counter) || counter->task != current)
+	if (vma_size != PAGE_SIZE * (1 + nr_pages))
 		return -EINVAL;

-	userpg = counter->user_page;
-	if (!userpg) {
-		userpg = get_zeroed_page(GFP_KERNEL);
-		mutex_lock(&counter->mutex);
-		if (counter->user_page) {
-			free_page(userpg);
-			userpg = counter->user_page;
-		} else {
-			counter->user_page = userpg;
-		}
-		mutex_unlock(&counter->mutex);
-		if (!userpg)
-			return -ENOMEM;
-	}
+	if (vma->vm_pgoff != 0)
+		return -EINVAL;
+
+	locked = vma_size >> PAGE_SHIFT;
+	locked += vma->vm_mm->locked_vm;

-	perf_counter_update_userpage(counter);
+	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+	lock_limit >>= PAGE_SHIFT;
+
+	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK))
+		return -EPERM;
+
+	mutex_lock(&counter->mmap_mutex);
+	if (atomic_inc_not_zero(&counter->mmap_count))
+		goto out;
+
+	WARN_ON(counter->data);
+	ret = perf_mmap_data_alloc(counter, nr_pages);
+	if (!ret)
+		atomic_set(&counter->mmap_count, 1);
+out:
+	mutex_unlock(&counter->mmap_mutex);

 	vma->vm_flags &= ~VM_MAYWRITE;
 	vma->vm_flags |= VM_RESERVED;
 	vma->vm_ops = &perf_mmap_vmops;
-	return 0;
+
+	return ret;
 }

 static const struct file_operations perf_fops = {
@@ -1434,30 +1402,94 @@ static const struct file_operations perf_fops = {
  * Output
  */

-static void perf_counter_store_irq(struct perf_counter *counter, u64 data)
+static int perf_output_write(struct perf_counter *counter, int nmi,
+			     void *buf, ssize_t size)
 {
-	struct perf_data *irqdata = counter->irqdata;
+	struct perf_mmap_data *data;
+	unsigned int offset, head, nr;
+	unsigned int len;
+	int ret, wakeup;

-	if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
-		irqdata->overrun++;
-	} else {
-		u64 *p = (u64 *) &irqdata->data[irqdata->len];
+	rcu_read_lock();
+	ret = -ENOSPC;
+	data = rcu_dereference(counter->data);
+	if (!data)
+		goto out;
+
+	if (!data->nr_pages)
+		goto out;
+
+	ret = -EINVAL;
+	if (size > PAGE_SIZE)
+		goto out;
+
+	do {
+		offset = head = atomic_read(&data->head);
+		head += sizeof(u64);
+	} while (atomic_cmpxchg(&data->head, offset, head) != offset);
+
+	wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);

-		*p = data;
-		irqdata->len += sizeof(u64);
+	nr = (offset >> PAGE_SHIFT) & (data->nr_pages - 1);
+	offset &= PAGE_SIZE - 1;
+
+	len = min_t(unsigned int, PAGE_SIZE - offset, size);
+	memcpy(data->data_pages[nr] + offset, buf, len);
+	size -= len;
+
+	if (size) {
+		nr = (nr + 1) & (data->nr_pages - 1);
+		memcpy(data->data_pages[nr], buf + len, size);
+	}
+
+	/*
+	 * generate a poll() wakeup for every page boundary crossed
+	 */
+	if (wakeup) {
+		__perf_counter_update_userpage(counter, data);
+		if (nmi) {
+			counter->wakeup_pending = 1;
+			set_perf_counter_pending();
+		} else
+			wake_up(&counter->waitq);
 	}
+	ret = 0;
+out:
+	rcu_read_unlock();
+
+	return ret;
 }

-static void perf_counter_handle_group(struct perf_counter *counter)
+static void perf_output_simple(struct perf_counter *counter,
+			       int nmi, struct pt_regs *regs)
+{
+	u64 entry;
+
+	entry = instruction_pointer(regs);
+
+	perf_output_write(counter, nmi, &entry, sizeof(entry));
+}
+
+struct group_entry {
+	u64 event;
+	u64 counter;
+};
+
+static void perf_output_group(struct perf_counter *counter, int nmi)
 {
 	struct perf_counter *leader, *sub;

 	leader = counter->group_leader;
 	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+		struct group_entry entry;
+
 		if (sub != counter)
 			sub->hw_ops->read(sub);
-		perf_counter_store_irq(counter, sub->hw_event.config);
-		perf_counter_store_irq(counter, atomic64_read(&sub->count));
+
+		entry.event = sub->hw_event.config;
+		entry.counter = atomic64_read(&sub->count);
+
+		perf_output_write(counter, nmi, &entry, sizeof(entry));
 	}
 }

@@ -1469,19 +1501,13 @@ void perf_counter_output(struct perf_counter *counter,
 		return;

 	case PERF_RECORD_IRQ:
-		perf_counter_store_irq(counter, instruction_pointer(regs));
+		perf_output_simple(counter, nmi, regs);
 		break;

 	case PERF_RECORD_GROUP:
-		perf_counter_handle_group(counter);
+		perf_output_group(counter, nmi);
 		break;
 	}
-
-	if (nmi) {
-		counter->wakeup_pending = 1;
-		set_perf_counter_pending();
-	} else
-		wake_up(&counter->waitq);
 }

 /*
@@ -1967,10 +1993,10 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	INIT_LIST_HEAD(&counter->sibling_list);
 	init_waitqueue_head(&counter->waitq);

+	mutex_init(&counter->mmap_mutex);
+
 	INIT_LIST_HEAD(&counter->child_list);

-	counter->irqdata = &counter->data[0];
-	counter->usrdata = &counter->data[1];
 	counter->cpu = cpu;
 	counter->hw_event = *hw_event;
 	counter->wakeup_pending = 0;