Diffstat (limited to 'kernel/perf_counter.c')
 -rw-r--r--  kernel/perf_counter.c | 1396
 1 file changed, 959 insertions(+), 437 deletions(-)
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 29b685f551aa..d7cbc579fc80 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -42,6 +42,7 @@ static int perf_overcommit __read_mostly = 1;
42static atomic_t nr_counters __read_mostly; 42static atomic_t nr_counters __read_mostly;
43static atomic_t nr_mmap_counters __read_mostly; 43static atomic_t nr_mmap_counters __read_mostly;
44static atomic_t nr_comm_counters __read_mostly; 44static atomic_t nr_comm_counters __read_mostly;
45static atomic_t nr_task_counters __read_mostly;
45 46
46/* 47/*
47 * perf counter paranoia level: 48 * perf counter paranoia level:
@@ -49,7 +50,7 @@ static atomic_t nr_comm_counters __read_mostly;
49 * 1 - disallow cpu counters to unpriv 50 * 1 - disallow cpu counters to unpriv
50 * 2 - disallow kernel profiling to unpriv 51 * 2 - disallow kernel profiling to unpriv
51 */ 52 */
52int sysctl_perf_counter_paranoid __read_mostly; 53int sysctl_perf_counter_paranoid __read_mostly = 1;
53 54
54static inline bool perf_paranoid_cpu(void) 55static inline bool perf_paranoid_cpu(void)
55{ 56{
@@ -87,6 +88,7 @@ void __weak hw_perf_disable(void) { barrier(); }
87void __weak hw_perf_enable(void) { barrier(); } 88void __weak hw_perf_enable(void) { barrier(); }
88 89
89void __weak hw_perf_counter_setup(int cpu) { barrier(); } 90void __weak hw_perf_counter_setup(int cpu) { barrier(); }
91void __weak hw_perf_counter_setup_online(int cpu) { barrier(); }
90 92
91int __weak 93int __weak
92hw_perf_group_sched_in(struct perf_counter *group_leader, 94hw_perf_group_sched_in(struct perf_counter *group_leader,
@@ -124,7 +126,7 @@ void perf_enable(void)
124 126
125static void get_ctx(struct perf_counter_context *ctx) 127static void get_ctx(struct perf_counter_context *ctx)
126{ 128{
127 atomic_inc(&ctx->refcount); 129 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
128} 130}
129 131
130static void free_ctx(struct rcu_head *head) 132static void free_ctx(struct rcu_head *head)
@@ -146,6 +148,28 @@ static void put_ctx(struct perf_counter_context *ctx)
146 } 148 }
147} 149}
148 150
151static void unclone_ctx(struct perf_counter_context *ctx)
152{
153 if (ctx->parent_ctx) {
154 put_ctx(ctx->parent_ctx);
155 ctx->parent_ctx = NULL;
156 }
157}
158
159/*
160 * If we inherit counters we want to return the parent counter id
161 * to userspace.
162 */
163static u64 primary_counter_id(struct perf_counter *counter)
164{
165 u64 id = counter->id;
166
167 if (counter->parent)
168 id = counter->parent->id;
169
170 return id;
171}
172
149/* 173/*
150 * Get the perf_counter_context for a task and lock it. 174 * Get the perf_counter_context for a task and lock it.
151 * This has to cope with with the fact that until it is locked, 175 * This has to cope with with the fact that until it is locked,
@@ -175,6 +199,11 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
175 spin_unlock_irqrestore(&ctx->lock, *flags); 199 spin_unlock_irqrestore(&ctx->lock, *flags);
176 goto retry; 200 goto retry;
177 } 201 }
202
203 if (!atomic_inc_not_zero(&ctx->refcount)) {
204 spin_unlock_irqrestore(&ctx->lock, *flags);
205 ctx = NULL;
206 }
178 } 207 }
179 rcu_read_unlock(); 208 rcu_read_unlock();
180 return ctx; 209 return ctx;
@@ -193,7 +222,6 @@ static struct perf_counter_context *perf_pin_task_context(struct task_struct *ta
193 ctx = perf_lock_task_context(task, &flags); 222 ctx = perf_lock_task_context(task, &flags);
194 if (ctx) { 223 if (ctx) {
195 ++ctx->pin_count; 224 ++ctx->pin_count;
196 get_ctx(ctx);
197 spin_unlock_irqrestore(&ctx->lock, flags); 225 spin_unlock_irqrestore(&ctx->lock, flags);
198 } 226 }
199 return ctx; 227 return ctx;
@@ -232,6 +260,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
232 260
233 list_add_rcu(&counter->event_entry, &ctx->event_list); 261 list_add_rcu(&counter->event_entry, &ctx->event_list);
234 ctx->nr_counters++; 262 ctx->nr_counters++;
263 if (counter->attr.inherit_stat)
264 ctx->nr_stat++;
235} 265}
236 266
237/* 267/*
@@ -246,6 +276,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
246 if (list_empty(&counter->list_entry)) 276 if (list_empty(&counter->list_entry))
247 return; 277 return;
248 ctx->nr_counters--; 278 ctx->nr_counters--;
279 if (counter->attr.inherit_stat)
280 ctx->nr_stat--;
249 281
250 list_del_init(&counter->list_entry); 282 list_del_init(&counter->list_entry);
251 list_del_rcu(&counter->event_entry); 283 list_del_rcu(&counter->event_entry);
@@ -275,6 +307,10 @@ counter_sched_out(struct perf_counter *counter,
275 return; 307 return;
276 308
277 counter->state = PERF_COUNTER_STATE_INACTIVE; 309 counter->state = PERF_COUNTER_STATE_INACTIVE;
310 if (counter->pending_disable) {
311 counter->pending_disable = 0;
312 counter->state = PERF_COUNTER_STATE_OFF;
313 }
278 counter->tstamp_stopped = ctx->time; 314 counter->tstamp_stopped = ctx->time;
279 counter->pmu->disable(counter); 315 counter->pmu->disable(counter);
280 counter->oncpu = -1; 316 counter->oncpu = -1;
@@ -1002,6 +1038,81 @@ static int context_equiv(struct perf_counter_context *ctx1,
1002 && !ctx1->pin_count && !ctx2->pin_count; 1038 && !ctx1->pin_count && !ctx2->pin_count;
1003} 1039}
1004 1040
1041static void __perf_counter_read(void *counter);
1042
1043static void __perf_counter_sync_stat(struct perf_counter *counter,
1044 struct perf_counter *next_counter)
1045{
1046 u64 value;
1047
1048 if (!counter->attr.inherit_stat)
1049 return;
1050
1051 /*
1052 * Update the counter value, we cannot use perf_counter_read()
1053 * because we're in the middle of a context switch and have IRQs
1054 * disabled, which upsets smp_call_function_single(), however
1055 * we know the counter must be on the current CPU, therefore we
1056 * don't need to use it.
1057 */
1058 switch (counter->state) {
1059 case PERF_COUNTER_STATE_ACTIVE:
1060 __perf_counter_read(counter);
1061 break;
1062
1063 case PERF_COUNTER_STATE_INACTIVE:
1064 update_counter_times(counter);
1065 break;
1066
1067 default:
1068 break;
1069 }
1070
1071 /*
1072 * In order to keep per-task stats reliable we need to flip the counter
1073 * values when we flip the contexts.
1074 */
1075 value = atomic64_read(&next_counter->count);
1076 value = atomic64_xchg(&counter->count, value);
1077 atomic64_set(&next_counter->count, value);
1078
1079 swap(counter->total_time_enabled, next_counter->total_time_enabled);
1080 swap(counter->total_time_running, next_counter->total_time_running);
1081
1082 /*
1083 * Since we swizzled the values, update the user visible data too.
1084 */
1085 perf_counter_update_userpage(counter);
1086 perf_counter_update_userpage(next_counter);
1087}
1088
1089#define list_next_entry(pos, member) \
1090 list_entry(pos->member.next, typeof(*pos), member)
1091
1092static void perf_counter_sync_stat(struct perf_counter_context *ctx,
1093 struct perf_counter_context *next_ctx)
1094{
1095 struct perf_counter *counter, *next_counter;
1096
1097 if (!ctx->nr_stat)
1098 return;
1099
1100 counter = list_first_entry(&ctx->event_list,
1101 struct perf_counter, event_entry);
1102
1103 next_counter = list_first_entry(&next_ctx->event_list,
1104 struct perf_counter, event_entry);
1105
1106 while (&counter->event_entry != &ctx->event_list &&
1107 &next_counter->event_entry != &next_ctx->event_list) {
1108
1109 __perf_counter_sync_stat(counter, next_counter);
1110
1111 counter = list_next_entry(counter, event_entry);
1112 next_counter = list_next_entry(next_counter, event_entry);
1113 }
1114}
1115
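For illustration only (plain C, names invented, not part of the patch): the flip above amounts to exchanging the totals of the two counters whose contexts were just swapped, so each task's numbers follow the task rather than the context.

#include <stdint.h>

/*
 * Minimal sketch of the exchange done with atomic64_xchg()/swap() above.
 * If the outgoing task had accumulated 1000 events and the incoming one
 * 400, the 1000 stays with the outgoing task after the context swap.
 */
static void sync_stat_flip(uint64_t *prev_count, uint64_t *next_count)
{
        uint64_t tmp = *next_count;

        *next_count = *prev_count;
        *prev_count = tmp;
}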
1005/* 1116/*
1006 * Called from scheduler to remove the counters of the current task, 1117 * Called from scheduler to remove the counters of the current task,
1007 * with interrupts disabled. 1118 * with interrupts disabled.
@@ -1057,6 +1168,8 @@ void perf_counter_task_sched_out(struct task_struct *task,
1057 ctx->task = next; 1168 ctx->task = next;
1058 next_ctx->task = task; 1169 next_ctx->task = task;
1059 do_switch = 0; 1170 do_switch = 0;
1171
1172 perf_counter_sync_stat(ctx, next_ctx);
1060 } 1173 }
1061 spin_unlock(&next_ctx->lock); 1174 spin_unlock(&next_ctx->lock);
1062 spin_unlock(&ctx->lock); 1175 spin_unlock(&ctx->lock);
@@ -1203,7 +1316,6 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1203#define MAX_INTERRUPTS (~0ULL) 1316#define MAX_INTERRUPTS (~0ULL)
1204 1317
1205static void perf_log_throttle(struct perf_counter *counter, int enable); 1318static void perf_log_throttle(struct perf_counter *counter, int enable);
1206static void perf_log_period(struct perf_counter *counter, u64 period);
1207 1319
1208static void perf_adjust_period(struct perf_counter *counter, u64 events) 1320static void perf_adjust_period(struct perf_counter *counter, u64 events)
1209{ 1321{
@@ -1222,8 +1334,6 @@ static void perf_adjust_period(struct perf_counter *counter, u64 events)
1222 if (!sample_period) 1334 if (!sample_period)
1223 sample_period = 1; 1335 sample_period = 1;
1224 1336
1225 perf_log_period(counter, sample_period);
1226
1227 hwc->sample_period = sample_period; 1337 hwc->sample_period = sample_period;
1228} 1338}
1229 1339
@@ -1283,7 +1393,7 @@ static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
1283 if (!interrupts) { 1393 if (!interrupts) {
1284 perf_disable(); 1394 perf_disable();
1285 counter->pmu->disable(counter); 1395 counter->pmu->disable(counter);
1286 atomic_set(&hwc->period_left, 0); 1396 atomic64_set(&hwc->period_left, 0);
1287 counter->pmu->enable(counter); 1397 counter->pmu->enable(counter);
1288 perf_enable(); 1398 perf_enable();
1289 } 1399 }
@@ -1344,14 +1454,70 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
1344} 1454}
1345 1455
1346/* 1456/*
1457 * Enable all of a task's counters that have been marked enable-on-exec.
1458 * This expects task == current.
1459 */
1460static void perf_counter_enable_on_exec(struct task_struct *task)
1461{
1462 struct perf_counter_context *ctx;
1463 struct perf_counter *counter;
1464 unsigned long flags;
1465 int enabled = 0;
1466
1467 local_irq_save(flags);
1468 ctx = task->perf_counter_ctxp;
1469 if (!ctx || !ctx->nr_counters)
1470 goto out;
1471
1472 __perf_counter_task_sched_out(ctx);
1473
1474 spin_lock(&ctx->lock);
1475
1476 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1477 if (!counter->attr.enable_on_exec)
1478 continue;
1479 counter->attr.enable_on_exec = 0;
1480 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
1481 continue;
1482 counter->state = PERF_COUNTER_STATE_INACTIVE;
1483 counter->tstamp_enabled =
1484 ctx->time - counter->total_time_enabled;
1485 enabled = 1;
1486 }
1487
1488 /*
1489 * Unclone this context if we enabled any counter.
1490 */
1491 if (enabled)
1492 unclone_ctx(ctx);
1493
1494 spin_unlock(&ctx->lock);
1495
1496 perf_counter_task_sched_in(task, smp_processor_id());
1497 out:
1498 local_irq_restore(flags);
1499}
1500
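A hedged user-space sketch of what enable_on_exec is for (not taken from the patch; the header path and enum names follow the perf_counter ABI of this era and are assumptions): the counter is created disabled and the hook above turns it on when the monitored task exec()s, so setup work before exec is not counted.

#include <string.h>
#include <linux/perf_counter.h>      /* struct perf_counter_attr */

/* Sketch: request a cycle counter that stays off until exec() time. */
static void init_enable_on_exec_attr(struct perf_counter_attr *attr)
{
        memset(attr, 0, sizeof(*attr));
        attr->size           = sizeof(*attr);
        attr->type           = PERF_TYPE_HARDWARE;
        attr->config         = PERF_COUNT_HW_CPU_CYCLES;
        attr->disabled       = 1;    /* created disabled ...          */
        attr->enable_on_exec = 1;    /* ... enabled by the hook above */
}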
1501/*
1347 * Cross CPU call to read the hardware counter 1502 * Cross CPU call to read the hardware counter
1348 */ 1503 */
1349static void __read(void *info) 1504static void __perf_counter_read(void *info)
1350{ 1505{
1506 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1351 struct perf_counter *counter = info; 1507 struct perf_counter *counter = info;
1352 struct perf_counter_context *ctx = counter->ctx; 1508 struct perf_counter_context *ctx = counter->ctx;
1353 unsigned long flags; 1509 unsigned long flags;
1354 1510
1511 /*
1512 * If this is a task context, we need to check whether it is
1513 * the current task context of this cpu. If not it has been
1514 * scheduled out before the smp call arrived. In that case
1515 * counter->count would have been updated to a recent sample
1516 * when the counter was scheduled out.
1517 */
1518 if (ctx->task && cpuctx->task_ctx != ctx)
1519 return;
1520
1355 local_irq_save(flags); 1521 local_irq_save(flags);
1356 if (ctx->is_active) 1522 if (ctx->is_active)
1357 update_context_time(ctx); 1523 update_context_time(ctx);
@@ -1368,7 +1534,7 @@ static u64 perf_counter_read(struct perf_counter *counter)
1368 */ 1534 */
1369 if (counter->state == PERF_COUNTER_STATE_ACTIVE) { 1535 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1370 smp_call_function_single(counter->oncpu, 1536 smp_call_function_single(counter->oncpu,
1371 __read, counter, 1); 1537 __perf_counter_read, counter, 1);
1372 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { 1538 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1373 update_counter_times(counter); 1539 update_counter_times(counter);
1374 } 1540 }
@@ -1394,7 +1560,6 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
1394 1560
1395static struct perf_counter_context *find_get_context(pid_t pid, int cpu) 1561static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1396{ 1562{
1397 struct perf_counter_context *parent_ctx;
1398 struct perf_counter_context *ctx; 1563 struct perf_counter_context *ctx;
1399 struct perf_cpu_context *cpuctx; 1564 struct perf_cpu_context *cpuctx;
1400 struct task_struct *task; 1565 struct task_struct *task;
@@ -1454,16 +1619,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1454 retry: 1619 retry:
1455 ctx = perf_lock_task_context(task, &flags); 1620 ctx = perf_lock_task_context(task, &flags);
1456 if (ctx) { 1621 if (ctx) {
1457 parent_ctx = ctx->parent_ctx; 1622 unclone_ctx(ctx);
1458 if (parent_ctx) {
1459 put_ctx(parent_ctx);
1460 ctx->parent_ctx = NULL; /* no longer a clone */
1461 }
1462 /*
1463 * Get an extra reference before dropping the lock so that
1464 * this context won't get freed if the task exits.
1465 */
1466 get_ctx(ctx);
1467 spin_unlock_irqrestore(&ctx->lock, flags); 1623 spin_unlock_irqrestore(&ctx->lock, flags);
1468 } 1624 }
1469 1625
@@ -1509,11 +1665,15 @@ static void free_counter(struct perf_counter *counter)
1509{ 1665{
1510 perf_pending_sync(counter); 1666 perf_pending_sync(counter);
1511 1667
1512 atomic_dec(&nr_counters); 1668 if (!counter->parent) {
1513 if (counter->attr.mmap) 1669 atomic_dec(&nr_counters);
1514 atomic_dec(&nr_mmap_counters); 1670 if (counter->attr.mmap)
1515 if (counter->attr.comm) 1671 atomic_dec(&nr_mmap_counters);
1516 atomic_dec(&nr_comm_counters); 1672 if (counter->attr.comm)
1673 atomic_dec(&nr_comm_counters);
1674 if (counter->attr.task)
1675 atomic_dec(&nr_task_counters);
1676 }
1517 1677
1518 if (counter->destroy) 1678 if (counter->destroy)
1519 counter->destroy(counter); 1679 counter->destroy(counter);
@@ -1547,14 +1707,133 @@ static int perf_release(struct inode *inode, struct file *file)
1547 return 0; 1707 return 0;
1548} 1708}
1549 1709
1710static int perf_counter_read_size(struct perf_counter *counter)
1711{
1712 int entry = sizeof(u64); /* value */
1713 int size = 0;
1714 int nr = 1;
1715
1716 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1717 size += sizeof(u64);
1718
1719 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1720 size += sizeof(u64);
1721
1722 if (counter->attr.read_format & PERF_FORMAT_ID)
1723 entry += sizeof(u64);
1724
1725 if (counter->attr.read_format & PERF_FORMAT_GROUP) {
1726 nr += counter->group_leader->nr_siblings;
1727 size += sizeof(u64);
1728 }
1729
1730 size += entry * nr;
1731
1732 return size;
1733}
1734
1735static u64 perf_counter_read_value(struct perf_counter *counter)
1736{
1737 struct perf_counter *child;
1738 u64 total = 0;
1739
1740 total += perf_counter_read(counter);
1741 list_for_each_entry(child, &counter->child_list, child_list)
1742 total += perf_counter_read(child);
1743
1744 return total;
1745}
1746
1747static int perf_counter_read_entry(struct perf_counter *counter,
1748 u64 read_format, char __user *buf)
1749{
1750 int n = 0, count = 0;
1751 u64 values[2];
1752
1753 values[n++] = perf_counter_read_value(counter);
1754 if (read_format & PERF_FORMAT_ID)
1755 values[n++] = primary_counter_id(counter);
1756
1757 count = n * sizeof(u64);
1758
1759 if (copy_to_user(buf, values, count))
1760 return -EFAULT;
1761
1762 return count;
1763}
1764
1765static int perf_counter_read_group(struct perf_counter *counter,
1766 u64 read_format, char __user *buf)
1767{
1768 struct perf_counter *leader = counter->group_leader, *sub;
1769 int n = 0, size = 0, err = -EFAULT;
1770 u64 values[3];
1771
1772 values[n++] = 1 + leader->nr_siblings;
1773 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1774 values[n++] = leader->total_time_enabled +
1775 atomic64_read(&leader->child_total_time_enabled);
1776 }
1777 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1778 values[n++] = leader->total_time_running +
1779 atomic64_read(&leader->child_total_time_running);
1780 }
1781
1782 size = n * sizeof(u64);
1783
1784 if (copy_to_user(buf, values, size))
1785 return -EFAULT;
1786
1787 err = perf_counter_read_entry(leader, read_format, buf + size);
1788 if (err < 0)
1789 return err;
1790
1791 size += err;
1792
1793 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
1794 err = perf_counter_read_entry(sub, read_format,
1795 buf + size);
1796 if (err < 0)
1797 return err;
1798
1799 size += err;
1800 }
1801
1802 return size;
1803}
1804
1805static int perf_counter_read_one(struct perf_counter *counter,
1806 u64 read_format, char __user *buf)
1807{
1808 u64 values[4];
1809 int n = 0;
1810
1811 values[n++] = perf_counter_read_value(counter);
1812 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1813 values[n++] = counter->total_time_enabled +
1814 atomic64_read(&counter->child_total_time_enabled);
1815 }
1816 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1817 values[n++] = counter->total_time_running +
1818 atomic64_read(&counter->child_total_time_running);
1819 }
1820 if (read_format & PERF_FORMAT_ID)
1821 values[n++] = primary_counter_id(counter);
1822
1823 if (copy_to_user(buf, values, n * sizeof(u64)))
1824 return -EFAULT;
1825
1826 return n * sizeof(u64);
1827}
1828
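To make the new read() layout concrete, a sketch of the byte stream the helpers above produce (struct names are invented; the field order is taken from the code):

#include <stdint.h>

/*
 * Single counter, read_format = TOTAL_TIME_ENABLED | TOTAL_TIME_RUNNING | ID,
 * as emitted by perf_counter_read_one():
 */
struct read_one_layout {                /* invented name */
        uint64_t value;                 /* counter value, children included  */
        uint64_t time_enabled;          /* PERF_FORMAT_TOTAL_TIME_ENABLED    */
        uint64_t time_running;          /* PERF_FORMAT_TOTAL_TIME_RUNNING    */
        uint64_t id;                    /* PERF_FORMAT_ID, primary id        */
};

/*
 * With PERF_FORMAT_GROUP, perf_counter_read_group() emits the member count,
 * optionally the leader's times, then one entry per member (leader first,
 * then each sibling):
 */
struct read_group_entry {               /* invented name */
        uint64_t value;
        uint64_t id;                    /* present only with PERF_FORMAT_ID  */
};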
1550/* 1829/*
1551 * Read the performance counter - simple non blocking version for now 1830 * Read the performance counter - simple non blocking version for now
1552 */ 1831 */
1553static ssize_t 1832static ssize_t
1554perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) 1833perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1555{ 1834{
1556 u64 values[3]; 1835 u64 read_format = counter->attr.read_format;
1557 int n; 1836 int ret;
1558 1837
1559 /* 1838 /*
1560 * Return end-of-file for a read on a counter that is in 1839 * Return end-of-file for a read on a counter that is in
@@ -1564,28 +1843,18 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1564 if (counter->state == PERF_COUNTER_STATE_ERROR) 1843 if (counter->state == PERF_COUNTER_STATE_ERROR)
1565 return 0; 1844 return 0;
1566 1845
1846 if (count < perf_counter_read_size(counter))
1847 return -ENOSPC;
1848
1567 WARN_ON_ONCE(counter->ctx->parent_ctx); 1849 WARN_ON_ONCE(counter->ctx->parent_ctx);
1568 mutex_lock(&counter->child_mutex); 1850 mutex_lock(&counter->child_mutex);
1569 values[0] = perf_counter_read(counter); 1851 if (read_format & PERF_FORMAT_GROUP)
1570 n = 1; 1852 ret = perf_counter_read_group(counter, read_format, buf);
1571 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 1853 else
1572 values[n++] = counter->total_time_enabled + 1854 ret = perf_counter_read_one(counter, read_format, buf);
1573 atomic64_read(&counter->child_total_time_enabled);
1574 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1575 values[n++] = counter->total_time_running +
1576 atomic64_read(&counter->child_total_time_running);
1577 if (counter->attr.read_format & PERF_FORMAT_ID)
1578 values[n++] = counter->id;
1579 mutex_unlock(&counter->child_mutex); 1855 mutex_unlock(&counter->child_mutex);
1580 1856
1581 if (count < n * sizeof(u64)) 1857 return ret;
1582 return -EINVAL;
1583 count = n * sizeof(u64);
1584
1585 if (copy_to_user(buf, values, count))
1586 return -EFAULT;
1587
1588 return count;
1589} 1858}
1590 1859
1591static ssize_t 1860static ssize_t
@@ -1620,22 +1889,6 @@ static void perf_counter_reset(struct perf_counter *counter)
1620 perf_counter_update_userpage(counter); 1889 perf_counter_update_userpage(counter);
1621} 1890}
1622 1891
1623static void perf_counter_for_each_sibling(struct perf_counter *counter,
1624 void (*func)(struct perf_counter *))
1625{
1626 struct perf_counter_context *ctx = counter->ctx;
1627 struct perf_counter *sibling;
1628
1629 WARN_ON_ONCE(ctx->parent_ctx);
1630 mutex_lock(&ctx->mutex);
1631 counter = counter->group_leader;
1632
1633 func(counter);
1634 list_for_each_entry(sibling, &counter->sibling_list, list_entry)
1635 func(sibling);
1636 mutex_unlock(&ctx->mutex);
1637}
1638
1639/* 1892/*
1640 * Holding the top-level counter's child_mutex means that any 1893 * Holding the top-level counter's child_mutex means that any
1641 * descendant process that has inherited this counter will block 1894 * descendant process that has inherited this counter will block
@@ -1658,14 +1911,18 @@ static void perf_counter_for_each_child(struct perf_counter *counter,
1658static void perf_counter_for_each(struct perf_counter *counter, 1911static void perf_counter_for_each(struct perf_counter *counter,
1659 void (*func)(struct perf_counter *)) 1912 void (*func)(struct perf_counter *))
1660{ 1913{
1661 struct perf_counter *child; 1914 struct perf_counter_context *ctx = counter->ctx;
1915 struct perf_counter *sibling;
1662 1916
1663 WARN_ON_ONCE(counter->ctx->parent_ctx); 1917 WARN_ON_ONCE(ctx->parent_ctx);
1664 mutex_lock(&counter->child_mutex); 1918 mutex_lock(&ctx->mutex);
1665 perf_counter_for_each_sibling(counter, func); 1919 counter = counter->group_leader;
1666 list_for_each_entry(child, &counter->child_list, child_list) 1920
1667 perf_counter_for_each_sibling(child, func); 1921 perf_counter_for_each_child(counter, func);
1668 mutex_unlock(&counter->child_mutex); 1922 func(counter);
1923 list_for_each_entry(sibling, &counter->sibling_list, list_entry)
1924 perf_counter_for_each_child(counter, func);
1925 mutex_unlock(&ctx->mutex);
1669} 1926}
1670 1927
1671static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) 1928static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
@@ -1694,8 +1951,6 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
1694 1951
1695 counter->attr.sample_freq = value; 1952 counter->attr.sample_freq = value;
1696 } else { 1953 } else {
1697 perf_log_period(counter, value);
1698
1699 counter->attr.sample_period = value; 1954 counter->attr.sample_period = value;
1700 counter->hw.sample_period = value; 1955 counter->hw.sample_period = value;
1701 } 1956 }
@@ -1764,6 +2019,18 @@ int perf_counter_task_disable(void)
1764 return 0; 2019 return 0;
1765} 2020}
1766 2021
2022#ifndef PERF_COUNTER_INDEX_OFFSET
2023# define PERF_COUNTER_INDEX_OFFSET 0
2024#endif
2025
2026static int perf_counter_index(struct perf_counter *counter)
2027{
2028 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
2029 return 0;
2030
2031 return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET;
2032}
2033
1767/* 2034/*
1768 * Callers need to ensure there can be no nesting of this function, otherwise 2035 * Callers need to ensure there can be no nesting of this function, otherwise
1769 * the seqlock logic goes bad. We can not serialize this because the arch 2036 * the seqlock logic goes bad. We can not serialize this because the arch
@@ -1788,11 +2055,17 @@ void perf_counter_update_userpage(struct perf_counter *counter)
1788 preempt_disable(); 2055 preempt_disable();
1789 ++userpg->lock; 2056 ++userpg->lock;
1790 barrier(); 2057 barrier();
1791 userpg->index = counter->hw.idx; 2058 userpg->index = perf_counter_index(counter);
1792 userpg->offset = atomic64_read(&counter->count); 2059 userpg->offset = atomic64_read(&counter->count);
1793 if (counter->state == PERF_COUNTER_STATE_ACTIVE) 2060 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1794 userpg->offset -= atomic64_read(&counter->hw.prev_count); 2061 userpg->offset -= atomic64_read(&counter->hw.prev_count);
1795 2062
2063 userpg->time_enabled = counter->total_time_enabled +
2064 atomic64_read(&counter->child_total_time_enabled);
2065
2066 userpg->time_running = counter->total_time_running +
2067 atomic64_read(&counter->child_total_time_running);
2068
1796 barrier(); 2069 barrier();
1797 ++userpg->lock; 2070 ++userpg->lock;
1798 preempt_enable(); 2071 preempt_enable();
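A hedged sketch of the user-space side of the page updated above: snapshot the fields under the lock protocol and treat index == 0 as "not currently on a hardware counter". Structure and field names follow the perf_counter mmap ABI of this era; the rdpmc remark is x86-specific.

#include <stdint.h>
#include <linux/perf_counter.h>        /* struct perf_counter_mmap_page */

static uint32_t read_userpage(volatile struct perf_counter_mmap_page *pc,
                              int64_t *offset, uint64_t *enabled,
                              uint64_t *running)
{
        uint32_t seq, idx;

        do {
                seq      = pc->lock;
                __sync_synchronize();           /* pairs with barrier() above */
                idx      = pc->index;
                *offset  = pc->offset;
                *enabled = pc->time_enabled;    /* fields added by this patch */
                *running = pc->time_running;
                __sync_synchronize();
        } while (pc->lock != seq || (seq & 1)); /* retry if updated meanwhile */

        /* A non-zero idx permits an rdpmc(idx - 1) fast path on x86. */
        return idx;
}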
@@ -1806,6 +2079,12 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1806 struct perf_mmap_data *data; 2079 struct perf_mmap_data *data;
1807 int ret = VM_FAULT_SIGBUS; 2080 int ret = VM_FAULT_SIGBUS;
1808 2081
2082 if (vmf->flags & FAULT_FLAG_MKWRITE) {
2083 if (vmf->pgoff == 0)
2084 ret = 0;
2085 return ret;
2086 }
2087
1809 rcu_read_lock(); 2088 rcu_read_lock();
1810 data = rcu_dereference(counter->data); 2089 data = rcu_dereference(counter->data);
1811 if (!data) 2090 if (!data)
@@ -1819,9 +2098,16 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1819 if ((unsigned)nr > data->nr_pages) 2098 if ((unsigned)nr > data->nr_pages)
1820 goto unlock; 2099 goto unlock;
1821 2100
2101 if (vmf->flags & FAULT_FLAG_WRITE)
2102 goto unlock;
2103
1822 vmf->page = virt_to_page(data->data_pages[nr]); 2104 vmf->page = virt_to_page(data->data_pages[nr]);
1823 } 2105 }
2106
1824 get_page(vmf->page); 2107 get_page(vmf->page);
2108 vmf->page->mapping = vma->vm_file->f_mapping;
2109 vmf->page->index = vmf->pgoff;
2110
1825 ret = 0; 2111 ret = 0;
1826unlock: 2112unlock:
1827 rcu_read_unlock(); 2113 rcu_read_unlock();
@@ -1874,6 +2160,14 @@ fail:
1874 return -ENOMEM; 2160 return -ENOMEM;
1875} 2161}
1876 2162
2163static void perf_mmap_free_page(unsigned long addr)
2164{
2165 struct page *page = virt_to_page((void *)addr);
2166
2167 page->mapping = NULL;
2168 __free_page(page);
2169}
2170
1877static void __perf_mmap_data_free(struct rcu_head *rcu_head) 2171static void __perf_mmap_data_free(struct rcu_head *rcu_head)
1878{ 2172{
1879 struct perf_mmap_data *data; 2173 struct perf_mmap_data *data;
@@ -1881,9 +2175,10 @@ static void __perf_mmap_data_free(struct rcu_head *rcu_head)
1881 2175
1882 data = container_of(rcu_head, struct perf_mmap_data, rcu_head); 2176 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
1883 2177
1884 free_page((unsigned long)data->user_page); 2178 perf_mmap_free_page((unsigned long)data->user_page);
1885 for (i = 0; i < data->nr_pages; i++) 2179 for (i = 0; i < data->nr_pages; i++)
1886 free_page((unsigned long)data->data_pages[i]); 2180 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2181
1887 kfree(data); 2182 kfree(data);
1888} 2183}
1889 2184
@@ -1920,9 +2215,10 @@ static void perf_mmap_close(struct vm_area_struct *vma)
1920} 2215}
1921 2216
1922static struct vm_operations_struct perf_mmap_vmops = { 2217static struct vm_operations_struct perf_mmap_vmops = {
1923 .open = perf_mmap_open, 2218 .open = perf_mmap_open,
1924 .close = perf_mmap_close, 2219 .close = perf_mmap_close,
1925 .fault = perf_mmap_fault, 2220 .fault = perf_mmap_fault,
2221 .page_mkwrite = perf_mmap_fault,
1926}; 2222};
1927 2223
1928static int perf_mmap(struct file *file, struct vm_area_struct *vma) 2224static int perf_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1936,7 +2232,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1936 long user_extra, extra; 2232 long user_extra, extra;
1937 int ret = 0; 2233 int ret = 0;
1938 2234
1939 if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) 2235 if (!(vma->vm_flags & VM_SHARED))
1940 return -EINVAL; 2236 return -EINVAL;
1941 2237
1942 vma_size = vma->vm_end - vma->vm_start; 2238 vma_size = vma->vm_end - vma->vm_start;
@@ -1995,10 +2291,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1995 atomic_long_add(user_extra, &user->locked_vm); 2291 atomic_long_add(user_extra, &user->locked_vm);
1996 vma->vm_mm->locked_vm += extra; 2292 vma->vm_mm->locked_vm += extra;
1997 counter->data->nr_locked = extra; 2293 counter->data->nr_locked = extra;
2294 if (vma->vm_flags & VM_WRITE)
2295 counter->data->writable = 1;
2296
1998unlock: 2297unlock:
1999 mutex_unlock(&counter->mmap_mutex); 2298 mutex_unlock(&counter->mmap_mutex);
2000 2299
2001 vma->vm_flags &= ~VM_MAYWRITE;
2002 vma->vm_flags |= VM_RESERVED; 2300 vma->vm_flags |= VM_RESERVED;
2003 vma->vm_ops = &perf_mmap_vmops; 2301 vma->vm_ops = &perf_mmap_vmops;
2004 2302
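For context, a hedged sketch of the mapping this change permits: with the VM_WRITE restriction gone, the reader can map the buffer read-write (still MAP_SHARED), which makes the kernel set data->writable and start honouring the reader's data_tail.

#include <stddef.h>
#include <sys/mman.h>
#include <unistd.h>

/*
 * Sketch: map the control page plus nr_pages data pages (power of two).
 * fd is an open perf counter file descriptor.
 */
static void *map_counter_buffer(int fd, size_t nr_pages)
{
        size_t page = (size_t)sysconf(_SC_PAGESIZE);

        return mmap(NULL, (nr_pages + 1) * page,
                    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
}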
@@ -2064,7 +2362,7 @@ static void perf_pending_counter(struct perf_pending_entry *entry)
2064 2362
2065 if (counter->pending_disable) { 2363 if (counter->pending_disable) {
2066 counter->pending_disable = 0; 2364 counter->pending_disable = 0;
2067 perf_counter_disable(counter); 2365 __perf_counter_disable(counter);
2068 } 2366 }
2069 2367
2070 if (counter->pending_wakeup) { 2368 if (counter->pending_wakeup) {
@@ -2175,11 +2473,38 @@ struct perf_output_handle {
2175 unsigned long head; 2473 unsigned long head;
2176 unsigned long offset; 2474 unsigned long offset;
2177 int nmi; 2475 int nmi;
2178 int overflow; 2476 int sample;
2179 int locked; 2477 int locked;
2180 unsigned long flags; 2478 unsigned long flags;
2181}; 2479};
2182 2480
2481static bool perf_output_space(struct perf_mmap_data *data,
2482 unsigned int offset, unsigned int head)
2483{
2484 unsigned long tail;
2485 unsigned long mask;
2486
2487 if (!data->writable)
2488 return true;
2489
2490 mask = (data->nr_pages << PAGE_SHIFT) - 1;
2491 /*
2492 * Userspace could choose to issue a mb() before updating the tail
2493 * pointer. So that all reads will be completed before the write is
2494 * issued.
2495 */
2496 tail = ACCESS_ONCE(data->user_page->data_tail);
2497 smp_rmb();
2498
2499 offset = (offset - tail) & mask;
2500 head = (head - tail) & mask;
2501
2502 if ((int)(head - offset) < 0)
2503 return false;
2504
2505 return true;
2506}
2507
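A worked example of the check above, with invented numbers and a stand-alone rendition of the arithmetic:

#include <stdbool.h>
#include <stdio.h>

/* Same masking as perf_output_space(), here for 4 data pages of 4 KiB. */
static bool output_space(unsigned long tail, unsigned long offset,
                         unsigned long head, unsigned long mask)
{
        offset = (offset - tail) & mask;
        head   = (head   - tail) & mask;

        return (long)(head - offset) >= 0;
}

int main(void)
{
        unsigned long mask = 4 * 4096 - 1;                          /* 0x3fff */

        /* reader at 0x0800, writer reserving 0x3f00..0x4200: fits */
        printf("%d\n", output_space(0x0800, 0x3f00, 0x4200, mask)); /* 1 */

        /* same reader, reservation 0x4600..0x4a00 would lap it: reject */
        printf("%d\n", output_space(0x0800, 0x4600, 0x4a00, mask)); /* 0 */

        return 0;
}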
2183static void perf_output_wakeup(struct perf_output_handle *handle) 2508static void perf_output_wakeup(struct perf_output_handle *handle)
2184{ 2509{
2185 atomic_set(&handle->data->poll, POLL_IN); 2510 atomic_set(&handle->data->poll, POLL_IN);
@@ -2270,12 +2595,57 @@ out:
2270 local_irq_restore(handle->flags); 2595 local_irq_restore(handle->flags);
2271} 2596}
2272 2597
2598static void perf_output_copy(struct perf_output_handle *handle,
2599 const void *buf, unsigned int len)
2600{
2601 unsigned int pages_mask;
2602 unsigned int offset;
2603 unsigned int size;
2604 void **pages;
2605
2606 offset = handle->offset;
2607 pages_mask = handle->data->nr_pages - 1;
2608 pages = handle->data->data_pages;
2609
2610 do {
2611 unsigned int page_offset;
2612 int nr;
2613
2614 nr = (offset >> PAGE_SHIFT) & pages_mask;
2615 page_offset = offset & (PAGE_SIZE - 1);
2616 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2617
2618 memcpy(pages[nr] + page_offset, buf, size);
2619
2620 len -= size;
2621 buf += size;
2622 offset += size;
2623 } while (len);
2624
2625 handle->offset = offset;
2626
2627 /*
2628 * Check we didn't copy past our reservation window, taking the
2629 * possible unsigned int wrap into account.
2630 */
2631 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2632}
2633
2634#define perf_output_put(handle, x) \
2635 perf_output_copy((handle), &(x), sizeof(x))
2636
2273static int perf_output_begin(struct perf_output_handle *handle, 2637static int perf_output_begin(struct perf_output_handle *handle,
2274 struct perf_counter *counter, unsigned int size, 2638 struct perf_counter *counter, unsigned int size,
2275 int nmi, int overflow) 2639 int nmi, int sample)
2276{ 2640{
2277 struct perf_mmap_data *data; 2641 struct perf_mmap_data *data;
2278 unsigned int offset, head; 2642 unsigned int offset, head;
2643 int have_lost;
2644 struct {
2645 struct perf_event_header header;
2646 u64 id;
2647 u64 lost;
2648 } lost_event;
2279 2649
2280 /* 2650 /*
2281 * For inherited counters we send all the output towards the parent. 2651 * For inherited counters we send all the output towards the parent.
@@ -2288,19 +2658,25 @@ static int perf_output_begin(struct perf_output_handle *handle,
2288 if (!data) 2658 if (!data)
2289 goto out; 2659 goto out;
2290 2660
2291 handle->data = data; 2661 handle->data = data;
2292 handle->counter = counter; 2662 handle->counter = counter;
2293 handle->nmi = nmi; 2663 handle->nmi = nmi;
2294 handle->overflow = overflow; 2664 handle->sample = sample;
2295 2665
2296 if (!data->nr_pages) 2666 if (!data->nr_pages)
2297 goto fail; 2667 goto fail;
2298 2668
2669 have_lost = atomic_read(&data->lost);
2670 if (have_lost)
2671 size += sizeof(lost_event);
2672
2299 perf_output_lock(handle); 2673 perf_output_lock(handle);
2300 2674
2301 do { 2675 do {
2302 offset = head = atomic_long_read(&data->head); 2676 offset = head = atomic_long_read(&data->head);
2303 head += size; 2677 head += size;
2678 if (unlikely(!perf_output_space(data, offset, head)))
2679 goto fail;
2304 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); 2680 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2305 2681
2306 handle->offset = offset; 2682 handle->offset = offset;
@@ -2309,55 +2685,27 @@ static int perf_output_begin(struct perf_output_handle *handle,
2309 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT)) 2685 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
2310 atomic_set(&data->wakeup, 1); 2686 atomic_set(&data->wakeup, 1);
2311 2687
2688 if (have_lost) {
2689 lost_event.header.type = PERF_EVENT_LOST;
2690 lost_event.header.misc = 0;
2691 lost_event.header.size = sizeof(lost_event);
2692 lost_event.id = counter->id;
2693 lost_event.lost = atomic_xchg(&data->lost, 0);
2694
2695 perf_output_put(handle, lost_event);
2696 }
2697
2312 return 0; 2698 return 0;
2313 2699
2314fail: 2700fail:
2315 perf_output_wakeup(handle); 2701 atomic_inc(&data->lost);
2702 perf_output_unlock(handle);
2316out: 2703out:
2317 rcu_read_unlock(); 2704 rcu_read_unlock();
2318 2705
2319 return -ENOSPC; 2706 return -ENOSPC;
2320} 2707}
2321 2708
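For reference, a hedged view of the record the code above prepends once space becomes available again (struct name invented; the fields mirror the on-stack lost_event):

#include <stdint.h>
#include <linux/perf_counter.h>         /* struct perf_event_header */

/*
 * PERF_EVENT_LOST record: written in front of the next successfully
 * reserved record, telling the reader how many records were dropped
 * while the buffer was full.
 */
struct lost_record {                    /* invented name */
        struct perf_event_header header;        /* .type = PERF_EVENT_LOST  */
        uint64_t id;                            /* counter that lost them   */
        uint64_t lost;                          /* number of lost records   */
};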
2322static void perf_output_copy(struct perf_output_handle *handle,
2323 const void *buf, unsigned int len)
2324{
2325 unsigned int pages_mask;
2326 unsigned int offset;
2327 unsigned int size;
2328 void **pages;
2329
2330 offset = handle->offset;
2331 pages_mask = handle->data->nr_pages - 1;
2332 pages = handle->data->data_pages;
2333
2334 do {
2335 unsigned int page_offset;
2336 int nr;
2337
2338 nr = (offset >> PAGE_SHIFT) & pages_mask;
2339 page_offset = offset & (PAGE_SIZE - 1);
2340 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2341
2342 memcpy(pages[nr] + page_offset, buf, size);
2343
2344 len -= size;
2345 buf += size;
2346 offset += size;
2347 } while (len);
2348
2349 handle->offset = offset;
2350
2351 /*
2352 * Check we didn't copy past our reservation window, taking the
2353 * possible unsigned int wrap into account.
2354 */
2355 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2356}
2357
2358#define perf_output_put(handle, x) \
2359 perf_output_copy((handle), &(x), sizeof(x))
2360
2361static void perf_output_end(struct perf_output_handle *handle) 2709static void perf_output_end(struct perf_output_handle *handle)
2362{ 2710{
2363 struct perf_counter *counter = handle->counter; 2711 struct perf_counter *counter = handle->counter;
@@ -2365,7 +2713,7 @@ static void perf_output_end(struct perf_output_handle *handle)
2365 2713
2366 int wakeup_events = counter->attr.wakeup_events; 2714 int wakeup_events = counter->attr.wakeup_events;
2367 2715
2368 if (handle->overflow && wakeup_events) { 2716 if (handle->sample && wakeup_events) {
2369 int events = atomic_inc_return(&data->events); 2717 int events = atomic_inc_return(&data->events);
2370 if (events >= wakeup_events) { 2718 if (events >= wakeup_events) {
2371 atomic_sub(wakeup_events, &data->events); 2719 atomic_sub(wakeup_events, &data->events);
@@ -2399,7 +2747,80 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
2399 return task_pid_nr_ns(p, counter->ns); 2747 return task_pid_nr_ns(p, counter->ns);
2400} 2748}
2401 2749
2402static void perf_counter_output(struct perf_counter *counter, int nmi, 2750static void perf_output_read_one(struct perf_output_handle *handle,
2751 struct perf_counter *counter)
2752{
2753 u64 read_format = counter->attr.read_format;
2754 u64 values[4];
2755 int n = 0;
2756
2757 values[n++] = atomic64_read(&counter->count);
2758 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2759 values[n++] = counter->total_time_enabled +
2760 atomic64_read(&counter->child_total_time_enabled);
2761 }
2762 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2763 values[n++] = counter->total_time_running +
2764 atomic64_read(&counter->child_total_time_running);
2765 }
2766 if (read_format & PERF_FORMAT_ID)
2767 values[n++] = primary_counter_id(counter);
2768
2769 perf_output_copy(handle, values, n * sizeof(u64));
2770}
2771
2772/*
2773 * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
2774 */
2775static void perf_output_read_group(struct perf_output_handle *handle,
2776 struct perf_counter *counter)
2777{
2778 struct perf_counter *leader = counter->group_leader, *sub;
2779 u64 read_format = counter->attr.read_format;
2780 u64 values[5];
2781 int n = 0;
2782
2783 values[n++] = 1 + leader->nr_siblings;
2784
2785 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2786 values[n++] = leader->total_time_enabled;
2787
2788 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2789 values[n++] = leader->total_time_running;
2790
2791 if (leader != counter)
2792 leader->pmu->read(leader);
2793
2794 values[n++] = atomic64_read(&leader->count);
2795 if (read_format & PERF_FORMAT_ID)
2796 values[n++] = primary_counter_id(leader);
2797
2798 perf_output_copy(handle, values, n * sizeof(u64));
2799
2800 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2801 n = 0;
2802
2803 if (sub != counter)
2804 sub->pmu->read(sub);
2805
2806 values[n++] = atomic64_read(&sub->count);
2807 if (read_format & PERF_FORMAT_ID)
2808 values[n++] = primary_counter_id(sub);
2809
2810 perf_output_copy(handle, values, n * sizeof(u64));
2811 }
2812}
2813
2814static void perf_output_read(struct perf_output_handle *handle,
2815 struct perf_counter *counter)
2816{
2817 if (counter->attr.read_format & PERF_FORMAT_GROUP)
2818 perf_output_read_group(handle, counter);
2819 else
2820 perf_output_read_one(handle, counter);
2821}
2822
2823void perf_counter_output(struct perf_counter *counter, int nmi,
2403 struct perf_sample_data *data) 2824 struct perf_sample_data *data)
2404{ 2825{
2405 int ret; 2826 int ret;
@@ -2410,10 +2831,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2410 struct { 2831 struct {
2411 u32 pid, tid; 2832 u32 pid, tid;
2412 } tid_entry; 2833 } tid_entry;
2413 struct {
2414 u64 id;
2415 u64 counter;
2416 } group_entry;
2417 struct perf_callchain_entry *callchain = NULL; 2834 struct perf_callchain_entry *callchain = NULL;
2418 int callchain_size = 0; 2835 int callchain_size = 0;
2419 u64 time; 2836 u64 time;
@@ -2421,15 +2838,14 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2421 u32 cpu, reserved; 2838 u32 cpu, reserved;
2422 } cpu_entry; 2839 } cpu_entry;
2423 2840
2424 header.type = 0; 2841 header.type = PERF_EVENT_SAMPLE;
2425 header.size = sizeof(header); 2842 header.size = sizeof(header);
2426 2843
2427 header.misc = PERF_EVENT_MISC_OVERFLOW; 2844 header.misc = 0;
2428 header.misc |= perf_misc_flags(data->regs); 2845 header.misc |= perf_misc_flags(data->regs);
2429 2846
2430 if (sample_type & PERF_SAMPLE_IP) { 2847 if (sample_type & PERF_SAMPLE_IP) {
2431 ip = perf_instruction_pointer(data->regs); 2848 ip = perf_instruction_pointer(data->regs);
2432 header.type |= PERF_SAMPLE_IP;
2433 header.size += sizeof(ip); 2849 header.size += sizeof(ip);
2434 } 2850 }
2435 2851
@@ -2438,7 +2854,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2438 tid_entry.pid = perf_counter_pid(counter, current); 2854 tid_entry.pid = perf_counter_pid(counter, current);
2439 tid_entry.tid = perf_counter_tid(counter, current); 2855 tid_entry.tid = perf_counter_tid(counter, current);
2440 2856
2441 header.type |= PERF_SAMPLE_TID;
2442 header.size += sizeof(tid_entry); 2857 header.size += sizeof(tid_entry);
2443 } 2858 }
2444 2859
@@ -2448,47 +2863,51 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2448 */ 2863 */
2449 time = sched_clock(); 2864 time = sched_clock();
2450 2865
2451 header.type |= PERF_SAMPLE_TIME;
2452 header.size += sizeof(u64); 2866 header.size += sizeof(u64);
2453 } 2867 }
2454 2868
2455 if (sample_type & PERF_SAMPLE_ADDR) { 2869 if (sample_type & PERF_SAMPLE_ADDR)
2456 header.type |= PERF_SAMPLE_ADDR;
2457 header.size += sizeof(u64); 2870 header.size += sizeof(u64);
2458 }
2459 2871
2460 if (sample_type & PERF_SAMPLE_ID) { 2872 if (sample_type & PERF_SAMPLE_ID)
2461 header.type |= PERF_SAMPLE_ID; 2873 header.size += sizeof(u64);
2874
2875 if (sample_type & PERF_SAMPLE_STREAM_ID)
2462 header.size += sizeof(u64); 2876 header.size += sizeof(u64);
2463 }
2464 2877
2465 if (sample_type & PERF_SAMPLE_CPU) { 2878 if (sample_type & PERF_SAMPLE_CPU) {
2466 header.type |= PERF_SAMPLE_CPU;
2467 header.size += sizeof(cpu_entry); 2879 header.size += sizeof(cpu_entry);
2468 2880
2469 cpu_entry.cpu = raw_smp_processor_id(); 2881 cpu_entry.cpu = raw_smp_processor_id();
2882 cpu_entry.reserved = 0;
2470 } 2883 }
2471 2884
2472 if (sample_type & PERF_SAMPLE_PERIOD) { 2885 if (sample_type & PERF_SAMPLE_PERIOD)
2473 header.type |= PERF_SAMPLE_PERIOD;
2474 header.size += sizeof(u64); 2886 header.size += sizeof(u64);
2475 }
2476 2887
2477 if (sample_type & PERF_SAMPLE_GROUP) { 2888 if (sample_type & PERF_SAMPLE_READ)
2478 header.type |= PERF_SAMPLE_GROUP; 2889 header.size += perf_counter_read_size(counter);
2479 header.size += sizeof(u64) +
2480 counter->nr_siblings * sizeof(group_entry);
2481 }
2482 2890
2483 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 2891 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2484 callchain = perf_callchain(data->regs); 2892 callchain = perf_callchain(data->regs);
2485 2893
2486 if (callchain) { 2894 if (callchain) {
2487 callchain_size = (1 + callchain->nr) * sizeof(u64); 2895 callchain_size = (1 + callchain->nr) * sizeof(u64);
2488
2489 header.type |= PERF_SAMPLE_CALLCHAIN;
2490 header.size += callchain_size; 2896 header.size += callchain_size;
2491 } 2897 } else
2898 header.size += sizeof(u64);
2899 }
2900
2901 if (sample_type & PERF_SAMPLE_RAW) {
2902 int size = sizeof(u32);
2903
2904 if (data->raw)
2905 size += data->raw->size;
2906 else
2907 size += sizeof(u32);
2908
2909 WARN_ON_ONCE(size & (sizeof(u64)-1));
2910 header.size += size;
2492 } 2911 }
2493 2912
2494 ret = perf_output_begin(&handle, counter, header.size, nmi, 1); 2913 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
@@ -2509,7 +2928,13 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2509 if (sample_type & PERF_SAMPLE_ADDR) 2928 if (sample_type & PERF_SAMPLE_ADDR)
2510 perf_output_put(&handle, data->addr); 2929 perf_output_put(&handle, data->addr);
2511 2930
2512 if (sample_type & PERF_SAMPLE_ID) 2931 if (sample_type & PERF_SAMPLE_ID) {
2932 u64 id = primary_counter_id(counter);
2933
2934 perf_output_put(&handle, id);
2935 }
2936
2937 if (sample_type & PERF_SAMPLE_STREAM_ID)
2513 perf_output_put(&handle, counter->id); 2938 perf_output_put(&handle, counter->id);
2514 2939
2515 if (sample_type & PERF_SAMPLE_CPU) 2940 if (sample_type & PERF_SAMPLE_CPU)
@@ -2518,76 +2943,125 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2518 if (sample_type & PERF_SAMPLE_PERIOD) 2943 if (sample_type & PERF_SAMPLE_PERIOD)
2519 perf_output_put(&handle, data->period); 2944 perf_output_put(&handle, data->period);
2520 2945
2521 /* 2946 if (sample_type & PERF_SAMPLE_READ)
2522 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. 2947 perf_output_read(&handle, counter);
2523 */
2524 if (sample_type & PERF_SAMPLE_GROUP) {
2525 struct perf_counter *leader, *sub;
2526 u64 nr = counter->nr_siblings;
2527
2528 perf_output_put(&handle, nr);
2529
2530 leader = counter->group_leader;
2531 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2532 if (sub != counter)
2533 sub->pmu->read(sub);
2534 2948
2535 group_entry.id = sub->id; 2949 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2536 group_entry.counter = atomic64_read(&sub->count); 2950 if (callchain)
2951 perf_output_copy(&handle, callchain, callchain_size);
2952 else {
2953 u64 nr = 0;
2954 perf_output_put(&handle, nr);
2955 }
2956 }
2537 2957
2538 perf_output_put(&handle, group_entry); 2958 if (sample_type & PERF_SAMPLE_RAW) {
2959 if (data->raw) {
2960 perf_output_put(&handle, data->raw->size);
2961 perf_output_copy(&handle, data->raw->data, data->raw->size);
2962 } else {
2963 struct {
2964 u32 size;
2965 u32 data;
2966 } raw = {
2967 .size = sizeof(u32),
2968 .data = 0,
2969 };
2970 perf_output_put(&handle, raw);
2539 } 2971 }
2540 } 2972 }
2541 2973
2542 if (callchain) 2974 perf_output_end(&handle);
2543 perf_output_copy(&handle, callchain, callchain_size); 2975}
2976
2977/*
2978 * read event
2979 */
2980
2981struct perf_read_event {
2982 struct perf_event_header header;
2983
2984 u32 pid;
2985 u32 tid;
2986};
2987
2988static void
2989perf_counter_read_event(struct perf_counter *counter,
2990 struct task_struct *task)
2991{
2992 struct perf_output_handle handle;
2993 struct perf_read_event event = {
2994 .header = {
2995 .type = PERF_EVENT_READ,
2996 .misc = 0,
2997 .size = sizeof(event) + perf_counter_read_size(counter),
2998 },
2999 .pid = perf_counter_pid(counter, task),
3000 .tid = perf_counter_tid(counter, task),
3001 };
3002 int ret;
3003
3004 ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
3005 if (ret)
3006 return;
3007
3008 perf_output_put(&handle, event);
3009 perf_output_read(&handle, counter);
2544 3010
2545 perf_output_end(&handle); 3011 perf_output_end(&handle);
2546} 3012}
2547 3013
2548/* 3014/*
2549 * fork tracking 3015 * task tracking -- fork/exit
3016 *
3017 * enabled by: attr.comm | attr.mmap | attr.task
2550 */ 3018 */
2551 3019
2552struct perf_fork_event { 3020struct perf_task_event {
2553 struct task_struct *task; 3021 struct task_struct *task;
3022 struct perf_counter_context *task_ctx;
2554 3023
2555 struct { 3024 struct {
2556 struct perf_event_header header; 3025 struct perf_event_header header;
2557 3026
2558 u32 pid; 3027 u32 pid;
2559 u32 ppid; 3028 u32 ppid;
3029 u32 tid;
3030 u32 ptid;
2560 } event; 3031 } event;
2561}; 3032};
2562 3033
2563static void perf_counter_fork_output(struct perf_counter *counter, 3034static void perf_counter_task_output(struct perf_counter *counter,
2564 struct perf_fork_event *fork_event) 3035 struct perf_task_event *task_event)
2565{ 3036{
2566 struct perf_output_handle handle; 3037 struct perf_output_handle handle;
2567 int size = fork_event->event.header.size; 3038 int size = task_event->event.header.size;
2568 struct task_struct *task = fork_event->task; 3039 struct task_struct *task = task_event->task;
2569 int ret = perf_output_begin(&handle, counter, size, 0, 0); 3040 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2570 3041
2571 if (ret) 3042 if (ret)
2572 return; 3043 return;
2573 3044
2574 fork_event->event.pid = perf_counter_pid(counter, task); 3045 task_event->event.pid = perf_counter_pid(counter, task);
2575 fork_event->event.ppid = perf_counter_pid(counter, task->real_parent); 3046 task_event->event.ppid = perf_counter_pid(counter, current);
3047
3048 task_event->event.tid = perf_counter_tid(counter, task);
3049 task_event->event.ptid = perf_counter_tid(counter, current);
2576 3050
2577 perf_output_put(&handle, fork_event->event); 3051 perf_output_put(&handle, task_event->event);
2578 perf_output_end(&handle); 3052 perf_output_end(&handle);
2579} 3053}
2580 3054
2581static int perf_counter_fork_match(struct perf_counter *counter) 3055static int perf_counter_task_match(struct perf_counter *counter)
2582{ 3056{
2583 if (counter->attr.comm || counter->attr.mmap) 3057 if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
2584 return 1; 3058 return 1;
2585 3059
2586 return 0; 3060 return 0;
2587} 3061}
2588 3062
2589static void perf_counter_fork_ctx(struct perf_counter_context *ctx, 3063static void perf_counter_task_ctx(struct perf_counter_context *ctx,
2590 struct perf_fork_event *fork_event) 3064 struct perf_task_event *task_event)
2591{ 3065{
2592 struct perf_counter *counter; 3066 struct perf_counter *counter;
2593 3067
@@ -2596,51 +3070,62 @@ static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
2596 3070
2597 rcu_read_lock(); 3071 rcu_read_lock();
2598 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { 3072 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2599 if (perf_counter_fork_match(counter)) 3073 if (perf_counter_task_match(counter))
2600 perf_counter_fork_output(counter, fork_event); 3074 perf_counter_task_output(counter, task_event);
2601 } 3075 }
2602 rcu_read_unlock(); 3076 rcu_read_unlock();
2603} 3077}
2604 3078
2605static void perf_counter_fork_event(struct perf_fork_event *fork_event) 3079static void perf_counter_task_event(struct perf_task_event *task_event)
2606{ 3080{
2607 struct perf_cpu_context *cpuctx; 3081 struct perf_cpu_context *cpuctx;
2608 struct perf_counter_context *ctx; 3082 struct perf_counter_context *ctx = task_event->task_ctx;
2609 3083
2610 cpuctx = &get_cpu_var(perf_cpu_context); 3084 cpuctx = &get_cpu_var(perf_cpu_context);
2611 perf_counter_fork_ctx(&cpuctx->ctx, fork_event); 3085 perf_counter_task_ctx(&cpuctx->ctx, task_event);
2612 put_cpu_var(perf_cpu_context); 3086 put_cpu_var(perf_cpu_context);
2613 3087
2614 rcu_read_lock(); 3088 rcu_read_lock();
2615 /* 3089 if (!ctx)
2616 * doesn't really matter which of the child contexts the 3090 ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
2617 * events ends up in.
2618 */
2619 ctx = rcu_dereference(current->perf_counter_ctxp);
2620 if (ctx) 3091 if (ctx)
2621 perf_counter_fork_ctx(ctx, fork_event); 3092 perf_counter_task_ctx(ctx, task_event);
2622 rcu_read_unlock(); 3093 rcu_read_unlock();
2623} 3094}
2624 3095
2625void perf_counter_fork(struct task_struct *task) 3096static void perf_counter_task(struct task_struct *task,
3097 struct perf_counter_context *task_ctx,
3098 int new)
2626{ 3099{
2627 struct perf_fork_event fork_event; 3100 struct perf_task_event task_event;
2628 3101
2629 if (!atomic_read(&nr_comm_counters) && 3102 if (!atomic_read(&nr_comm_counters) &&
2630 !atomic_read(&nr_mmap_counters)) 3103 !atomic_read(&nr_mmap_counters) &&
3104 !atomic_read(&nr_task_counters))
2631 return; 3105 return;
2632 3106
2633 fork_event = (struct perf_fork_event){ 3107 task_event = (struct perf_task_event){
2634 .task = task, 3108 .task = task,
2635 .event = { 3109 .task_ctx = task_ctx,
3110 .event = {
2636 .header = { 3111 .header = {
2637 .type = PERF_EVENT_FORK, 3112 .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
2638 .size = sizeof(fork_event.event), 3113 .misc = 0,
3114 .size = sizeof(task_event.event),
2639 }, 3115 },
3116 /* .pid */
3117 /* .ppid */
3118 /* .tid */
3119 /* .ptid */
2640 }, 3120 },
2641 }; 3121 };
2642 3122
2643 perf_counter_fork_event(&fork_event); 3123 perf_counter_task_event(&task_event);
3124}
3125
3126void perf_counter_fork(struct task_struct *task)
3127{
3128 perf_counter_task(task, NULL, 1);
2644} 3129}
2645 3130
2646/* 3131/*
@@ -2708,8 +3193,10 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event)
2708 struct perf_cpu_context *cpuctx; 3193 struct perf_cpu_context *cpuctx;
2709 struct perf_counter_context *ctx; 3194 struct perf_counter_context *ctx;
2710 unsigned int size; 3195 unsigned int size;
2711 char *comm = comm_event->task->comm; 3196 char comm[TASK_COMM_LEN];
2712 3197
3198 memset(comm, 0, sizeof(comm));
3199 strncpy(comm, comm_event->task->comm, sizeof(comm));
2713 size = ALIGN(strlen(comm)+1, sizeof(u64)); 3200 size = ALIGN(strlen(comm)+1, sizeof(u64));
2714 3201
2715 comm_event->comm = comm; 3202 comm_event->comm = comm;
@@ -2736,13 +3223,24 @@ void perf_counter_comm(struct task_struct *task)
2736{ 3223{
2737 struct perf_comm_event comm_event; 3224 struct perf_comm_event comm_event;
2738 3225
3226 if (task->perf_counter_ctxp)
3227 perf_counter_enable_on_exec(task);
3228
2739 if (!atomic_read(&nr_comm_counters)) 3229 if (!atomic_read(&nr_comm_counters))
2740 return; 3230 return;
2741 3231
2742 comm_event = (struct perf_comm_event){ 3232 comm_event = (struct perf_comm_event){
2743 .task = task, 3233 .task = task,
3234 /* .comm */
3235 /* .comm_size */
2744 .event = { 3236 .event = {
2745 .header = { .type = PERF_EVENT_COMM, }, 3237 .header = {
3238 .type = PERF_EVENT_COMM,
3239 .misc = 0,
3240 /* .size */
3241 },
3242 /* .pid */
3243 /* .tid */
2746 }, 3244 },
2747 }; 3245 };
2748 3246
@@ -2825,8 +3323,15 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
2825 char *buf = NULL; 3323 char *buf = NULL;
2826 const char *name; 3324 const char *name;
2827 3325
3326 memset(tmp, 0, sizeof(tmp));
3327
2828 if (file) { 3328 if (file) {
2829 buf = kzalloc(PATH_MAX, GFP_KERNEL); 3329 /*
3330 * d_path works from the end of the buffer backwards, so we
3331 * need to add enough zero bytes after the string to handle
3332 * the 64bit alignment we do later.
3333 */
3334 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
2830 if (!buf) { 3335 if (!buf) {
2831 name = strncpy(tmp, "//enomem", sizeof(tmp)); 3336 name = strncpy(tmp, "//enomem", sizeof(tmp));
2832 goto got_name; 3337 goto got_name;
@@ -2837,9 +3342,11 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
2837 goto got_name; 3342 goto got_name;
2838 } 3343 }
2839 } else { 3344 } else {
2840 name = arch_vma_name(mmap_event->vma); 3345 if (arch_vma_name(mmap_event->vma)) {
2841 if (name) 3346 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3347 sizeof(tmp));
2842 goto got_name; 3348 goto got_name;
3349 }
2843 3350
2844 if (!vma->vm_mm) { 3351 if (!vma->vm_mm) {
2845 name = strncpy(tmp, "[vdso]", sizeof(tmp)); 3352 name = strncpy(tmp, "[vdso]", sizeof(tmp));
@@ -2884,8 +3391,16 @@ void __perf_counter_mmap(struct vm_area_struct *vma)
2884 3391
2885 mmap_event = (struct perf_mmap_event){ 3392 mmap_event = (struct perf_mmap_event){
2886 .vma = vma, 3393 .vma = vma,
3394 /* .file_name */
3395 /* .file_size */
2887 .event = { 3396 .event = {
2888 .header = { .type = PERF_EVENT_MMAP, }, 3397 .header = {
3398 .type = PERF_EVENT_MMAP,
3399 .misc = 0,
3400 /* .size */
3401 },
3402 /* .pid */
3403 /* .tid */
2889 .start = vma->vm_start, 3404 .start = vma->vm_start,
2890 .len = vma->vm_end - vma->vm_start, 3405 .len = vma->vm_end - vma->vm_start,
2891 .pgoff = vma->vm_pgoff, 3406 .pgoff = vma->vm_pgoff,
@@ -2896,49 +3411,6 @@ void __perf_counter_mmap(struct vm_area_struct *vma)
2896} 3411}
2897 3412
2898/* 3413/*
2899 * Log sample_period changes so that analyzing tools can re-normalize the
2900 * event flow.
2901 */
2902
2903struct freq_event {
2904 struct perf_event_header header;
2905 u64 time;
2906 u64 id;
2907 u64 period;
2908};
2909
2910static void perf_log_period(struct perf_counter *counter, u64 period)
2911{
2912 struct perf_output_handle handle;
2913 struct freq_event event;
2914 int ret;
2915
2916 if (counter->hw.sample_period == period)
2917 return;
2918
2919 if (counter->attr.sample_type & PERF_SAMPLE_PERIOD)
2920 return;
2921
2922 event = (struct freq_event) {
2923 .header = {
2924 .type = PERF_EVENT_PERIOD,
2925 .misc = 0,
2926 .size = sizeof(event),
2927 },
2928 .time = sched_clock(),
2929 .id = counter->id,
2930 .period = period,
2931 };
2932
2933 ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0);
2934 if (ret)
2935 return;
2936
2937 perf_output_put(&handle, event);
2938 perf_output_end(&handle);
2939}
2940
2941/*
2942 * IRQ throttle logging 3414 * IRQ throttle logging
2943 */ 3415 */
2944 3416
@@ -2951,16 +3423,21 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
2951 struct perf_event_header header; 3423 struct perf_event_header header;
2952 u64 time; 3424 u64 time;
2953 u64 id; 3425 u64 id;
3426 u64 stream_id;
2954 } throttle_event = { 3427 } throttle_event = {
2955 .header = { 3428 .header = {
2956 .type = PERF_EVENT_THROTTLE + 1, 3429 .type = PERF_EVENT_THROTTLE,
2957 .misc = 0, 3430 .misc = 0,
2958 .size = sizeof(throttle_event), 3431 .size = sizeof(throttle_event),
2959 }, 3432 },
2960 .time = sched_clock(), 3433 .time = sched_clock(),
2961 .id = counter->id, 3434 .id = primary_counter_id(counter),
3435 .stream_id = counter->id,
2962 }; 3436 };
2963 3437
3438 if (enable)
3439 throttle_event.header.type = PERF_EVENT_UNTHROTTLE;
3440
2964 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); 3441 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
2965 if (ret) 3442 if (ret)
2966 return; 3443 return;
@@ -2970,7 +3447,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
2970} 3447}
2971 3448
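[editor's note] With this hunk the throttle record grows a stream_id field: .id now reports the primary (parent) counter id so tools aggregating inherited counters see one identity, .stream_id still names the concrete counter instance, and the header type encodes THROTTLE vs UNTHROTTLE directly instead of the old "+ 1" trick. A consumer-side sketch of the resulting layout, assuming the usual u32/u16/u16 event header; the struct names are local stand-ins, the authoritative definitions live in the perf_counter headers:

#include <stdint.h>
#include <stdio.h>

struct sketch_event_header {
    uint32_t type;       /* PERF_EVENT_THROTTLE or PERF_EVENT_UNTHROTTLE */
    uint16_t misc;
    uint16_t size;       /* sizeof(struct sketch_throttle_event) */
};

struct sketch_throttle_event {
    struct sketch_event_header header;
    uint64_t time;       /* sched_clock() timestamp */
    uint64_t id;         /* primary_counter_id(): parent id for inherited counters */
    uint64_t stream_id;  /* id of this particular counter instance */
};

int main(void)
{
    printf("throttle record size = %zu bytes\n",
           sizeof(struct sketch_throttle_event));
    return 0;
}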
2972/* 3449/*
2973 * Generic counter overflow handling. 3450 * Generic counter overflow handling, sampling.
2974 */ 3451 */
2975 3452
2976int perf_counter_overflow(struct perf_counter *counter, int nmi, 3453int perf_counter_overflow(struct perf_counter *counter, int nmi,
@@ -3037,130 +3514,111 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
3037 * Generic software counter infrastructure 3514 * Generic software counter infrastructure
3038 */ 3515 */
3039 3516
3040static void perf_swcounter_update(struct perf_counter *counter) 3517/*
3518 * We directly increment counter->count and keep a second value in
3519 * counter->hw.period_left to count intervals. This period counter
3520 * is kept in the range [-sample_period, 0] so that we can use the
3521 * sign as trigger.
3522 */
3523
3524static u64 perf_swcounter_set_period(struct perf_counter *counter)
3041{ 3525{
3042 struct hw_perf_counter *hwc = &counter->hw; 3526 struct hw_perf_counter *hwc = &counter->hw;
3043 u64 prev, now; 3527 u64 period = hwc->last_period;
3044 s64 delta; 3528 u64 nr, offset;
3529 s64 old, val;
3530
3531 hwc->last_period = hwc->sample_period;
3045 3532
3046again: 3533again:
3047 prev = atomic64_read(&hwc->prev_count); 3534 old = val = atomic64_read(&hwc->period_left);
3048 now = atomic64_read(&hwc->count); 3535 if (val < 0)
3049 if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) 3536 return 0;
3050 goto again;
3051 3537
3052 delta = now - prev; 3538 nr = div64_u64(period + val, period);
3539 offset = nr * period;
3540 val -= offset;
3541 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3542 goto again;
3053 3543
3054 atomic64_add(delta, &counter->count); 3544 return nr;
3055 atomic64_sub(delta, &hwc->period_left);
3056} 3545}
3057 3546
3058static void perf_swcounter_set_period(struct perf_counter *counter) 3547static void perf_swcounter_overflow(struct perf_counter *counter,
3548 int nmi, struct perf_sample_data *data)
3059{ 3549{
3060 struct hw_perf_counter *hwc = &counter->hw; 3550 struct hw_perf_counter *hwc = &counter->hw;
3061 s64 left = atomic64_read(&hwc->period_left); 3551 u64 overflow;
3062 s64 period = hwc->sample_period;
3063 3552
3064 if (unlikely(left <= -period)) { 3553 data->period = counter->hw.last_period;
3065 left = period; 3554 overflow = perf_swcounter_set_period(counter);
3066 atomic64_set(&hwc->period_left, left);
3067 hwc->last_period = period;
3068 }
3069 3555
3070 if (unlikely(left <= 0)) { 3556 if (hwc->interrupts == MAX_INTERRUPTS)
3071 left += period; 3557 return;
3072 atomic64_add(period, &hwc->period_left);
3073 hwc->last_period = period;
3074 }
3075 3558
3076 atomic64_set(&hwc->prev_count, -left); 3559 for (; overflow; overflow--) {
3077 atomic64_set(&hwc->count, -left); 3560 if (perf_counter_overflow(counter, nmi, data)) {
3561 /*
3562 * We inhibit the overflow from happening when
3563 * hwc->interrupts == MAX_INTERRUPTS.
3564 */
3565 break;
3566 }
3567 }
3078} 3568}
3079 3569
3080static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) 3570static void perf_swcounter_unthrottle(struct perf_counter *counter)
3081{ 3571{
3082 enum hrtimer_restart ret = HRTIMER_RESTART;
3083 struct perf_sample_data data;
3084 struct perf_counter *counter;
3085 u64 period;
3086
3087 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3088 counter->pmu->read(counter);
3089
3090 data.addr = 0;
3091 data.regs = get_irq_regs();
3092 /* 3572 /*
3093 * In case we exclude kernel IPs or are somehow not in interrupt 3573 * Nothing to do, we already reset hwc->interrupts.
3094 * context, provide the next best thing, the user IP.
3095 */ 3574 */
3096 if ((counter->attr.exclude_kernel || !data.regs) &&
3097 !counter->attr.exclude_user)
3098 data.regs = task_pt_regs(current);
3099
3100 if (data.regs) {
3101 if (perf_counter_overflow(counter, 0, &data))
3102 ret = HRTIMER_NORESTART;
3103 }
3104
3105 period = max_t(u64, 10000, counter->hw.sample_period);
3106 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3107
3108 return ret;
3109} 3575}
3110 3576
3111static void perf_swcounter_overflow(struct perf_counter *counter, 3577static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3112 int nmi, struct pt_regs *regs, u64 addr) 3578 int nmi, struct perf_sample_data *data)
3113{ 3579{
3114 struct perf_sample_data data = { 3580 struct hw_perf_counter *hwc = &counter->hw;
3115 .regs = regs, 3581
3116 .addr = addr, 3582 atomic64_add(nr, &counter->count);
3117 .period = counter->hw.last_period,
3118 };
3119 3583
3120 perf_swcounter_update(counter); 3584 if (!hwc->sample_period)
3121 perf_swcounter_set_period(counter); 3585 return;
3122 if (perf_counter_overflow(counter, nmi, &data)) 3586
3123 /* soft-disable the counter */ 3587 if (!data->regs)
3124 ; 3588 return;
3125 3589
3590 if (!atomic64_add_negative(nr, &hwc->period_left))
3591 perf_swcounter_overflow(counter, nmi, data);
3126} 3592}
3127 3593
3128static int perf_swcounter_is_counting(struct perf_counter *counter) 3594static int perf_swcounter_is_counting(struct perf_counter *counter)
3129{ 3595{
3130 struct perf_counter_context *ctx; 3596 /*
3131 unsigned long flags; 3597 * The counter is active, we're good!
3132 int count; 3598 */
3133
3134 if (counter->state == PERF_COUNTER_STATE_ACTIVE) 3599 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
3135 return 1; 3600 return 1;
3136 3601
3602 /*
3603 * The counter is off/error, not counting.
3604 */
3137 if (counter->state != PERF_COUNTER_STATE_INACTIVE) 3605 if (counter->state != PERF_COUNTER_STATE_INACTIVE)
3138 return 0; 3606 return 0;
3139 3607
3140 /* 3608 /*
3141 * If the counter is inactive, it could be just because 3609 * The counter is inactive, if the context is active
3142 * its task is scheduled out, or because it's in a group 3610 * we're part of a group that didn't make it on the 'pmu',
3143 * which could not go on the PMU. We want to count in 3611 * not counting.
3144 * the first case but not the second. If the context is
3145 * currently active then an inactive software counter must
3146 * be the second case. If it's not currently active then
3147 * we need to know whether the counter was active when the
3148 * context was last active, which we can determine by
3149 * comparing counter->tstamp_stopped with ctx->time.
3150 *
3151 * We are within an RCU read-side critical section,
3152 * which protects the existence of *ctx.
3153 */ 3612 */
3154 ctx = counter->ctx; 3613 if (counter->ctx->is_active)
3155 spin_lock_irqsave(&ctx->lock, flags); 3614 return 0;
3156 count = 1; 3615
3157 /* Re-check state now we have the lock */ 3616 /*
3158 if (counter->state < PERF_COUNTER_STATE_INACTIVE || 3617 * We're inactive and the context is too, this means the
3159 counter->ctx->is_active || 3618 * task is scheduled out, we're counting events that happen
3160 counter->tstamp_stopped < ctx->time) 3619 * to us, like migration events.
3161 count = 0; 3620 */
3162 spin_unlock_irqrestore(&ctx->lock, flags); 3621 return 1;
3163 return count;
3164} 3622}
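[editor's note] The rewritten predicate replaces the old lock-and-compare-timestamps logic with the simpler rule documented inline: an inactive counter in an active context lost the race for the PMU as part of a group, while an inactive counter in an inactive context just has its task scheduled out and should still count per-task events such as migrations. A tiny sketch of that decision using a local enum, not the kernel's state values:

#include <stdbool.h>
#include <stdio.h>

enum state { STATE_OFF, STATE_INACTIVE, STATE_ACTIVE };

static bool is_counting(enum state counter_state, bool ctx_is_active)
{
    if (counter_state == STATE_ACTIVE)
        return true;               /* scheduled in: counting */
    if (counter_state != STATE_INACTIVE)
        return false;              /* off/error: not counting */
    return !ctx_is_active;         /* inactive ctx => task is out, still
                                      count events that happen to us */
}

int main(void)
{
    printf("%d %d %d\n",
           is_counting(STATE_ACTIVE,   true),    /* 1 */
           is_counting(STATE_INACTIVE, true),    /* 0: group missed the PMU */
           is_counting(STATE_INACTIVE, false));  /* 1: task scheduled out */
    return 0;
}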
3165 3623
3166static int perf_swcounter_match(struct perf_counter *counter, 3624static int perf_swcounter_match(struct perf_counter *counter,
@@ -3186,19 +3644,10 @@ static int perf_swcounter_match(struct perf_counter *counter,
3186 return 1; 3644 return 1;
3187} 3645}
3188 3646
3189static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3190 int nmi, struct pt_regs *regs, u64 addr)
3191{
3192 int neg = atomic64_add_negative(nr, &counter->hw.count);
3193
3194 if (counter->hw.sample_period && !neg && regs)
3195 perf_swcounter_overflow(counter, nmi, regs, addr);
3196}
3197
3198static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, 3647static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3199 enum perf_type_id type, u32 event, 3648 enum perf_type_id type,
3200 u64 nr, int nmi, struct pt_regs *regs, 3649 u32 event, u64 nr, int nmi,
3201 u64 addr) 3650 struct perf_sample_data *data)
3202{ 3651{
3203 struct perf_counter *counter; 3652 struct perf_counter *counter;
3204 3653
@@ -3207,8 +3656,8 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3207 3656
3208 rcu_read_lock(); 3657 rcu_read_lock();
3209 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { 3658 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3210 if (perf_swcounter_match(counter, type, event, regs)) 3659 if (perf_swcounter_match(counter, type, event, data->regs))
3211 perf_swcounter_add(counter, nr, nmi, regs, addr); 3660 perf_swcounter_add(counter, nr, nmi, data);
3212 } 3661 }
3213 rcu_read_unlock(); 3662 rcu_read_unlock();
3214} 3663}
@@ -3227,9 +3676,9 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
3227 return &cpuctx->recursion[0]; 3676 return &cpuctx->recursion[0];
3228} 3677}
3229 3678
3230static void __perf_swcounter_event(enum perf_type_id type, u32 event, 3679static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
3231 u64 nr, int nmi, struct pt_regs *regs, 3680 u64 nr, int nmi,
3232 u64 addr) 3681 struct perf_sample_data *data)
3233{ 3682{
3234 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); 3683 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3235 int *recursion = perf_swcounter_recursion_context(cpuctx); 3684 int *recursion = perf_swcounter_recursion_context(cpuctx);
@@ -3242,7 +3691,7 @@ static void __perf_swcounter_event(enum perf_type_id type, u32 event,
3242 barrier(); 3691 barrier();
3243 3692
3244 perf_swcounter_ctx_event(&cpuctx->ctx, type, event, 3693 perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
3245 nr, nmi, regs, addr); 3694 nr, nmi, data);
3246 rcu_read_lock(); 3695 rcu_read_lock();
3247 /* 3696 /*
3248 * doesn't really matter which of the child contexts the 3697 * doesn't really matter which of the child contexts the
@@ -3250,7 +3699,7 @@ static void __perf_swcounter_event(enum perf_type_id type, u32 event,
3250 */ 3699 */
3251 ctx = rcu_dereference(current->perf_counter_ctxp); 3700 ctx = rcu_dereference(current->perf_counter_ctxp);
3252 if (ctx) 3701 if (ctx)
3253 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, regs, addr); 3702 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
3254 rcu_read_unlock(); 3703 rcu_read_unlock();
3255 3704
3256 barrier(); 3705 barrier();
@@ -3260,35 +3709,79 @@ out:
3260 put_cpu_var(perf_cpu_context); 3709 put_cpu_var(perf_cpu_context);
3261} 3710}
3262 3711
3263void 3712void __perf_swcounter_event(u32 event, u64 nr, int nmi,
3264perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) 3713 struct pt_regs *regs, u64 addr)
3265{ 3714{
3266 __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr); 3715 struct perf_sample_data data = {
3716 .regs = regs,
3717 .addr = addr,
3718 };
3719
3720 do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
3267} 3721}
3268 3722
3269static void perf_swcounter_read(struct perf_counter *counter) 3723static void perf_swcounter_read(struct perf_counter *counter)
3270{ 3724{
3271 perf_swcounter_update(counter);
3272} 3725}
3273 3726
3274static int perf_swcounter_enable(struct perf_counter *counter) 3727static int perf_swcounter_enable(struct perf_counter *counter)
3275{ 3728{
3276 perf_swcounter_set_period(counter); 3729 struct hw_perf_counter *hwc = &counter->hw;
3730
3731 if (hwc->sample_period) {
3732 hwc->last_period = hwc->sample_period;
3733 perf_swcounter_set_period(counter);
3734 }
3277 return 0; 3735 return 0;
3278} 3736}
3279 3737
3280static void perf_swcounter_disable(struct perf_counter *counter) 3738static void perf_swcounter_disable(struct perf_counter *counter)
3281{ 3739{
3282 perf_swcounter_update(counter);
3283} 3740}
3284 3741
3285static const struct pmu perf_ops_generic = { 3742static const struct pmu perf_ops_generic = {
3286 .enable = perf_swcounter_enable, 3743 .enable = perf_swcounter_enable,
3287 .disable = perf_swcounter_disable, 3744 .disable = perf_swcounter_disable,
3288 .read = perf_swcounter_read, 3745 .read = perf_swcounter_read,
3746 .unthrottle = perf_swcounter_unthrottle,
3289}; 3747};
3290 3748
3291/* 3749/*
3750 * hrtimer based swcounter callback
3751 */
3752
3753static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
3754{
3755 enum hrtimer_restart ret = HRTIMER_RESTART;
3756 struct perf_sample_data data;
3757 struct perf_counter *counter;
3758 u64 period;
3759
3760 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3761 counter->pmu->read(counter);
3762
3763 data.addr = 0;
3764 data.regs = get_irq_regs();
3765 /*
3766 * In case we exclude kernel IPs or are somehow not in interrupt
3767 * context, provide the next best thing, the user IP.
3768 */
3769 if ((counter->attr.exclude_kernel || !data.regs) &&
3770 !counter->attr.exclude_user)
3771 data.regs = task_pt_regs(current);
3772
3773 if (data.regs) {
3774 if (perf_counter_overflow(counter, 0, &data))
3775 ret = HRTIMER_NORESTART;
3776 }
3777
3778 period = max_t(u64, 10000, counter->hw.sample_period);
3779 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3780
3781 return ret;
3782}
3783
3784/*
3292 * Software counter: cpu wall time clock 3785 * Software counter: cpu wall time clock
3293 */ 3786 */
3294 3787
@@ -3404,36 +3897,25 @@ static const struct pmu perf_ops_task_clock = {
3404 .read = task_clock_perf_counter_read, 3897 .read = task_clock_perf_counter_read,
3405}; 3898};
3406 3899
3407/*
3408 * Software counter: cpu migrations
3409 */
3410void perf_counter_task_migration(struct task_struct *task, int cpu)
3411{
3412 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
3413 struct perf_counter_context *ctx;
3414
3415 perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE,
3416 PERF_COUNT_SW_CPU_MIGRATIONS,
3417 1, 1, NULL, 0);
3418
3419 ctx = perf_pin_task_context(task);
3420 if (ctx) {
3421 perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE,
3422 PERF_COUNT_SW_CPU_MIGRATIONS,
3423 1, 1, NULL, 0);
3424 perf_unpin_context(ctx);
3425 }
3426}
3427
3428#ifdef CONFIG_EVENT_PROFILE 3900#ifdef CONFIG_EVENT_PROFILE
3429void perf_tpcounter_event(int event_id) 3901void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
3902 int entry_size)
3430{ 3903{
3431 struct pt_regs *regs = get_irq_regs(); 3904 struct perf_raw_record raw = {
3905 .size = entry_size,
3906 .data = record,
3907 };
3908
3909 struct perf_sample_data data = {
3910 .regs = get_irq_regs(),
3911 .addr = addr,
3912 .raw = &raw,
3913 };
3432 3914
3433 if (!regs) 3915 if (!data.regs)
3434 regs = task_pt_regs(current); 3916 data.regs = task_pt_regs(current);
3435 3917
3436 __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0); 3918 do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
3437} 3919}
3438EXPORT_SYMBOL_GPL(perf_tpcounter_event); 3920EXPORT_SYMBOL_GPL(perf_tpcounter_event);
3439 3921
@@ -3442,16 +3924,20 @@ extern void ftrace_profile_disable(int);
3442 3924
3443static void tp_perf_counter_destroy(struct perf_counter *counter) 3925static void tp_perf_counter_destroy(struct perf_counter *counter)
3444{ 3926{
3445 ftrace_profile_disable(perf_event_id(&counter->attr)); 3927 ftrace_profile_disable(counter->attr.config);
3446} 3928}
3447 3929
3448static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) 3930static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3449{ 3931{
3450 int event_id = perf_event_id(&counter->attr); 3932 /*
3451 int ret; 3933 * Raw tracepoint data is a severe data leak, only allow root to
3934 * have these.
3935 */
3936 if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
3937 !capable(CAP_SYS_ADMIN))
3938 return ERR_PTR(-EPERM);
3452 3939
3453 ret = ftrace_profile_enable(event_id); 3940 if (ftrace_profile_enable(counter->attr.config))
3454 if (ret)
3455 return NULL; 3941 return NULL;
3456 3942
3457 counter->destroy = tp_perf_counter_destroy; 3943 counter->destroy = tp_perf_counter_destroy;
@@ -3465,9 +3951,21 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3465} 3951}
3466#endif 3952#endif
3467 3953
3954atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
3955
3956static void sw_perf_counter_destroy(struct perf_counter *counter)
3957{
3958 u64 event = counter->attr.config;
3959
3960 WARN_ON(counter->parent);
3961
3962 atomic_dec(&perf_swcounter_enabled[event]);
3963}
3964
3468static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) 3965static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3469{ 3966{
3470 const struct pmu *pmu = NULL; 3967 const struct pmu *pmu = NULL;
3968 u64 event = counter->attr.config;
3471 3969
3472 /* 3970 /*
3473 * Software counters (currently) can't in general distinguish 3971 * Software counters (currently) can't in general distinguish
@@ -3476,7 +3974,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3476 * to be kernel events, and page faults are never hypervisor 3974 * to be kernel events, and page faults are never hypervisor
3477 * events. 3975 * events.
3478 */ 3976 */
3479 switch (counter->attr.config) { 3977 switch (event) {
3480 case PERF_COUNT_SW_CPU_CLOCK: 3978 case PERF_COUNT_SW_CPU_CLOCK:
3481 pmu = &perf_ops_cpu_clock; 3979 pmu = &perf_ops_cpu_clock;
3482 3980
@@ -3497,6 +3995,10 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3497 case PERF_COUNT_SW_PAGE_FAULTS_MAJ: 3995 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
3498 case PERF_COUNT_SW_CONTEXT_SWITCHES: 3996 case PERF_COUNT_SW_CONTEXT_SWITCHES:
3499 case PERF_COUNT_SW_CPU_MIGRATIONS: 3997 case PERF_COUNT_SW_CPU_MIGRATIONS:
3998 if (!counter->parent) {
3999 atomic_inc(&perf_swcounter_enabled[event]);
4000 counter->destroy = sw_perf_counter_destroy;
4001 }
3500 pmu = &perf_ops_generic; 4002 pmu = &perf_ops_generic;
3501 break; 4003 break;
3502 } 4004 }
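[editor's note] The new perf_swcounter_enabled[] array is a per-event reference count: only primary counters (counter->parent == NULL) bump it on creation and drop it in sw_perf_counter_destroy(), so fork-inherited clones never inflate the count that the event-generation paths can test. A userspace model of that pattern, assuming an illustrative event enum; the kernel only installs the destroy hook on primary counters, which the inherited check below stands in for:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum sw_event { SW_CONTEXT_SWITCHES, SW_CPU_MIGRATIONS, SW_MAX };

static atomic_int sw_enabled[SW_MAX];        /* models perf_swcounter_enabled[] */

struct counter { enum sw_event event; bool inherited; };

static void counter_init(struct counter *c)
{
    if (!c->inherited)                       /* only the primary counter counts */
        atomic_fetch_add(&sw_enabled[c->event], 1);
}

static void counter_destroy(struct counter *c)
{
    if (!c->inherited)
        atomic_fetch_sub(&sw_enabled[c->event], 1);
}

int main(void)
{
    struct counter parent = { SW_CPU_MIGRATIONS, false };
    struct counter child  = { SW_CPU_MIGRATIONS, true  };   /* inherited clone */

    counter_init(&parent);
    counter_init(&child);
    printf("migration counters enabled: %d\n",
           atomic_load(&sw_enabled[SW_CPU_MIGRATIONS]));    /* prints 1 */
    counter_destroy(&child);
    counter_destroy(&parent);
    return 0;
}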
@@ -3512,6 +4014,7 @@ perf_counter_alloc(struct perf_counter_attr *attr,
3512 int cpu, 4014 int cpu,
3513 struct perf_counter_context *ctx, 4015 struct perf_counter_context *ctx,
3514 struct perf_counter *group_leader, 4016 struct perf_counter *group_leader,
4017 struct perf_counter *parent_counter,
3515 gfp_t gfpflags) 4018 gfp_t gfpflags)
3516{ 4019{
3517 const struct pmu *pmu; 4020 const struct pmu *pmu;
@@ -3547,6 +4050,8 @@ perf_counter_alloc(struct perf_counter_attr *attr,
3547 counter->ctx = ctx; 4050 counter->ctx = ctx;
3548 counter->oncpu = -1; 4051 counter->oncpu = -1;
3549 4052
4053 counter->parent = parent_counter;
4054
3550 counter->ns = get_pid_ns(current->nsproxy->pid_ns); 4055 counter->ns = get_pid_ns(current->nsproxy->pid_ns);
3551 counter->id = atomic64_inc_return(&perf_counter_id); 4056 counter->id = atomic64_inc_return(&perf_counter_id);
3552 4057
@@ -3561,13 +4066,14 @@ perf_counter_alloc(struct perf_counter_attr *attr,
3561 hwc->sample_period = attr->sample_period; 4066 hwc->sample_period = attr->sample_period;
3562 if (attr->freq && attr->sample_freq) 4067 if (attr->freq && attr->sample_freq)
3563 hwc->sample_period = 1; 4068 hwc->sample_period = 1;
4069 hwc->last_period = hwc->sample_period;
3564 4070
3565 atomic64_set(&hwc->period_left, hwc->sample_period); 4071 atomic64_set(&hwc->period_left, hwc->sample_period);
3566 4072
3567 /* 4073 /*
3568 * we currently do not support PERF_SAMPLE_GROUP on inherited counters 4074 * we currently do not support PERF_FORMAT_GROUP on inherited counters
3569 */ 4075 */
3570 if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) 4076 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
3571 goto done; 4077 goto done;
3572 4078
3573 switch (attr->type) { 4079 switch (attr->type) {
@@ -3604,11 +4110,15 @@ done:
3604 4110
3605 counter->pmu = pmu; 4111 counter->pmu = pmu;
3606 4112
3607 atomic_inc(&nr_counters); 4113 if (!counter->parent) {
3608 if (counter->attr.mmap) 4114 atomic_inc(&nr_counters);
3609 atomic_inc(&nr_mmap_counters); 4115 if (counter->attr.mmap)
3610 if (counter->attr.comm) 4116 atomic_inc(&nr_mmap_counters);
3611 atomic_inc(&nr_comm_counters); 4117 if (counter->attr.comm)
4118 atomic_inc(&nr_comm_counters);
4119 if (counter->attr.task)
4120 atomic_inc(&nr_task_counters);
4121 }
3612 4122
3613 return counter; 4123 return counter;
3614} 4124}
@@ -3771,7 +4281,7 @@ SYSCALL_DEFINE5(perf_counter_open,
3771 } 4281 }
3772 4282
3773 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, 4283 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
3774 GFP_KERNEL); 4284 NULL, GFP_KERNEL);
3775 ret = PTR_ERR(counter); 4285 ret = PTR_ERR(counter);
3776 if (IS_ERR(counter)) 4286 if (IS_ERR(counter))
3777 goto err_put_context; 4287 goto err_put_context;
@@ -3837,7 +4347,8 @@ inherit_counter(struct perf_counter *parent_counter,
3837 4347
3838 child_counter = perf_counter_alloc(&parent_counter->attr, 4348 child_counter = perf_counter_alloc(&parent_counter->attr,
3839 parent_counter->cpu, child_ctx, 4349 parent_counter->cpu, child_ctx,
3840 group_leader, GFP_KERNEL); 4350 group_leader, parent_counter,
4351 GFP_KERNEL);
3841 if (IS_ERR(child_counter)) 4352 if (IS_ERR(child_counter))
3842 return child_counter; 4353 return child_counter;
3843 get_ctx(child_ctx); 4354 get_ctx(child_ctx);
@@ -3860,12 +4371,6 @@ inherit_counter(struct perf_counter *parent_counter,
3860 */ 4371 */
3861 add_counter_to_ctx(child_counter, child_ctx); 4372 add_counter_to_ctx(child_counter, child_ctx);
3862 4373
3863 child_counter->parent = parent_counter;
3864 /*
3865 * inherit into child's child as well:
3866 */
3867 child_counter->attr.inherit = 1;
3868
3869 /* 4374 /*
3870 * Get a reference to the parent filp - we will fput it 4375 * Get a reference to the parent filp - we will fput it
3871 * when the child counter exits. This is safe to do because 4376 * when the child counter exits. This is safe to do because
@@ -3909,10 +4414,14 @@ static int inherit_group(struct perf_counter *parent_counter,
3909} 4414}
3910 4415
3911static void sync_child_counter(struct perf_counter *child_counter, 4416static void sync_child_counter(struct perf_counter *child_counter,
3912 struct perf_counter *parent_counter) 4417 struct task_struct *child)
3913{ 4418{
4419 struct perf_counter *parent_counter = child_counter->parent;
3914 u64 child_val; 4420 u64 child_val;
3915 4421
4422 if (child_counter->attr.inherit_stat)
4423 perf_counter_read_event(child_counter, child);
4424
3916 child_val = atomic64_read(&child_counter->count); 4425 child_val = atomic64_read(&child_counter->count);
3917 4426
3918 /* 4427 /*
@@ -3941,7 +4450,8 @@ static void sync_child_counter(struct perf_counter *child_counter,
3941 4450
3942static void 4451static void
3943__perf_counter_exit_task(struct perf_counter *child_counter, 4452__perf_counter_exit_task(struct perf_counter *child_counter,
3944 struct perf_counter_context *child_ctx) 4453 struct perf_counter_context *child_ctx,
4454 struct task_struct *child)
3945{ 4455{
3946 struct perf_counter *parent_counter; 4456 struct perf_counter *parent_counter;
3947 4457
@@ -3955,7 +4465,7 @@ __perf_counter_exit_task(struct perf_counter *child_counter,
3955 * counters need to be zapped - but otherwise linger. 4465 * counters need to be zapped - but otherwise linger.
3956 */ 4466 */
3957 if (parent_counter) { 4467 if (parent_counter) {
3958 sync_child_counter(child_counter, parent_counter); 4468 sync_child_counter(child_counter, child);
3959 free_counter(child_counter); 4469 free_counter(child_counter);
3960 } 4470 }
3961} 4471}
@@ -3969,8 +4479,10 @@ void perf_counter_exit_task(struct task_struct *child)
3969 struct perf_counter_context *child_ctx; 4479 struct perf_counter_context *child_ctx;
3970 unsigned long flags; 4480 unsigned long flags;
3971 4481
3972 if (likely(!child->perf_counter_ctxp)) 4482 if (likely(!child->perf_counter_ctxp)) {
4483 perf_counter_task(child, NULL, 0);
3973 return; 4484 return;
4485 }
3974 4486
3975 local_irq_save(flags); 4487 local_irq_save(flags);
3976 /* 4488 /*
@@ -3989,17 +4501,20 @@ void perf_counter_exit_task(struct task_struct *child)
3989 */ 4501 */
3990 spin_lock(&child_ctx->lock); 4502 spin_lock(&child_ctx->lock);
3991 child->perf_counter_ctxp = NULL; 4503 child->perf_counter_ctxp = NULL;
3992 if (child_ctx->parent_ctx) { 4504 /*
3993 /* 4505 * If this context is a clone; unclone it so it can't get
3994 * This context is a clone; unclone it so it can't get 4506 * swapped to another process while we're removing all
3995 * swapped to another process while we're removing all 4507 * the counters from it.
3996 * the counters from it. 4508 */
3997 */ 4509 unclone_ctx(child_ctx);
3998 put_ctx(child_ctx->parent_ctx); 4510 spin_unlock_irqrestore(&child_ctx->lock, flags);
3999 child_ctx->parent_ctx = NULL; 4511
4000 } 4512 /*
4001 spin_unlock(&child_ctx->lock); 4513 * Report the task dead after unscheduling the counters so that we
4002 local_irq_restore(flags); 4514 * won't get any samples after PERF_EVENT_EXIT. We can however still
4515 * get a few PERF_EVENT_READ events.
4516 */
4517 perf_counter_task(child, child_ctx, 0);
4003 4518
4004 /* 4519 /*
4005 * We can recurse on the same lock type through: 4520 * We can recurse on the same lock type through:
@@ -4017,7 +4532,7 @@ void perf_counter_exit_task(struct task_struct *child)
4017again: 4532again:
4018 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, 4533 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
4019 list_entry) 4534 list_entry)
4020 __perf_counter_exit_task(child_counter, child_ctx); 4535 __perf_counter_exit_task(child_counter, child_ctx, child);
4021 4536
4022 /* 4537 /*
4023 * If the last counter was a group counter, it will have appended all 4538 * If the last counter was a group counter, it will have appended all
@@ -4220,6 +4735,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4220 perf_counter_init_cpu(cpu); 4735 perf_counter_init_cpu(cpu);
4221 break; 4736 break;
4222 4737
4738 case CPU_ONLINE:
4739 case CPU_ONLINE_FROZEN:
4740 hw_perf_counter_setup_online(cpu);
4741 break;
4742
4223 case CPU_DOWN_PREPARE: 4743 case CPU_DOWN_PREPARE:
4224 case CPU_DOWN_PREPARE_FROZEN: 4744 case CPU_DOWN_PREPARE_FROZEN:
4225 perf_counter_exit_cpu(cpu); 4745 perf_counter_exit_cpu(cpu);
@@ -4244,6 +4764,8 @@ void __init perf_counter_init(void)
4244{ 4764{
4245 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, 4765 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
4246 (void *)(long)smp_processor_id()); 4766 (void *)(long)smp_processor_id());
4767 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
4768 (void *)(long)smp_processor_id());
4247 register_cpu_notifier(&perf_cpu_nb); 4769 register_cpu_notifier(&perf_cpu_nb);
4248} 4770}
4249 4771
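[editor's note] Two related additions close the loop for the boot CPU: the notifier now handles CPU_ONLINE by calling hw_perf_counter_setup_online(), and perf_counter_init() pushes the boot CPU through the notifier for CPU_ONLINE as well as CPU_UP_PREPARE before registering it, since the boot CPU never takes the hotplug path. A compressed sketch of that flow; the enum values and helpers are illustrative, not the kernel's:

#include <stdio.h>

enum cpu_action { ACT_UP_PREPARE, ACT_ONLINE, ACT_DOWN_PREPARE };

static void cpu_notify(enum cpu_action action, int cpu)
{
    switch (action) {
    case ACT_UP_PREPARE:
        printf("cpu%d: allocate per-CPU context\n", cpu);
        break;
    case ACT_ONLINE:
        printf("cpu%d: hw_perf_counter_setup_online()\n", cpu);
        break;
    case ACT_DOWN_PREPARE:
        printf("cpu%d: tear down counters\n", cpu);
        break;
    }
}

int main(void)
{
    int boot_cpu = 0;

    /* perf_counter_init(): walk the boot CPU through the same callback
     * that later hotplugged CPUs would hit. */
    cpu_notify(ACT_UP_PREPARE, boot_cpu);
    cpu_notify(ACT_ONLINE, boot_cpu);
    /* register_cpu_notifier() would then cover CPUs brought up afterwards. */
    return 0;
}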