Diffstat (limited to 'kernel/perf_counter.c')
-rw-r--r--  kernel/perf_counter.c  |  853
1 file changed, 606 insertions(+), 247 deletions(-)
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 950931041954..e0d91fdf0c3c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -42,14 +42,21 @@ static int perf_overcommit __read_mostly = 1;
42static atomic_t nr_counters __read_mostly; 42static atomic_t nr_counters __read_mostly;
43static atomic_t nr_mmap_counters __read_mostly; 43static atomic_t nr_mmap_counters __read_mostly;
44static atomic_t nr_comm_counters __read_mostly; 44static atomic_t nr_comm_counters __read_mostly;
45static atomic_t nr_task_counters __read_mostly;
45 46
46/* 47/*
47 * perf counter paranoia level: 48 * perf counter paranoia level:
48 * 0 - not paranoid 49 * -1 - not paranoid at all
49 * 1 - disallow cpu counters to unpriv 50 * 0 - disallow raw tracepoint access for unpriv
50 * 2 - disallow kernel profiling to unpriv 51 * 1 - disallow cpu counters for unpriv
52 * 2 - disallow kernel profiling for unpriv
51 */ 53 */
52int sysctl_perf_counter_paranoid __read_mostly; 54int sysctl_perf_counter_paranoid __read_mostly = 1;
55
56static inline bool perf_paranoid_tracepoint_raw(void)
57{
58 return sysctl_perf_counter_paranoid > -1;
59}
53 60
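As a side note on the new -1 level: the sysctl behind sysctl_perf_counter_paranoid can be inspected from user space. A minimal sketch, assuming the value is exposed at /proc/sys/kernel/perf_counter_paranoid (the procfs path is an assumption, it is not part of this diff):

/* Sketch: interpret the perf_counter paranoia level from user space.
 * Assumes the sysctl is exposed at /proc/sys/kernel/perf_counter_paranoid,
 * which is not shown in this diff. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/perf_counter_paranoid", "r");
	int level;

	if (!f || fscanf(f, "%d", &level) != 1) {
		perror("perf_counter_paranoid");
		return 1;
	}
	fclose(f);

	/* Mirrors the comment above: -1 not paranoid at all,
	 * 0 blocks raw tracepoint data, 1 blocks CPU counters,
	 * 2 blocks kernel profiling for unprivileged users. */
	printf("raw tracepoints for unpriv:  %s\n", level > -1 ? "no" : "yes");
	printf("cpu counters for unpriv:     %s\n", level > 0  ? "no" : "yes");
	printf("kernel profiling for unpriv: %s\n", level > 1  ? "no" : "yes");
	return 0;
}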
54static inline bool perf_paranoid_cpu(void) 61static inline bool perf_paranoid_cpu(void)
55{ 62{
@@ -87,6 +94,7 @@ void __weak hw_perf_disable(void) { barrier(); }
87void __weak hw_perf_enable(void) { barrier(); } 94void __weak hw_perf_enable(void) { barrier(); }
88 95
89void __weak hw_perf_counter_setup(int cpu) { barrier(); } 96void __weak hw_perf_counter_setup(int cpu) { barrier(); }
97void __weak hw_perf_counter_setup_online(int cpu) { barrier(); }
90 98
91int __weak 99int __weak
92hw_perf_group_sched_in(struct perf_counter *group_leader, 100hw_perf_group_sched_in(struct perf_counter *group_leader,
@@ -305,6 +313,10 @@ counter_sched_out(struct perf_counter *counter,
305 return; 313 return;
306 314
307 counter->state = PERF_COUNTER_STATE_INACTIVE; 315 counter->state = PERF_COUNTER_STATE_INACTIVE;
316 if (counter->pending_disable) {
317 counter->pending_disable = 0;
318 counter->state = PERF_COUNTER_STATE_OFF;
319 }
308 counter->tstamp_stopped = ctx->time; 320 counter->tstamp_stopped = ctx->time;
309 counter->pmu->disable(counter); 321 counter->pmu->disable(counter);
310 counter->oncpu = -1; 322 counter->oncpu = -1;
@@ -463,7 +475,8 @@ static void update_counter_times(struct perf_counter *counter)
463 struct perf_counter_context *ctx = counter->ctx; 475 struct perf_counter_context *ctx = counter->ctx;
464 u64 run_end; 476 u64 run_end;
465 477
466 if (counter->state < PERF_COUNTER_STATE_INACTIVE) 478 if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
479 counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE)
467 return; 480 return;
468 481
469 counter->total_time_enabled = ctx->time - counter->tstamp_enabled; 482 counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
@@ -512,7 +525,7 @@ static void __perf_counter_disable(void *info)
512 */ 525 */
513 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { 526 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
514 update_context_time(ctx); 527 update_context_time(ctx);
515 update_counter_times(counter); 528 update_group_times(counter);
516 if (counter == counter->group_leader) 529 if (counter == counter->group_leader)
517 group_sched_out(counter, cpuctx, ctx); 530 group_sched_out(counter, cpuctx, ctx);
518 else 531 else
@@ -567,7 +580,7 @@ static void perf_counter_disable(struct perf_counter *counter)
567 * in, so we can change the state safely. 580 * in, so we can change the state safely.
568 */ 581 */
569 if (counter->state == PERF_COUNTER_STATE_INACTIVE) { 582 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
570 update_counter_times(counter); 583 update_group_times(counter);
571 counter->state = PERF_COUNTER_STATE_OFF; 584 counter->state = PERF_COUNTER_STATE_OFF;
572 } 585 }
573 586
@@ -845,6 +858,27 @@ retry:
845} 858}
846 859
847/* 860/*
861 * Put a counter into inactive state and update time fields.
862 * Enabling the leader of a group effectively enables all
863 * the group members that aren't explicitly disabled, so we
864 * have to update their ->tstamp_enabled also.
865 * Note: this works for group members as well as group leaders
866 * since the non-leader members' sibling_lists will be empty.
867 */
868static void __perf_counter_mark_enabled(struct perf_counter *counter,
869 struct perf_counter_context *ctx)
870{
871 struct perf_counter *sub;
872
873 counter->state = PERF_COUNTER_STATE_INACTIVE;
874 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
875 list_for_each_entry(sub, &counter->sibling_list, list_entry)
876 if (sub->state >= PERF_COUNTER_STATE_INACTIVE)
877 sub->tstamp_enabled =
878 ctx->time - sub->total_time_enabled;
879}
880
881/*
848 * Cross CPU call to enable a performance counter 882 * Cross CPU call to enable a performance counter
849 */ 883 */
850static void __perf_counter_enable(void *info) 884static void __perf_counter_enable(void *info)
@@ -871,8 +905,7 @@ static void __perf_counter_enable(void *info)
871 905
872 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) 906 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
873 goto unlock; 907 goto unlock;
874 counter->state = PERF_COUNTER_STATE_INACTIVE; 908 __perf_counter_mark_enabled(counter, ctx);
875 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
876 909
877 /* 910 /*
878 * If the counter is in a group and isn't the group leader, 911 * If the counter is in a group and isn't the group leader,
@@ -965,11 +998,9 @@ static void perf_counter_enable(struct perf_counter *counter)
965 * Since we have the lock this context can't be scheduled 998 * Since we have the lock this context can't be scheduled
966 * in, so we can change the state safely. 999 * in, so we can change the state safely.
967 */ 1000 */
968 if (counter->state == PERF_COUNTER_STATE_OFF) { 1001 if (counter->state == PERF_COUNTER_STATE_OFF)
969 counter->state = PERF_COUNTER_STATE_INACTIVE; 1002 __perf_counter_mark_enabled(counter, ctx);
970 counter->tstamp_enabled = 1003
971 ctx->time - counter->total_time_enabled;
972 }
973 out: 1004 out:
974 spin_unlock_irq(&ctx->lock); 1005 spin_unlock_irq(&ctx->lock);
975} 1006}
@@ -1103,7 +1134,7 @@ static void perf_counter_sync_stat(struct perf_counter_context *ctx,
1103 __perf_counter_sync_stat(counter, next_counter); 1134 __perf_counter_sync_stat(counter, next_counter);
1104 1135
1105 counter = list_next_entry(counter, event_entry); 1136 counter = list_next_entry(counter, event_entry);
1106 next_counter = list_next_entry(counter, event_entry); 1137 next_counter = list_next_entry(next_counter, event_entry);
1107 } 1138 }
1108} 1139}
1109 1140
@@ -1473,9 +1504,7 @@ static void perf_counter_enable_on_exec(struct task_struct *task)
1473 counter->attr.enable_on_exec = 0; 1504 counter->attr.enable_on_exec = 0;
1474 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) 1505 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
1475 continue; 1506 continue;
1476 counter->state = PERF_COUNTER_STATE_INACTIVE; 1507 __perf_counter_mark_enabled(counter, ctx);
1477 counter->tstamp_enabled =
1478 ctx->time - counter->total_time_enabled;
1479 enabled = 1; 1508 enabled = 1;
1480 } 1509 }
1481 1510
@@ -1497,10 +1526,21 @@ static void perf_counter_enable_on_exec(struct task_struct *task)
1497 */ 1526 */
1498static void __perf_counter_read(void *info) 1527static void __perf_counter_read(void *info)
1499{ 1528{
1529 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1500 struct perf_counter *counter = info; 1530 struct perf_counter *counter = info;
1501 struct perf_counter_context *ctx = counter->ctx; 1531 struct perf_counter_context *ctx = counter->ctx;
1502 unsigned long flags; 1532 unsigned long flags;
1503 1533
1534 /*
1535 * If this is a task context, we need to check whether it is
1536 * the current task context of this cpu. If not it has been
1537 * scheduled out before the smp call arrived. In that case
1538 * counter->count would have been updated to a recent sample
1539 * when the counter was scheduled out.
1540 */
1541 if (ctx->task && cpuctx->task_ctx != ctx)
1542 return;
1543
1504 local_irq_save(flags); 1544 local_irq_save(flags);
1505 if (ctx->is_active) 1545 if (ctx->is_active)
1506 update_context_time(ctx); 1546 update_context_time(ctx);
@@ -1654,6 +1694,13 @@ static void free_counter(struct perf_counter *counter)
1654 atomic_dec(&nr_mmap_counters); 1694 atomic_dec(&nr_mmap_counters);
1655 if (counter->attr.comm) 1695 if (counter->attr.comm)
1656 atomic_dec(&nr_comm_counters); 1696 atomic_dec(&nr_comm_counters);
1697 if (counter->attr.task)
1698 atomic_dec(&nr_task_counters);
1699 }
1700
1701 if (counter->output) {
1702 fput(counter->output->filp);
1703 counter->output = NULL;
1657 } 1704 }
1658 1705
1659 if (counter->destroy) 1706 if (counter->destroy)
@@ -1688,14 +1735,133 @@ static int perf_release(struct inode *inode, struct file *file)
1688 return 0; 1735 return 0;
1689} 1736}
1690 1737
1738static int perf_counter_read_size(struct perf_counter *counter)
1739{
1740 int entry = sizeof(u64); /* value */
1741 int size = 0;
1742 int nr = 1;
1743
1744 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1745 size += sizeof(u64);
1746
1747 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1748 size += sizeof(u64);
1749
1750 if (counter->attr.read_format & PERF_FORMAT_ID)
1751 entry += sizeof(u64);
1752
1753 if (counter->attr.read_format & PERF_FORMAT_GROUP) {
1754 nr += counter->group_leader->nr_siblings;
1755 size += sizeof(u64);
1756 }
1757
1758 size += entry * nr;
1759
1760 return size;
1761}
1762
1763static u64 perf_counter_read_value(struct perf_counter *counter)
1764{
1765 struct perf_counter *child;
1766 u64 total = 0;
1767
1768 total += perf_counter_read(counter);
1769 list_for_each_entry(child, &counter->child_list, child_list)
1770 total += perf_counter_read(child);
1771
1772 return total;
1773}
1774
1775static int perf_counter_read_entry(struct perf_counter *counter,
1776 u64 read_format, char __user *buf)
1777{
1778 int n = 0, count = 0;
1779 u64 values[2];
1780
1781 values[n++] = perf_counter_read_value(counter);
1782 if (read_format & PERF_FORMAT_ID)
1783 values[n++] = primary_counter_id(counter);
1784
1785 count = n * sizeof(u64);
1786
1787 if (copy_to_user(buf, values, count))
1788 return -EFAULT;
1789
1790 return count;
1791}
1792
1793static int perf_counter_read_group(struct perf_counter *counter,
1794 u64 read_format, char __user *buf)
1795{
1796 struct perf_counter *leader = counter->group_leader, *sub;
1797 int n = 0, size = 0, err = -EFAULT;
1798 u64 values[3];
1799
1800 values[n++] = 1 + leader->nr_siblings;
1801 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1802 values[n++] = leader->total_time_enabled +
1803 atomic64_read(&leader->child_total_time_enabled);
1804 }
1805 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1806 values[n++] = leader->total_time_running +
1807 atomic64_read(&leader->child_total_time_running);
1808 }
1809
1810 size = n * sizeof(u64);
1811
1812 if (copy_to_user(buf, values, size))
1813 return -EFAULT;
1814
1815 err = perf_counter_read_entry(leader, read_format, buf + size);
1816 if (err < 0)
1817 return err;
1818
1819 size += err;
1820
1821 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
1822 err = perf_counter_read_entry(sub, read_format,
1823 buf + size);
1824 if (err < 0)
1825 return err;
1826
1827 size += err;
1828 }
1829
1830 return size;
1831}
1832
1833static int perf_counter_read_one(struct perf_counter *counter,
1834 u64 read_format, char __user *buf)
1835{
1836 u64 values[4];
1837 int n = 0;
1838
1839 values[n++] = perf_counter_read_value(counter);
1840 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1841 values[n++] = counter->total_time_enabled +
1842 atomic64_read(&counter->child_total_time_enabled);
1843 }
1844 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1845 values[n++] = counter->total_time_running +
1846 atomic64_read(&counter->child_total_time_running);
1847 }
1848 if (read_format & PERF_FORMAT_ID)
1849 values[n++] = primary_counter_id(counter);
1850
1851 if (copy_to_user(buf, values, n * sizeof(u64)))
1852 return -EFAULT;
1853
1854 return n * sizeof(u64);
1855}
1856
1691/* 1857/*
1692 * Read the performance counter - simple non blocking version for now 1858 * Read the performance counter - simple non blocking version for now
1693 */ 1859 */
1694static ssize_t 1860static ssize_t
1695perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) 1861perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1696{ 1862{
1697 u64 values[4]; 1863 u64 read_format = counter->attr.read_format;
1698 int n; 1864 int ret;
1699 1865
1700 /* 1866 /*
1701 * Return end-of-file for a read on a counter that is in 1867 * Return end-of-file for a read on a counter that is in
@@ -1705,28 +1871,18 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1705 if (counter->state == PERF_COUNTER_STATE_ERROR) 1871 if (counter->state == PERF_COUNTER_STATE_ERROR)
1706 return 0; 1872 return 0;
1707 1873
1874 if (count < perf_counter_read_size(counter))
1875 return -ENOSPC;
1876
1708 WARN_ON_ONCE(counter->ctx->parent_ctx); 1877 WARN_ON_ONCE(counter->ctx->parent_ctx);
1709 mutex_lock(&counter->child_mutex); 1878 mutex_lock(&counter->child_mutex);
1710 values[0] = perf_counter_read(counter); 1879 if (read_format & PERF_FORMAT_GROUP)
1711 n = 1; 1880 ret = perf_counter_read_group(counter, read_format, buf);
1712 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 1881 else
1713 values[n++] = counter->total_time_enabled + 1882 ret = perf_counter_read_one(counter, read_format, buf);
1714 atomic64_read(&counter->child_total_time_enabled);
1715 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1716 values[n++] = counter->total_time_running +
1717 atomic64_read(&counter->child_total_time_running);
1718 if (counter->attr.read_format & PERF_FORMAT_ID)
1719 values[n++] = primary_counter_id(counter);
1720 mutex_unlock(&counter->child_mutex); 1883 mutex_unlock(&counter->child_mutex);
1721 1884
1722 if (count < n * sizeof(u64)) 1885 return ret;
1723 return -EINVAL;
1724 count = n * sizeof(u64);
1725
1726 if (copy_to_user(buf, values, count))
1727 return -EFAULT;
1728
1729 return count;
1730} 1886}
1731 1887
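The reworked perf_read_hw() above rejects short buffers with -ENOSPC and, without PERF_FORMAT_GROUP, writes the fields in the order laid out by perf_counter_read_one(). A user-space decoder for that layout, as a sketch; the PERF_FORMAT_* bit values are redefined locally and mirror the perf_counter ABI of this series, which is an assumption here:

/* Sketch: decode the buffer returned by read() on a counter fd when
 * PERF_FORMAT_GROUP is not set, following perf_counter_read_one() above. */
#include <stdint.h>
#include <stdio.h>

#define PERF_FORMAT_TOTAL_TIME_ENABLED	(1U << 0)
#define PERF_FORMAT_TOTAL_TIME_RUNNING	(1U << 1)
#define PERF_FORMAT_ID			(1U << 2)

struct one_read {
	uint64_t value;
	uint64_t time_enabled;	/* only if TOTAL_TIME_ENABLED */
	uint64_t time_running;	/* only if TOTAL_TIME_RUNNING */
	uint64_t id;		/* only if ID */
};

static void decode_read_one(const uint64_t *buf, uint64_t read_format,
			    struct one_read *out)
{
	int n = 0;

	out->value = buf[n++];
	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		out->time_enabled = buf[n++];
	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		out->time_running = buf[n++];
	if (read_format & PERF_FORMAT_ID)
		out->id = buf[n++];
}

int main(void)
{
	/* Example buffer: value, time_enabled, time_running, id. */
	uint64_t buf[4] = { 1234, 2000000, 1500000, 42 };
	struct one_read r = { 0 };

	decode_read_one(buf, PERF_FORMAT_TOTAL_TIME_ENABLED |
			     PERF_FORMAT_TOTAL_TIME_RUNNING |
			     PERF_FORMAT_ID, &r);
	printf("value=%llu enabled=%llu running=%llu id=%llu\n",
	       (unsigned long long)r.value, (unsigned long long)r.time_enabled,
	       (unsigned long long)r.time_running, (unsigned long long)r.id);
	return 0;
}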
1732static ssize_t 1888static ssize_t
@@ -1832,6 +1988,8 @@ unlock:
1832 return ret; 1988 return ret;
1833} 1989}
1834 1990
1991int perf_counter_set_output(struct perf_counter *counter, int output_fd);
1992
1835static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 1993static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1836{ 1994{
1837 struct perf_counter *counter = file->private_data; 1995 struct perf_counter *counter = file->private_data;
@@ -1855,6 +2013,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1855 case PERF_COUNTER_IOC_PERIOD: 2013 case PERF_COUNTER_IOC_PERIOD:
1856 return perf_counter_period(counter, (u64 __user *)arg); 2014 return perf_counter_period(counter, (u64 __user *)arg);
1857 2015
2016 case PERF_COUNTER_IOC_SET_OUTPUT:
2017 return perf_counter_set_output(counter, arg);
2018
1858 default: 2019 default:
1859 return -ENOTTY; 2020 return -ENOTTY;
1860 } 2021 }
@@ -1891,6 +2052,10 @@ int perf_counter_task_disable(void)
1891 return 0; 2052 return 0;
1892} 2053}
1893 2054
2055#ifndef PERF_COUNTER_INDEX_OFFSET
2056# define PERF_COUNTER_INDEX_OFFSET 0
2057#endif
2058
1894static int perf_counter_index(struct perf_counter *counter) 2059static int perf_counter_index(struct perf_counter *counter)
1895{ 2060{
1896 if (counter->state != PERF_COUNTER_STATE_ACTIVE) 2061 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
@@ -2121,6 +2286,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2121 2286
2122 WARN_ON_ONCE(counter->ctx->parent_ctx); 2287 WARN_ON_ONCE(counter->ctx->parent_ctx);
2123 mutex_lock(&counter->mmap_mutex); 2288 mutex_lock(&counter->mmap_mutex);
2289 if (counter->output) {
2290 ret = -EINVAL;
2291 goto unlock;
2292 }
2293
2124 if (atomic_inc_not_zero(&counter->mmap_count)) { 2294 if (atomic_inc_not_zero(&counter->mmap_count)) {
2125 if (nr_pages != counter->data->nr_pages) 2295 if (nr_pages != counter->data->nr_pages)
2126 ret = -EINVAL; 2296 ret = -EINVAL;
@@ -2230,7 +2400,7 @@ static void perf_pending_counter(struct perf_pending_entry *entry)
2230 2400
2231 if (counter->pending_disable) { 2401 if (counter->pending_disable) {
2232 counter->pending_disable = 0; 2402 counter->pending_disable = 0;
2233 perf_counter_disable(counter); 2403 __perf_counter_disable(counter);
2234 } 2404 }
2235 2405
2236 if (counter->pending_wakeup) { 2406 if (counter->pending_wakeup) {
@@ -2506,6 +2676,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
2506 struct perf_counter *counter, unsigned int size, 2676 struct perf_counter *counter, unsigned int size,
2507 int nmi, int sample) 2677 int nmi, int sample)
2508{ 2678{
2679 struct perf_counter *output_counter;
2509 struct perf_mmap_data *data; 2680 struct perf_mmap_data *data;
2510 unsigned int offset, head; 2681 unsigned int offset, head;
2511 int have_lost; 2682 int have_lost;
@@ -2515,13 +2686,17 @@ static int perf_output_begin(struct perf_output_handle *handle,
2515 u64 lost; 2686 u64 lost;
2516 } lost_event; 2687 } lost_event;
2517 2688
2689 rcu_read_lock();
2518 /* 2690 /*
2519 * For inherited counters we send all the output towards the parent. 2691 * For inherited counters we send all the output towards the parent.
2520 */ 2692 */
2521 if (counter->parent) 2693 if (counter->parent)
2522 counter = counter->parent; 2694 counter = counter->parent;
2523 2695
2524 rcu_read_lock(); 2696 output_counter = rcu_dereference(counter->output);
2697 if (output_counter)
2698 counter = output_counter;
2699
2525 data = rcu_dereference(counter->data); 2700 data = rcu_dereference(counter->data);
2526 if (!data) 2701 if (!data)
2527 goto out; 2702 goto out;
@@ -2615,7 +2790,80 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
2615 return task_pid_nr_ns(p, counter->ns); 2790 return task_pid_nr_ns(p, counter->ns);
2616} 2791}
2617 2792
2618static void perf_counter_output(struct perf_counter *counter, int nmi, 2793static void perf_output_read_one(struct perf_output_handle *handle,
2794 struct perf_counter *counter)
2795{
2796 u64 read_format = counter->attr.read_format;
2797 u64 values[4];
2798 int n = 0;
2799
2800 values[n++] = atomic64_read(&counter->count);
2801 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2802 values[n++] = counter->total_time_enabled +
2803 atomic64_read(&counter->child_total_time_enabled);
2804 }
2805 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2806 values[n++] = counter->total_time_running +
2807 atomic64_read(&counter->child_total_time_running);
2808 }
2809 if (read_format & PERF_FORMAT_ID)
2810 values[n++] = primary_counter_id(counter);
2811
2812 perf_output_copy(handle, values, n * sizeof(u64));
2813}
2814
2815/*
2816 * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
2817 */
2818static void perf_output_read_group(struct perf_output_handle *handle,
2819 struct perf_counter *counter)
2820{
2821 struct perf_counter *leader = counter->group_leader, *sub;
2822 u64 read_format = counter->attr.read_format;
2823 u64 values[5];
2824 int n = 0;
2825
2826 values[n++] = 1 + leader->nr_siblings;
2827
2828 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2829 values[n++] = leader->total_time_enabled;
2830
2831 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2832 values[n++] = leader->total_time_running;
2833
2834 if (leader != counter)
2835 leader->pmu->read(leader);
2836
2837 values[n++] = atomic64_read(&leader->count);
2838 if (read_format & PERF_FORMAT_ID)
2839 values[n++] = primary_counter_id(leader);
2840
2841 perf_output_copy(handle, values, n * sizeof(u64));
2842
2843 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2844 n = 0;
2845
2846 if (sub != counter)
2847 sub->pmu->read(sub);
2848
2849 values[n++] = atomic64_read(&sub->count);
2850 if (read_format & PERF_FORMAT_ID)
2851 values[n++] = primary_counter_id(sub);
2852
2853 perf_output_copy(handle, values, n * sizeof(u64));
2854 }
2855}
2856
2857static void perf_output_read(struct perf_output_handle *handle,
2858 struct perf_counter *counter)
2859{
2860 if (counter->attr.read_format & PERF_FORMAT_GROUP)
2861 perf_output_read_group(handle, counter);
2862 else
2863 perf_output_read_one(handle, counter);
2864}
2865
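perf_output_read_group() and perf_counter_read_group() above emit a variable-length record: an entry count, the optional leader times, then one value (plus optional id) per group member. A sketch of a decoder for that layout, again with the PERF_FORMAT_* bits redefined locally as an assumption:

/* Sketch: walk the PERF_FORMAT_GROUP layout emitted above:
 *   u64 nr;
 *   u64 time_enabled;    -- if PERF_FORMAT_TOTAL_TIME_ENABLED
 *   u64 time_running;    -- if PERF_FORMAT_TOTAL_TIME_RUNNING
 *   { u64 value; u64 id; } x nr   -- id only if PERF_FORMAT_ID */
#include <stdint.h>
#include <stdio.h>

#define PERF_FORMAT_TOTAL_TIME_ENABLED	(1U << 0)
#define PERF_FORMAT_TOTAL_TIME_RUNNING	(1U << 1)
#define PERF_FORMAT_ID			(1U << 2)
#define PERF_FORMAT_GROUP		(1U << 3)

static size_t decode_read_group(const uint64_t *buf, uint64_t read_format)
{
	size_t n = 0;
	uint64_t nr, i;

	nr = buf[n++];
	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		printf("time_enabled=%llu\n", (unsigned long long)buf[n++]);
	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		printf("time_running=%llu\n", (unsigned long long)buf[n++]);

	for (i = 0; i < nr; i++) {
		uint64_t value = buf[n++];
		uint64_t id = 0;

		if (read_format & PERF_FORMAT_ID)
			id = buf[n++];
		printf("entry %llu: value=%llu id=%llu\n",
		       (unsigned long long)i, (unsigned long long)value,
		       (unsigned long long)id);
	}
	return n;	/* number of u64 words consumed */
}

int main(void)
{
	/* Example: a two-counter group with both times and IDs. */
	uint64_t buf[] = { 2, 500000, 400000, 1111, 7, 2222, 8 };

	decode_read_group(buf, PERF_FORMAT_TOTAL_TIME_ENABLED |
			       PERF_FORMAT_TOTAL_TIME_RUNNING |
			       PERF_FORMAT_ID | PERF_FORMAT_GROUP);
	return 0;
}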
2866void perf_counter_output(struct perf_counter *counter, int nmi,
2619 struct perf_sample_data *data) 2867 struct perf_sample_data *data)
2620{ 2868{
2621 int ret; 2869 int ret;
@@ -2626,10 +2874,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2626 struct { 2874 struct {
2627 u32 pid, tid; 2875 u32 pid, tid;
2628 } tid_entry; 2876 } tid_entry;
2629 struct {
2630 u64 id;
2631 u64 counter;
2632 } group_entry;
2633 struct perf_callchain_entry *callchain = NULL; 2877 struct perf_callchain_entry *callchain = NULL;
2634 int callchain_size = 0; 2878 int callchain_size = 0;
2635 u64 time; 2879 u64 time;
@@ -2684,10 +2928,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2684 if (sample_type & PERF_SAMPLE_PERIOD) 2928 if (sample_type & PERF_SAMPLE_PERIOD)
2685 header.size += sizeof(u64); 2929 header.size += sizeof(u64);
2686 2930
2687 if (sample_type & PERF_SAMPLE_GROUP) { 2931 if (sample_type & PERF_SAMPLE_READ)
2688 header.size += sizeof(u64) + 2932 header.size += perf_counter_read_size(counter);
2689 counter->nr_siblings * sizeof(group_entry);
2690 }
2691 2933
2692 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 2934 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2693 callchain = perf_callchain(data->regs); 2935 callchain = perf_callchain(data->regs);
@@ -2699,6 +2941,18 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2699 header.size += sizeof(u64); 2941 header.size += sizeof(u64);
2700 } 2942 }
2701 2943
2944 if (sample_type & PERF_SAMPLE_RAW) {
2945 int size = sizeof(u32);
2946
2947 if (data->raw)
2948 size += data->raw->size;
2949 else
2950 size += sizeof(u32);
2951
2952 WARN_ON_ONCE(size & (sizeof(u64)-1));
2953 header.size += size;
2954 }
2955
2702 ret = perf_output_begin(&handle, counter, header.size, nmi, 1); 2956 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
2703 if (ret) 2957 if (ret)
2704 return; 2958 return;
@@ -2732,26 +2986,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2732 if (sample_type & PERF_SAMPLE_PERIOD) 2986 if (sample_type & PERF_SAMPLE_PERIOD)
2733 perf_output_put(&handle, data->period); 2987 perf_output_put(&handle, data->period);
2734 2988
2735 /* 2989 if (sample_type & PERF_SAMPLE_READ)
2736 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. 2990 perf_output_read(&handle, counter);
2737 */
2738 if (sample_type & PERF_SAMPLE_GROUP) {
2739 struct perf_counter *leader, *sub;
2740 u64 nr = counter->nr_siblings;
2741
2742 perf_output_put(&handle, nr);
2743
2744 leader = counter->group_leader;
2745 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2746 if (sub != counter)
2747 sub->pmu->read(sub);
2748
2749 group_entry.id = primary_counter_id(sub);
2750 group_entry.counter = atomic64_read(&sub->count);
2751
2752 perf_output_put(&handle, group_entry);
2753 }
2754 }
2755 2991
2756 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 2992 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2757 if (callchain) 2993 if (callchain)
@@ -2762,6 +2998,22 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
2762 } 2998 }
2763 } 2999 }
2764 3000
3001 if (sample_type & PERF_SAMPLE_RAW) {
3002 if (data->raw) {
3003 perf_output_put(&handle, data->raw->size);
3004 perf_output_copy(&handle, data->raw->data, data->raw->size);
3005 } else {
3006 struct {
3007 u32 size;
3008 u32 data;
3009 } raw = {
3010 .size = sizeof(u32),
3011 .data = 0,
3012 };
3013 perf_output_put(&handle, raw);
3014 }
3015 }
3016
2765 perf_output_end(&handle); 3017 perf_output_end(&handle);
2766} 3018}
2767 3019
@@ -2774,8 +3026,6 @@ struct perf_read_event {
2774 3026
2775 u32 pid; 3027 u32 pid;
2776 u32 tid; 3028 u32 tid;
2777 u64 value;
2778 u64 format[3];
2779}; 3029};
2780 3030
2781static void 3031static void
@@ -2787,80 +3037,74 @@ perf_counter_read_event(struct perf_counter *counter,
2787 .header = { 3037 .header = {
2788 .type = PERF_EVENT_READ, 3038 .type = PERF_EVENT_READ,
2789 .misc = 0, 3039 .misc = 0,
2790 .size = sizeof(event) - sizeof(event.format), 3040 .size = sizeof(event) + perf_counter_read_size(counter),
2791 }, 3041 },
2792 .pid = perf_counter_pid(counter, task), 3042 .pid = perf_counter_pid(counter, task),
2793 .tid = perf_counter_tid(counter, task), 3043 .tid = perf_counter_tid(counter, task),
2794 .value = atomic64_read(&counter->count),
2795 }; 3044 };
2796 int ret, i = 0; 3045 int ret;
2797
2798 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2799 event.header.size += sizeof(u64);
2800 event.format[i++] = counter->total_time_enabled;
2801 }
2802
2803 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2804 event.header.size += sizeof(u64);
2805 event.format[i++] = counter->total_time_running;
2806 }
2807
2808 if (counter->attr.read_format & PERF_FORMAT_ID) {
2809 event.header.size += sizeof(u64);
2810 event.format[i++] = primary_counter_id(counter);
2811 }
2812 3046
2813 ret = perf_output_begin(&handle, counter, event.header.size, 0, 0); 3047 ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
2814 if (ret) 3048 if (ret)
2815 return; 3049 return;
2816 3050
2817 perf_output_copy(&handle, &event, event.header.size); 3051 perf_output_put(&handle, event);
3052 perf_output_read(&handle, counter);
3053
2818 perf_output_end(&handle); 3054 perf_output_end(&handle);
2819} 3055}
2820 3056
2821/* 3057/*
2822 * fork tracking 3058 * task tracking -- fork/exit
3059 *
3060 * enabled by: attr.comm | attr.mmap | attr.task
2823 */ 3061 */
2824 3062
2825struct perf_fork_event { 3063struct perf_task_event {
2826 struct task_struct *task; 3064 struct task_struct *task;
3065 struct perf_counter_context *task_ctx;
2827 3066
2828 struct { 3067 struct {
2829 struct perf_event_header header; 3068 struct perf_event_header header;
2830 3069
2831 u32 pid; 3070 u32 pid;
2832 u32 ppid; 3071 u32 ppid;
3072 u32 tid;
3073 u32 ptid;
2833 } event; 3074 } event;
2834}; 3075};
2835 3076
2836static void perf_counter_fork_output(struct perf_counter *counter, 3077static void perf_counter_task_output(struct perf_counter *counter,
2837 struct perf_fork_event *fork_event) 3078 struct perf_task_event *task_event)
2838{ 3079{
2839 struct perf_output_handle handle; 3080 struct perf_output_handle handle;
2840 int size = fork_event->event.header.size; 3081 int size = task_event->event.header.size;
2841 struct task_struct *task = fork_event->task; 3082 struct task_struct *task = task_event->task;
2842 int ret = perf_output_begin(&handle, counter, size, 0, 0); 3083 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2843 3084
2844 if (ret) 3085 if (ret)
2845 return; 3086 return;
2846 3087
2847 fork_event->event.pid = perf_counter_pid(counter, task); 3088 task_event->event.pid = perf_counter_pid(counter, task);
2848 fork_event->event.ppid = perf_counter_pid(counter, task->real_parent); 3089 task_event->event.ppid = perf_counter_pid(counter, current);
3090
3091 task_event->event.tid = perf_counter_tid(counter, task);
3092 task_event->event.ptid = perf_counter_tid(counter, current);
2849 3093
2850 perf_output_put(&handle, fork_event->event); 3094 perf_output_put(&handle, task_event->event);
2851 perf_output_end(&handle); 3095 perf_output_end(&handle);
2852} 3096}
2853 3097
2854static int perf_counter_fork_match(struct perf_counter *counter) 3098static int perf_counter_task_match(struct perf_counter *counter)
2855{ 3099{
2856 if (counter->attr.comm || counter->attr.mmap) 3100 if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
2857 return 1; 3101 return 1;
2858 3102
2859 return 0; 3103 return 0;
2860} 3104}
2861 3105
2862static void perf_counter_fork_ctx(struct perf_counter_context *ctx, 3106static void perf_counter_task_ctx(struct perf_counter_context *ctx,
2863 struct perf_fork_event *fork_event) 3107 struct perf_task_event *task_event)
2864{ 3108{
2865 struct perf_counter *counter; 3109 struct perf_counter *counter;
2866 3110
@@ -2869,54 +3113,62 @@ static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
2869 3113
2870 rcu_read_lock(); 3114 rcu_read_lock();
2871 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { 3115 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2872 if (perf_counter_fork_match(counter)) 3116 if (perf_counter_task_match(counter))
2873 perf_counter_fork_output(counter, fork_event); 3117 perf_counter_task_output(counter, task_event);
2874 } 3118 }
2875 rcu_read_unlock(); 3119 rcu_read_unlock();
2876} 3120}
2877 3121
2878static void perf_counter_fork_event(struct perf_fork_event *fork_event) 3122static void perf_counter_task_event(struct perf_task_event *task_event)
2879{ 3123{
2880 struct perf_cpu_context *cpuctx; 3124 struct perf_cpu_context *cpuctx;
2881 struct perf_counter_context *ctx; 3125 struct perf_counter_context *ctx = task_event->task_ctx;
2882 3126
2883 cpuctx = &get_cpu_var(perf_cpu_context); 3127 cpuctx = &get_cpu_var(perf_cpu_context);
2884 perf_counter_fork_ctx(&cpuctx->ctx, fork_event); 3128 perf_counter_task_ctx(&cpuctx->ctx, task_event);
2885 put_cpu_var(perf_cpu_context); 3129 put_cpu_var(perf_cpu_context);
2886 3130
2887 rcu_read_lock(); 3131 rcu_read_lock();
2888 /* 3132 if (!ctx)
2889 * doesn't really matter which of the child contexts the 3133 ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
2890 * events ends up in.
2891 */
2892 ctx = rcu_dereference(current->perf_counter_ctxp);
2893 if (ctx) 3134 if (ctx)
2894 perf_counter_fork_ctx(ctx, fork_event); 3135 perf_counter_task_ctx(ctx, task_event);
2895 rcu_read_unlock(); 3136 rcu_read_unlock();
2896} 3137}
2897 3138
2898void perf_counter_fork(struct task_struct *task) 3139static void perf_counter_task(struct task_struct *task,
3140 struct perf_counter_context *task_ctx,
3141 int new)
2899{ 3142{
2900 struct perf_fork_event fork_event; 3143 struct perf_task_event task_event;
2901 3144
2902 if (!atomic_read(&nr_comm_counters) && 3145 if (!atomic_read(&nr_comm_counters) &&
2903 !atomic_read(&nr_mmap_counters)) 3146 !atomic_read(&nr_mmap_counters) &&
3147 !atomic_read(&nr_task_counters))
2904 return; 3148 return;
2905 3149
2906 fork_event = (struct perf_fork_event){ 3150 task_event = (struct perf_task_event){
2907 .task = task, 3151 .task = task,
2908 .event = { 3152 .task_ctx = task_ctx,
3153 .event = {
2909 .header = { 3154 .header = {
2910 .type = PERF_EVENT_FORK, 3155 .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
2911 .misc = 0, 3156 .misc = 0,
2912 .size = sizeof(fork_event.event), 3157 .size = sizeof(task_event.event),
2913 }, 3158 },
2914 /* .pid */ 3159 /* .pid */
2915 /* .ppid */ 3160 /* .ppid */
3161 /* .tid */
3162 /* .ptid */
2916 }, 3163 },
2917 }; 3164 };
2918 3165
2919 perf_counter_fork_event(&fork_event); 3166 perf_counter_task_event(&task_event);
3167}
3168
3169void perf_counter_fork(struct task_struct *task)
3170{
3171 perf_counter_task(task, NULL, 1);
2920} 3172}
2921 3173
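Consumers of the mmap ring buffer now see PERF_EVENT_FORK and PERF_EVENT_EXIT records with the layout of struct perf_task_event above, with EXIT emitted only after the child's counters have been unscheduled. A matching user-space declaration, as a sketch; struct perf_event_header is reproduced from the perf_counter ABI and is an assumption here (declarations only, nothing to run):

/* Sketch: user-space view of the task records emitted above. */
#include <stdint.h>

struct perf_event_header {
	uint32_t type;		/* PERF_EVENT_FORK or PERF_EVENT_EXIT */
	uint16_t misc;
	uint16_t size;
};

struct perf_task_record {
	struct perf_event_header header;
	uint32_t pid;		/* perf_counter_pid() of the task */
	uint32_t ppid;		/* pid of current at emit time */
	uint32_t tid;
	uint32_t ptid;
};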
2922/* 3174/*
@@ -3305,125 +3557,111 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
3305 * Generic software counter infrastructure 3557 * Generic software counter infrastructure
3306 */ 3558 */
3307 3559
3308static void perf_swcounter_update(struct perf_counter *counter) 3560/*
3561 * We directly increment counter->count and keep a second value in
3562 * counter->hw.period_left to count intervals. This period counter
3563 * is kept in the range [-sample_period, 0] so that we can use the
3564 * sign as trigger.
3565 */
3566
3567static u64 perf_swcounter_set_period(struct perf_counter *counter)
3309{ 3568{
3310 struct hw_perf_counter *hwc = &counter->hw; 3569 struct hw_perf_counter *hwc = &counter->hw;
3311 u64 prev, now; 3570 u64 period = hwc->last_period;
3312 s64 delta; 3571 u64 nr, offset;
3572 s64 old, val;
3573
3574 hwc->last_period = hwc->sample_period;
3313 3575
3314again: 3576again:
3315 prev = atomic64_read(&hwc->prev_count); 3577 old = val = atomic64_read(&hwc->period_left);
3316 now = atomic64_read(&hwc->count); 3578 if (val < 0)
3317 if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) 3579 return 0;
3318 goto again;
3319 3580
3320 delta = now - prev; 3581 nr = div64_u64(period + val, period);
3582 offset = nr * period;
3583 val -= offset;
3584 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3585 goto again;
3321 3586
3322 atomic64_add(delta, &counter->count); 3587 return nr;
3323 atomic64_sub(delta, &hwc->period_left);
3324} 3588}
3325 3589
3326static void perf_swcounter_set_period(struct perf_counter *counter) 3590static void perf_swcounter_overflow(struct perf_counter *counter,
3591 int nmi, struct perf_sample_data *data)
3327{ 3592{
3328 struct hw_perf_counter *hwc = &counter->hw; 3593 struct hw_perf_counter *hwc = &counter->hw;
3329 s64 left = atomic64_read(&hwc->period_left); 3594 u64 overflow;
3330 s64 period = hwc->sample_period;
3331 3595
3332 if (unlikely(left <= -period)) { 3596 data->period = counter->hw.last_period;
3333 left = period; 3597 overflow = perf_swcounter_set_period(counter);
3334 atomic64_set(&hwc->period_left, left);
3335 hwc->last_period = period;
3336 }
3337 3598
3338 if (unlikely(left <= 0)) { 3599 if (hwc->interrupts == MAX_INTERRUPTS)
3339 left += period; 3600 return;
3340 atomic64_add(period, &hwc->period_left);
3341 hwc->last_period = period;
3342 }
3343 3601
3344 atomic64_set(&hwc->prev_count, -left); 3602 for (; overflow; overflow--) {
3345 atomic64_set(&hwc->count, -left); 3603 if (perf_counter_overflow(counter, nmi, data)) {
3604 /*
3605 * We inhibit the overflow from happening when
3606 * hwc->interrupts == MAX_INTERRUPTS.
3607 */
3608 break;
3609 }
3610 }
3346} 3611}
3347 3612
3348static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) 3613static void perf_swcounter_unthrottle(struct perf_counter *counter)
3349{ 3614{
3350 enum hrtimer_restart ret = HRTIMER_RESTART;
3351 struct perf_sample_data data;
3352 struct perf_counter *counter;
3353 u64 period;
3354
3355 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3356 counter->pmu->read(counter);
3357
3358 data.addr = 0;
3359 data.regs = get_irq_regs();
3360 /* 3615 /*
3361 * In case we exclude kernel IPs or are somehow not in interrupt 3616 * Nothing to do, we already reset hwc->interrupts.
3362 * context, provide the next best thing, the user IP.
3363 */ 3617 */
3364 if ((counter->attr.exclude_kernel || !data.regs) && 3618}
3365 !counter->attr.exclude_user)
3366 data.regs = task_pt_regs(current);
3367 3619
3368 if (data.regs) { 3620static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3369 if (perf_counter_overflow(counter, 0, &data)) 3621 int nmi, struct perf_sample_data *data)
3370 ret = HRTIMER_NORESTART; 3622{
3371 } 3623 struct hw_perf_counter *hwc = &counter->hw;
3372 3624
3373 period = max_t(u64, 10000, counter->hw.sample_period); 3625 atomic64_add(nr, &counter->count);
3374 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3375 3626
3376 return ret; 3627 if (!hwc->sample_period)
3377} 3628 return;
3378 3629
3379static void perf_swcounter_overflow(struct perf_counter *counter, 3630 if (!data->regs)
3380 int nmi, struct perf_sample_data *data) 3631 return;
3381{
3382 data->period = counter->hw.last_period;
3383 3632
3384 perf_swcounter_update(counter); 3633 if (!atomic64_add_negative(nr, &hwc->period_left))
3385 perf_swcounter_set_period(counter); 3634 perf_swcounter_overflow(counter, nmi, data);
3386 if (perf_counter_overflow(counter, nmi, data))
3387 /* soft-disable the counter */
3388 ;
3389} 3635}
3390 3636
3391static int perf_swcounter_is_counting(struct perf_counter *counter) 3637static int perf_swcounter_is_counting(struct perf_counter *counter)
3392{ 3638{
3393 struct perf_counter_context *ctx; 3639 /*
3394 unsigned long flags; 3640 * The counter is active, we're good!
3395 int count; 3641 */
3396
3397 if (counter->state == PERF_COUNTER_STATE_ACTIVE) 3642 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
3398 return 1; 3643 return 1;
3399 3644
3645 /*
3646 * The counter is off/error, not counting.
3647 */
3400 if (counter->state != PERF_COUNTER_STATE_INACTIVE) 3648 if (counter->state != PERF_COUNTER_STATE_INACTIVE)
3401 return 0; 3649 return 0;
3402 3650
3403 /* 3651 /*
3404 * If the counter is inactive, it could be just because 3652 * The counter is inactive, if the context is active
3405 * its task is scheduled out, or because it's in a group 3653 * we're part of a group that didn't make it on the 'pmu',
3406 * which could not go on the PMU. We want to count in 3654 * not counting.
3407 * the first case but not the second. If the context is
3408 * currently active then an inactive software counter must
3409 * be the second case. If it's not currently active then
3410 * we need to know whether the counter was active when the
3411 * context was last active, which we can determine by
3412 * comparing counter->tstamp_stopped with ctx->time.
3413 *
3414 * We are within an RCU read-side critical section,
3415 * which protects the existence of *ctx.
3416 */ 3655 */
3417 ctx = counter->ctx; 3656 if (counter->ctx->is_active)
3418 spin_lock_irqsave(&ctx->lock, flags); 3657 return 0;
3419 count = 1; 3658
3420 /* Re-check state now we have the lock */ 3659 /*
3421 if (counter->state < PERF_COUNTER_STATE_INACTIVE || 3660 * We're inactive and the context is too, this means the
3422 counter->ctx->is_active || 3661 * task is scheduled out, we're counting events that happen
3423 counter->tstamp_stopped < ctx->time) 3662 * to us, like migration events.
3424 count = 0; 3663 */
3425 spin_unlock_irqrestore(&ctx->lock, flags); 3664 return 1;
3426 return count;
3427} 3665}
3428 3666
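The comment above perf_swcounter_set_period() keeps hwc->period_left in the range [-sample_period, 0] and uses the sign flip in perf_swcounter_add() as the overflow trigger. The arithmetic can be checked in isolation; the following is a plain user-space simulation of that bookkeeping, with ordinary integers standing in for the atomics (a sketch, not kernel code):

/* Sketch: simulate the period_left bookkeeping used above.
 * period_left is kept in [-period, 0]; adding events drives it towards
 * zero, and once it goes non-negative the number of whole periods that
 * elapsed is (period + left) / period, after which left is pulled back
 * into the negative range. */
#include <stdio.h>
#include <stdint.h>

static uint64_t set_period(int64_t *period_left, uint64_t period)
{
	int64_t val = *period_left;
	uint64_t nr, offset;

	if (val < 0)
		return 0;

	nr = ((uint64_t)val + period) / period;
	offset = nr * period;
	*period_left = val - (int64_t)offset;
	return nr;		/* number of overflows to report */
}

int main(void)
{
	const uint64_t period = 100;
	int64_t period_left = -(int64_t)period;	/* as after enable */
	uint64_t total_overflows = 0;
	int i;

	for (i = 0; i < 10; i++) {
		period_left += 37;		/* perf_swcounter_add(..., 37) */
		if (period_left >= 0)		/* add_negative() returned false */
			total_overflows += set_period(&period_left, period);
	}

	printf("overflows=%llu period_left=%lld\n",
	       (unsigned long long)total_overflows, (long long)period_left);
	return 0;
}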
3429static int perf_swcounter_match(struct perf_counter *counter, 3667static int perf_swcounter_match(struct perf_counter *counter,
@@ -3449,15 +3687,6 @@ static int perf_swcounter_match(struct perf_counter *counter,
3449 return 1; 3687 return 1;
3450} 3688}
3451 3689
3452static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3453 int nmi, struct perf_sample_data *data)
3454{
3455 int neg = atomic64_add_negative(nr, &counter->hw.count);
3456
3457 if (counter->hw.sample_period && !neg && data->regs)
3458 perf_swcounter_overflow(counter, nmi, data);
3459}
3460
3461static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, 3690static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3462 enum perf_type_id type, 3691 enum perf_type_id type,
3463 u32 event, u64 nr, int nmi, 3692 u32 event, u64 nr, int nmi,
@@ -3536,27 +3765,66 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi,
3536 3765
3537static void perf_swcounter_read(struct perf_counter *counter) 3766static void perf_swcounter_read(struct perf_counter *counter)
3538{ 3767{
3539 perf_swcounter_update(counter);
3540} 3768}
3541 3769
3542static int perf_swcounter_enable(struct perf_counter *counter) 3770static int perf_swcounter_enable(struct perf_counter *counter)
3543{ 3771{
3544 perf_swcounter_set_period(counter); 3772 struct hw_perf_counter *hwc = &counter->hw;
3773
3774 if (hwc->sample_period) {
3775 hwc->last_period = hwc->sample_period;
3776 perf_swcounter_set_period(counter);
3777 }
3545 return 0; 3778 return 0;
3546} 3779}
3547 3780
3548static void perf_swcounter_disable(struct perf_counter *counter) 3781static void perf_swcounter_disable(struct perf_counter *counter)
3549{ 3782{
3550 perf_swcounter_update(counter);
3551} 3783}
3552 3784
3553static const struct pmu perf_ops_generic = { 3785static const struct pmu perf_ops_generic = {
3554 .enable = perf_swcounter_enable, 3786 .enable = perf_swcounter_enable,
3555 .disable = perf_swcounter_disable, 3787 .disable = perf_swcounter_disable,
3556 .read = perf_swcounter_read, 3788 .read = perf_swcounter_read,
3789 .unthrottle = perf_swcounter_unthrottle,
3557}; 3790};
3558 3791
3559/* 3792/*
3793 * hrtimer based swcounter callback
3794 */
3795
3796static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
3797{
3798 enum hrtimer_restart ret = HRTIMER_RESTART;
3799 struct perf_sample_data data;
3800 struct perf_counter *counter;
3801 u64 period;
3802
3803 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3804 counter->pmu->read(counter);
3805
3806 data.addr = 0;
3807 data.regs = get_irq_regs();
3808 /*
3809 * In case we exclude kernel IPs or are somehow not in interrupt
3810 * context, provide the next best thing, the user IP.
3811 */
3812 if ((counter->attr.exclude_kernel || !data.regs) &&
3813 !counter->attr.exclude_user)
3814 data.regs = task_pt_regs(current);
3815
3816 if (data.regs) {
3817 if (perf_counter_overflow(counter, 0, &data))
3818 ret = HRTIMER_NORESTART;
3819 }
3820
3821 period = max_t(u64, 10000, counter->hw.sample_period);
3822 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3823
3824 return ret;
3825}
3826
3827/*
3560 * Software counter: cpu wall time clock 3828 * Software counter: cpu wall time clock
3561 */ 3829 */
3562 3830
@@ -3673,17 +3941,24 @@ static const struct pmu perf_ops_task_clock = {
3673}; 3941};
3674 3942
3675#ifdef CONFIG_EVENT_PROFILE 3943#ifdef CONFIG_EVENT_PROFILE
3676void perf_tpcounter_event(int event_id) 3944void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
3945 int entry_size)
3677{ 3946{
3947 struct perf_raw_record raw = {
3948 .size = entry_size,
3949 .data = record,
3950 };
3951
3678 struct perf_sample_data data = { 3952 struct perf_sample_data data = {
3679 .regs = get_irq_regs(), 3953 .regs = get_irq_regs(),
3680 .addr = 0, 3954 .addr = addr,
3955 .raw = &raw,
3681 }; 3956 };
3682 3957
3683 if (!data.regs) 3958 if (!data.regs)
3684 data.regs = task_pt_regs(current); 3959 data.regs = task_pt_regs(current);
3685 3960
3686 do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data); 3961 do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
3687} 3962}
3688EXPORT_SYMBOL_GPL(perf_tpcounter_event); 3963EXPORT_SYMBOL_GPL(perf_tpcounter_event);
3689 3964
@@ -3697,6 +3972,15 @@ static void tp_perf_counter_destroy(struct perf_counter *counter)
3697 3972
3698static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) 3973static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3699{ 3974{
3975 /*
3976 * Raw tracepoint data is a severe data leak, only allow root to
3977 * have these.
3978 */
3979 if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
3980 perf_paranoid_tracepoint_raw() &&
3981 !capable(CAP_SYS_ADMIN))
3982 return ERR_PTR(-EPERM);
3983
3700 if (ftrace_profile_enable(counter->attr.config)) 3984 if (ftrace_profile_enable(counter->attr.config))
3701 return NULL; 3985 return NULL;
3702 3986
@@ -3826,13 +4110,14 @@ perf_counter_alloc(struct perf_counter_attr *attr,
3826 hwc->sample_period = attr->sample_period; 4110 hwc->sample_period = attr->sample_period;
3827 if (attr->freq && attr->sample_freq) 4111 if (attr->freq && attr->sample_freq)
3828 hwc->sample_period = 1; 4112 hwc->sample_period = 1;
4113 hwc->last_period = hwc->sample_period;
3829 4114
3830 atomic64_set(&hwc->period_left, hwc->sample_period); 4115 atomic64_set(&hwc->period_left, hwc->sample_period);
3831 4116
3832 /* 4117 /*
3833 * we currently do not support PERF_SAMPLE_GROUP on inherited counters 4118 * we currently do not support PERF_FORMAT_GROUP on inherited counters
3834 */ 4119 */
3835 if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) 4120 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
3836 goto done; 4121 goto done;
3837 4122
3838 switch (attr->type) { 4123 switch (attr->type) {
@@ -3875,6 +4160,8 @@ done:
3875 atomic_inc(&nr_mmap_counters); 4160 atomic_inc(&nr_mmap_counters);
3876 if (counter->attr.comm) 4161 if (counter->attr.comm)
3877 atomic_inc(&nr_comm_counters); 4162 atomic_inc(&nr_comm_counters);
4163 if (counter->attr.task)
4164 atomic_inc(&nr_task_counters);
3878 } 4165 }
3879 4166
3880 return counter; 4167 return counter;
@@ -3959,6 +4246,57 @@ err_size:
3959 goto out; 4246 goto out;
3960} 4247}
3961 4248
4249int perf_counter_set_output(struct perf_counter *counter, int output_fd)
4250{
4251 struct perf_counter *output_counter = NULL;
4252 struct file *output_file = NULL;
4253 struct perf_counter *old_output;
4254 int fput_needed = 0;
4255 int ret = -EINVAL;
4256
4257 if (!output_fd)
4258 goto set;
4259
4260 output_file = fget_light(output_fd, &fput_needed);
4261 if (!output_file)
4262 return -EBADF;
4263
4264 if (output_file->f_op != &perf_fops)
4265 goto out;
4266
4267 output_counter = output_file->private_data;
4268
4269 /* Don't chain output fds */
4270 if (output_counter->output)
4271 goto out;
4272
4273 /* Don't set an output fd when we already have an output channel */
4274 if (counter->data)
4275 goto out;
4276
4277 atomic_long_inc(&output_file->f_count);
4278
4279set:
4280 mutex_lock(&counter->mmap_mutex);
4281 old_output = counter->output;
4282 rcu_assign_pointer(counter->output, output_counter);
4283 mutex_unlock(&counter->mmap_mutex);
4284
4285 if (old_output) {
4286 /*
4287 * we need to make sure no existing perf_output_*()
4288 * is still referencing this counter.
4289 */
4290 synchronize_rcu();
4291 fput(old_output->filp);
4292 }
4293
4294 ret = 0;
4295out:
4296 fput_light(output_file, fput_needed);
4297 return ret;
4298}
4299
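perf_counter_set_output() backs both the new PERF_COUNTER_IOC_SET_OUTPUT ioctl and the PERF_FLAG_FD_OUTPUT open flag: samples from one counter are redirected into another counter's buffer, chaining is refused, and a counter that already has its own mmap buffer cannot be redirected. A fragment showing the ioctl path from user space (opening the counters via the perf_counter_open syscall is outside this sketch, and the 2.6.31-era <linux/perf_counter.h> header is an assumption):

/* Sketch: route fd_source's samples into fd_target's buffer.
 * Assumes <linux/perf_counter.h> from this kernel series provides
 * PERF_COUNTER_IOC_SET_OUTPUT. fd_source must not already have an
 * mmap buffer of its own, and fd_target must not itself be redirected,
 * per the checks in perf_counter_set_output() above. */
#include <sys/ioctl.h>
#include <linux/perf_counter.h>	/* assumption: 2.6.31-era header */
#include <stdio.h>

static int redirect_output(int fd_source, int fd_target)
{
	if (ioctl(fd_source, PERF_COUNTER_IOC_SET_OUTPUT, fd_target) < 0) {
		perror("PERF_COUNTER_IOC_SET_OUTPUT");
		return -1;
	}
	return 0;
}

/* Passing 0 as the target clears the redirection again, matching the
 * `if (!output_fd) goto set;` branch in perf_counter_set_output(). */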
3962/** 4300/**
3963 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu 4301 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
3964 * 4302 *
@@ -3978,15 +4316,15 @@ SYSCALL_DEFINE5(perf_counter_open,
3978 struct file *group_file = NULL; 4316 struct file *group_file = NULL;
3979 int fput_needed = 0; 4317 int fput_needed = 0;
3980 int fput_needed2 = 0; 4318 int fput_needed2 = 0;
3981 int ret; 4319 int err;
3982 4320
3983 /* for future expandability... */ 4321 /* for future expandability... */
3984 if (flags) 4322 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
3985 return -EINVAL; 4323 return -EINVAL;
3986 4324
3987 ret = perf_copy_attr(attr_uptr, &attr); 4325 err = perf_copy_attr(attr_uptr, &attr);
3988 if (ret) 4326 if (err)
3989 return ret; 4327 return err;
3990 4328
3991 if (!attr.exclude_kernel) { 4329 if (!attr.exclude_kernel) {
3992 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) 4330 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
@@ -4009,8 +4347,8 @@ SYSCALL_DEFINE5(perf_counter_open,
4009 * Look up the group leader (we will attach this counter to it): 4347 * Look up the group leader (we will attach this counter to it):
4010 */ 4348 */
4011 group_leader = NULL; 4349 group_leader = NULL;
4012 if (group_fd != -1) { 4350 if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
4013 ret = -EINVAL; 4351 err = -EINVAL;
4014 group_file = fget_light(group_fd, &fput_needed); 4352 group_file = fget_light(group_fd, &fput_needed);
4015 if (!group_file) 4353 if (!group_file)
4016 goto err_put_context; 4354 goto err_put_context;
@@ -4039,18 +4377,24 @@ SYSCALL_DEFINE5(perf_counter_open,
4039 4377
4040 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, 4378 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
4041 NULL, GFP_KERNEL); 4379 NULL, GFP_KERNEL);
4042 ret = PTR_ERR(counter); 4380 err = PTR_ERR(counter);
4043 if (IS_ERR(counter)) 4381 if (IS_ERR(counter))
4044 goto err_put_context; 4382 goto err_put_context;
4045 4383
4046 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); 4384 err = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
4047 if (ret < 0) 4385 if (err < 0)
4048 goto err_free_put_context; 4386 goto err_free_put_context;
4049 4387
4050 counter_file = fget_light(ret, &fput_needed2); 4388 counter_file = fget_light(err, &fput_needed2);
4051 if (!counter_file) 4389 if (!counter_file)
4052 goto err_free_put_context; 4390 goto err_free_put_context;
4053 4391
4392 if (flags & PERF_FLAG_FD_OUTPUT) {
4393 err = perf_counter_set_output(counter, group_fd);
4394 if (err)
4395 goto err_fput_free_put_context;
4396 }
4397
4054 counter->filp = counter_file; 4398 counter->filp = counter_file;
4055 WARN_ON_ONCE(ctx->parent_ctx); 4399 WARN_ON_ONCE(ctx->parent_ctx);
4056 mutex_lock(&ctx->mutex); 4400 mutex_lock(&ctx->mutex);
@@ -4064,20 +4408,20 @@ SYSCALL_DEFINE5(perf_counter_open,
4064 list_add_tail(&counter->owner_entry, &current->perf_counter_list); 4408 list_add_tail(&counter->owner_entry, &current->perf_counter_list);
4065 mutex_unlock(&current->perf_counter_mutex); 4409 mutex_unlock(&current->perf_counter_mutex);
4066 4410
4411err_fput_free_put_context:
4067 fput_light(counter_file, fput_needed2); 4412 fput_light(counter_file, fput_needed2);
4068 4413
4069out_fput:
4070 fput_light(group_file, fput_needed);
4071
4072 return ret;
4073
4074err_free_put_context: 4414err_free_put_context:
4075 kfree(counter); 4415 if (err < 0)
4416 kfree(counter);
4076 4417
4077err_put_context: 4418err_put_context:
4078 put_ctx(ctx); 4419 if (err < 0)
4420 put_ctx(ctx);
4421
4422 fput_light(group_file, fput_needed);
4079 4423
4080 goto out_fput; 4424 return err;
4081} 4425}
4082 4426
4083/* 4427/*
@@ -4236,8 +4580,10 @@ void perf_counter_exit_task(struct task_struct *child)
4236 struct perf_counter_context *child_ctx; 4580 struct perf_counter_context *child_ctx;
4237 unsigned long flags; 4581 unsigned long flags;
4238 4582
4239 if (likely(!child->perf_counter_ctxp)) 4583 if (likely(!child->perf_counter_ctxp)) {
4584 perf_counter_task(child, NULL, 0);
4240 return; 4585 return;
4586 }
4241 4587
4242 local_irq_save(flags); 4588 local_irq_save(flags);
4243 /* 4589 /*
@@ -4262,8 +4608,14 @@ void perf_counter_exit_task(struct task_struct *child)
4262 * the counters from it. 4608 * the counters from it.
4263 */ 4609 */
4264 unclone_ctx(child_ctx); 4610 unclone_ctx(child_ctx);
4265 spin_unlock(&child_ctx->lock); 4611 spin_unlock_irqrestore(&child_ctx->lock, flags);
4266 local_irq_restore(flags); 4612
4613 /*
4614 * Report the task dead after unscheduling the counters so that we
4615 * won't get any samples after PERF_EVENT_EXIT. We can however still
4616 * get a few PERF_EVENT_READ events.
4617 */
4618 perf_counter_task(child, child_ctx, 0);
4267 4619
4268 /* 4620 /*
4269 * We can recurse on the same lock type through: 4621 * We can recurse on the same lock type through:
@@ -4484,6 +4836,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4484 perf_counter_init_cpu(cpu); 4836 perf_counter_init_cpu(cpu);
4485 break; 4837 break;
4486 4838
4839 case CPU_ONLINE:
4840 case CPU_ONLINE_FROZEN:
4841 hw_perf_counter_setup_online(cpu);
4842 break;
4843
4487 case CPU_DOWN_PREPARE: 4844 case CPU_DOWN_PREPARE:
4488 case CPU_DOWN_PREPARE_FROZEN: 4845 case CPU_DOWN_PREPARE_FROZEN:
4489 perf_counter_exit_cpu(cpu); 4846 perf_counter_exit_cpu(cpu);
@@ -4508,6 +4865,8 @@ void __init perf_counter_init(void)
4508{ 4865{
4509 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, 4866 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
4510 (void *)(long)smp_processor_id()); 4867 (void *)(long)smp_processor_id());
4868 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
4869 (void *)(long)smp_processor_id());
4511 register_cpu_notifier(&perf_cpu_nb); 4870 register_cpu_notifier(&perf_cpu_nb);
4512} 4871}
4513 4872