author    Peter Zijlstra <a.p.zijlstra@chello.nl>    2009-05-01 06:23:16 -0400
committer Ingo Molnar <mingo@elte.hu>                2009-05-01 07:23:43 -0400
commit    c33a0bc4e41ef169d6e807d8abb9502544b518e5
tree      f7ba55205352cd91a4f86999710008e89932ef10   /kernel/perf_counter.c
parent    3c56999eec7acc105a31b4546c94aad2fb844b13
perf_counter: fix race in perf_output_*
When two (or more) contexts output to the same buffer, it is possible to
observe half written output.

Suppose we have CPU0 doing perf_counter_mmap(), CPU1 doing
perf_counter_overflow(). If CPU1 does a wakeup and exposes head to
user-space, then CPU2 can observe the data CPU0 is still writing.

[ Impact: fix occasionally corrupted profiling records ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090501102533.007821627@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
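To make the race concrete, here is a minimal user-space model of the write path, a sketch only: it uses C11 atomics in place of the kernel primitives, and the names ring, ring_reserve and ring_write_racy are illustrative, not taken from the kernel. Writers reserve space with a compare-and-swap on head, copy their record, and then publish data_head to the reader; if a later writer publishes a head that still covers a region an earlier writer is copying, the reader can observe half-written records.

#include <stdatomic.h>
#include <string.h>

#define BUF_SIZE 4096

struct ring {
        _Atomic unsigned int head;       /* write-reservation cursor          */
        _Atomic unsigned int data_head;  /* position published to the reader  */
        unsigned char buf[BUF_SIZE];
};

/* Reserve `size` bytes with a CAS loop, much like perf_output_begin() does. */
static unsigned int ring_reserve(struct ring *r, unsigned int size)
{
        unsigned int offset, new_head;

        do {
                offset = atomic_load(&r->head);
                new_head = offset + size;
        } while (!atomic_compare_exchange_weak(&r->head, &offset, new_head));

        return offset;
}

/*
 * The racy pattern the patch removes: every writer publishes the head as
 * soon as *it* is done.  A later, faster writer can therefore expose bytes
 * an earlier, slower writer is still filling in.  The release store plays
 * the role of the kernel's smp_wmb(); wrap-around is ignored for brevity.
 */
static void ring_write_racy(struct ring *r, const void *rec, unsigned int size)
{
        unsigned int offset = ring_reserve(r, size);

        memcpy(r->buf + (offset % BUF_SIZE), rec, size);
        atomic_store_explicit(&r->data_head, offset + size,
                              memory_order_release);
}

The patch closes this window by serializing writers per CPU (perf_output_lock()) and deferring the data_head update to the outer-most writer in perf_output_unlock(), as shown in the diff below.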
Diffstat (limited to 'kernel/perf_counter.c')
-rw-r--r--  kernel/perf_counter.c | 130 ++++++++++++++++++++++++++++----------
1 file changed, 101 insertions(+), 29 deletions(-)
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 75f2b6c82392..8660ae579530 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1279,14 +1279,12 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 {
        struct perf_counter *counter = file->private_data;
        struct perf_mmap_data *data;
-       unsigned int events;
+       unsigned int events = POLL_HUP;
 
        rcu_read_lock();
        data = rcu_dereference(counter->data);
        if (data)
-               events = atomic_xchg(&data->wakeup, 0);
-       else
-               events = POLL_HUP;
+               events = atomic_xchg(&data->poll, 0);
        rcu_read_unlock();
 
        poll_wait(file, &counter->waitq, wait);
@@ -1568,22 +1566,6 @@ static const struct file_operations perf_fops = {
 
 void perf_counter_wakeup(struct perf_counter *counter)
 {
-       struct perf_mmap_data *data;
-
-       rcu_read_lock();
-       data = rcu_dereference(counter->data);
-       if (data) {
-               atomic_set(&data->wakeup, POLL_IN);
-               /*
-                * Ensure all data writes are issued before updating the
-                * user-space data head information. The matching rmb()
-                * will be in userspace after reading this value.
-                */
-               smp_wmb();
-               data->user_page->data_head = atomic_read(&data->head);
-       }
-       rcu_read_unlock();
-
        wake_up_all(&counter->waitq);
 
        if (counter->pending_kill) {
@@ -1721,10 +1703,14 @@ struct perf_output_handle {
        int             wakeup;
        int             nmi;
        int             overflow;
+       int             locked;
+       unsigned long   flags;
 };
 
-static inline void __perf_output_wakeup(struct perf_output_handle *handle)
+static void perf_output_wakeup(struct perf_output_handle *handle)
 {
+       atomic_set(&handle->data->poll, POLL_IN);
+
        if (handle->nmi) {
                handle->counter->pending_wakeup = 1;
                perf_pending_queue(&handle->counter->pending,
@@ -1733,6 +1719,86 @@ static inline void __perf_output_wakeup(struct perf_output_handle *handle)
                perf_counter_wakeup(handle->counter);
 }
 
+/*
+ * Curious locking construct.
+ *
+ * We need to ensure a later event doesn't publish a head when a former
+ * event isn't done writing. However since we need to deal with NMIs we
+ * cannot fully serialize things.
+ *
+ * What we do is serialize between CPUs so we only have to deal with NMI
+ * nesting on a single CPU.
+ *
+ * We only publish the head (and generate a wakeup) when the outer-most
+ * event completes.
+ */
+static void perf_output_lock(struct perf_output_handle *handle)
+{
+       struct perf_mmap_data *data = handle->data;
+       int cpu;
+
+       handle->locked = 0;
+
+       local_irq_save(handle->flags);
+       cpu = smp_processor_id();
+
+       if (in_nmi() && atomic_read(&data->lock) == cpu)
+               return;
+
+       while (atomic_cmpxchg(&data->lock, 0, cpu) != 0)
+               cpu_relax();
+
+       handle->locked = 1;
+}
+
+static void perf_output_unlock(struct perf_output_handle *handle)
+{
+       struct perf_mmap_data *data = handle->data;
+       int head, cpu;
+
+       if (handle->wakeup)
+               data->wakeup_head = data->head;
+
+       if (!handle->locked)
+               goto out;
+
+again:
+       /*
+        * The xchg implies a full barrier that ensures all writes are done
+        * before we publish the new head, matched by a rmb() in userspace when
+        * reading this position.
+        */
+       while ((head = atomic_xchg(&data->wakeup_head, 0))) {
+               data->user_page->data_head = head;
+               handle->wakeup = 1;
+       }
+
+       /*
+        * NMI can happen here, which means we can miss a wakeup_head update.
+        */
+
+       cpu = atomic_xchg(&data->lock, 0);
+       WARN_ON_ONCE(cpu != smp_processor_id());
+
+       /*
+        * Therefore we have to validate we did not indeed do so.
+        */
+       if (unlikely(atomic_read(&data->wakeup_head))) {
+               /*
+                * Since we had it locked, we can lock it again.
+                */
+               while (atomic_cmpxchg(&data->lock, 0, cpu) != 0)
+                       cpu_relax();
+
+               goto again;
+       }
+
+       if (handle->wakeup)
+               perf_output_wakeup(handle);
+out:
+       local_irq_restore(handle->flags);
+}
+
 static int perf_output_begin(struct perf_output_handle *handle,
                             struct perf_counter *counter, unsigned int size,
                             int nmi, int overflow)
@@ -1745,6 +1811,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
        if (!data)
                goto out;
 
+       handle->data = data;
        handle->counter = counter;
        handle->nmi = nmi;
        handle->overflow = overflow;
@@ -1752,12 +1819,13 @@ static int perf_output_begin(struct perf_output_handle *handle,
        if (!data->nr_pages)
                goto fail;
 
+       perf_output_lock(handle);
+
        do {
                offset = head = atomic_read(&data->head);
                head += size;
        } while (atomic_cmpxchg(&data->head, offset, head) != offset);
 
-       handle->data = data;
        handle->offset = offset;
        handle->head = head;
        handle->wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);
@@ -1765,7 +1833,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
        return 0;
 
 fail:
-       __perf_output_wakeup(handle);
+       perf_output_wakeup(handle);
 out:
        rcu_read_unlock();
 
@@ -1809,16 +1877,20 @@ static void perf_output_copy(struct perf_output_handle *handle,
 
 static void perf_output_end(struct perf_output_handle *handle)
 {
-       int wakeup_events = handle->counter->hw_event.wakeup_events;
+       struct perf_counter *counter = handle->counter;
+       struct perf_mmap_data *data = handle->data;
+
+       int wakeup_events = counter->hw_event.wakeup_events;
 
        if (handle->overflow && wakeup_events) {
-               int events = atomic_inc_return(&handle->data->events);
+               int events = atomic_inc_return(&data->events);
                if (events >= wakeup_events) {
-                       atomic_sub(wakeup_events, &handle->data->events);
-                       __perf_output_wakeup(handle);
+                       atomic_sub(wakeup_events, &data->events);
+                       handle->wakeup = 1;
                }
-       } else if (handle->wakeup)
-               __perf_output_wakeup(handle);
+       }
+
+       perf_output_unlock(handle);
        rcu_read_unlock();
 }
 
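The comment added in perf_output_unlock() notes that the barrier before publishing data_head is "matched by a rmb() in userspace when reading this position". For completeness, here is a hedged sketch of that user-space half, which is not part of this patch: the struct below is a stand-in for the real mmap'ed header page, and a C11 acquire load stands in for the rmb().

#include <stdatomic.h>

/* Stand-in for the header page mapped at offset 0 of the perf mmap area. */
struct mmap_header_stub {
        _Atomic unsigned long data_head;
};

/*
 * Read the published head.  The acquire load is the user-space rmb() the
 * kernel comment refers to: it pairs with the barrier the kernel issues
 * before updating data_head, so every record byte up to the returned
 * position is visible to the reader once this load completes.
 */
static unsigned long read_data_head(struct mmap_header_stub *hdr)
{
        return atomic_load_explicit(&hdr->data_head, memory_order_acquire);
}

With the per-CPU lock in place, only the outer-most writer ever advances data_head, so a reader that respects this barrier can no longer see a half-written record.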