author		Peter Zijlstra <a.p.zijlstra@chello.nl>	2009-05-01 06:23:16 -0400
committer	Ingo Molnar <mingo@elte.hu>	2009-05-01 07:23:43 -0400
commit		c33a0bc4e41ef169d6e807d8abb9502544b518e5 (patch)
tree		f7ba55205352cd91a4f86999710008e89932ef10 /kernel/perf_counter.c
parent		3c56999eec7acc105a31b4546c94aad2fb844b13 (diff)
perf_counter: fix race in perf_output_*
When two (or more) contexts output to the same buffer, it is possible
to observe half written output.
Suppose we have CPU0 doing perf_counter_mmap(), CPU1 doing
perf_counter_overflow(). If CPU1 does a wakeup and exposes head to
user-space, then user-space can observe the data CPU0 is still writing.
[ Impact: fix occasionally corrupted profiling records ]
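For reference, a minimal sketch of the user-space read side that the published
head pairs with (the control-page struct name and field layout are assumed from
the perf_counter mmap ABI of this era, not taken from this patch; mmap_base is
a hypothetical pointer to the first mmap()ed page):

	struct perf_counter_mmap_page *pg = mmap_base;	/* control page */
	unsigned long head;

	head = pg->data_head;	/* head the kernel has published */
	rmb();			/* pairs with the kernel's barrier before the
				 * data_head update; without it the reader may
				 * see records that are still being written */
	/* ... consume records in the data pages up to 'head' ... */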
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090501102533.007821627@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/perf_counter.c')
-rw-r--r--	kernel/perf_counter.c	130
1 file changed, 101 insertions, 29 deletions
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 75f2b6c82392..8660ae579530 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1279,14 +1279,12 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 {
 	struct perf_counter *counter = file->private_data;
 	struct perf_mmap_data *data;
-	unsigned int events;
+	unsigned int events = POLL_HUP;
 
 	rcu_read_lock();
 	data = rcu_dereference(counter->data);
 	if (data)
-		events = atomic_xchg(&data->wakeup, 0);
-	else
-		events = POLL_HUP;
+		events = atomic_xchg(&data->poll, 0);
 	rcu_read_unlock();
 
 	poll_wait(file, &counter->waitq, wait);
@@ -1568,22 +1566,6 @@ static const struct file_operations perf_fops = {
 
 void perf_counter_wakeup(struct perf_counter *counter)
 {
-	struct perf_mmap_data *data;
-
-	rcu_read_lock();
-	data = rcu_dereference(counter->data);
-	if (data) {
-		atomic_set(&data->wakeup, POLL_IN);
-		/*
-		 * Ensure all data writes are issued before updating the
-		 * user-space data head information. The matching rmb()
-		 * will be in userspace after reading this value.
-		 */
-		smp_wmb();
-		data->user_page->data_head = atomic_read(&data->head);
-	}
-	rcu_read_unlock();
-
 	wake_up_all(&counter->waitq);
 
 	if (counter->pending_kill) {
@@ -1721,10 +1703,14 @@ struct perf_output_handle {
 	int			wakeup;
 	int			nmi;
 	int			overflow;
+	int			locked;
+	unsigned long		flags;
 };
 
-static inline void __perf_output_wakeup(struct perf_output_handle *handle)
+static void perf_output_wakeup(struct perf_output_handle *handle)
 {
+	atomic_set(&handle->data->poll, POLL_IN);
+
 	if (handle->nmi) {
 		handle->counter->pending_wakeup = 1;
 		perf_pending_queue(&handle->counter->pending,
@@ -1733,6 +1719,86 @@ static inline void __perf_output_wakeup(struct perf_output_handle *handle)
 		perf_counter_wakeup(handle->counter);
 }
 
+/*
+ * Curious locking construct.
+ *
+ * We need to ensure a later event doesn't publish a head when a former
+ * event isn't done writing. However since we need to deal with NMIs we
+ * cannot fully serialize things.
+ *
+ * What we do is serialize between CPUs so we only have to deal with NMI
+ * nesting on a single CPU.
+ *
+ * We only publish the head (and generate a wakeup) when the outer-most
+ * event completes.
+ */
+static void perf_output_lock(struct perf_output_handle *handle)
+{
+	struct perf_mmap_data *data = handle->data;
+	int cpu;
+
+	handle->locked = 0;
+
+	local_irq_save(handle->flags);
+	cpu = smp_processor_id();
+
+	if (in_nmi() && atomic_read(&data->lock) == cpu)
+		return;
+
+	while (atomic_cmpxchg(&data->lock, 0, cpu) != 0)
+		cpu_relax();
+
+	handle->locked = 1;
+}
+
+static void perf_output_unlock(struct perf_output_handle *handle)
+{
+	struct perf_mmap_data *data = handle->data;
+	int head, cpu;
+
+	if (handle->wakeup)
+		data->wakeup_head = data->head;
+
+	if (!handle->locked)
+		goto out;
+
+again:
+	/*
+	 * The xchg implies a full barrier that ensures all writes are done
+	 * before we publish the new head, matched by a rmb() in userspace when
+	 * reading this position.
+	 */
+	while ((head = atomic_xchg(&data->wakeup_head, 0))) {
+		data->user_page->data_head = head;
+		handle->wakeup = 1;
+	}
+
+	/*
+	 * NMI can happen here, which means we can miss a wakeup_head update.
+	 */
+
+	cpu = atomic_xchg(&data->lock, 0);
+	WARN_ON_ONCE(cpu != smp_processor_id());
+
+	/*
+	 * Therefore we have to validate we did not indeed do so.
+	 */
+	if (unlikely(atomic_read(&data->wakeup_head))) {
+		/*
+		 * Since we had it locked, we can lock it again.
+		 */
+		while (atomic_cmpxchg(&data->lock, 0, cpu) != 0)
+			cpu_relax();
+
+		goto again;
+	}
+
+	if (handle->wakeup)
+		perf_output_wakeup(handle);
+out:
+	local_irq_restore(handle->flags);
+}
+
 static int perf_output_begin(struct perf_output_handle *handle,
 			     struct perf_counter *counter, unsigned int size,
 			     int nmi, int overflow)
@@ -1745,6 +1811,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	if (!data)
 		goto out;
 
+	handle->data = data;
 	handle->counter = counter;
 	handle->nmi = nmi;
 	handle->overflow = overflow;
@@ -1752,12 +1819,13 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	if (!data->nr_pages)
 		goto fail;
 
+	perf_output_lock(handle);
+
 	do {
 		offset = head = atomic_read(&data->head);
 		head += size;
 	} while (atomic_cmpxchg(&data->head, offset, head) != offset);
 
-	handle->data = data;
 	handle->offset = offset;
 	handle->head = head;
 	handle->wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);
@@ -1765,7 +1833,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	return 0;
 
 fail:
-	__perf_output_wakeup(handle);
+	perf_output_wakeup(handle);
 out:
 	rcu_read_unlock();
 
@@ -1809,16 +1877,20 @@ static void perf_output_copy(struct perf_output_handle *handle,
 
 static void perf_output_end(struct perf_output_handle *handle)
 {
-	int wakeup_events = handle->counter->hw_event.wakeup_events;
+	struct perf_counter *counter = handle->counter;
+	struct perf_mmap_data *data = handle->data;
+
+	int wakeup_events = counter->hw_event.wakeup_events;
 
 	if (handle->overflow && wakeup_events) {
-		int events = atomic_inc_return(&handle->data->events);
+		int events = atomic_inc_return(&data->events);
 		if (events >= wakeup_events) {
-			atomic_sub(wakeup_events, &handle->data->events);
-			__perf_output_wakeup(handle);
+			atomic_sub(wakeup_events, &data->events);
+			handle->wakeup = 1;
 		}
-	} else if (handle->wakeup)
-		__perf_output_wakeup(handle);
+	}
 
+	perf_output_unlock(handle);
 	rcu_read_unlock();
 }
 
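As a usage note, the output path around these helpers now follows a
begin/copy/end pattern; the sketch below is a hedged illustration of a caller
after this patch (the record type, the perf_output_copy() argument order and
the error handling are assumptions, not shown by this diff):

	struct perf_output_handle handle;
	struct perf_event_header header;	/* hypothetical record to emit */

	/* reserves space in the ring buffer and takes the per-CPU output lock */
	if (perf_output_begin(&handle, counter, sizeof(header), nmi, 1 /* overflow */))
		return;				/* no buffer, or no pages mapped */

	/* copy record data; the new head is not yet visible to user-space */
	perf_output_copy(&handle, &header, sizeof(header));

	/* drops the lock; only the outer-most writer publishes data_head and
	 * generates the wakeup, so readers never see half-written records */
	perf_output_end(&handle);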