author	Peter Zijlstra <a.p.zijlstra@chello.nl>	2009-03-30 13:07:02 -0400
committer	Ingo Molnar <mingo@elte.hu>	2009-04-06 03:30:36 -0400
commit	925d519ab82b6dd7aca9420d809ee83819c08db2 (patch)
tree	aa05bd7eb607915aa691d5434ec74521b487b466 /kernel/perf_counter.c
parent	53cfbf593758916aac41db728f029986a62f1254 (diff)
perf_counter: unify and fix delayed counter wakeup
While going over the wakeup code I noticed that delayed wakeups only work for hardware counters, yet basically all software counters rely on them. This patch unifies and generalizes the delayed wakeup to fix this issue.

Since we're dealing with NMI context bits here, use a cmpxchg() based singly linked list implementation to track counters that have pending wakeups.

[ This should really be generic code for delayed wakeups, but since we cannot use cmpxchg()/xchg() in generic code, I've let it live in the perf_counter code. -- Eric Dumazet could use it to aggregate the network wakeups. ]

Furthermore, the x86 method of using TIF flags was flawed in that it's quite possible to end up setting the bit on the idle task, losing the wakeup. The powerpc method uses per-CPU storage and does appear to be sufficient.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090330171023.153932974@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
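[ Editor's illustration: to make the queue/drain mechanism concrete, here is a minimal userspace model of the cmpxchg()-based pending list the changelog describes, with C11 atomics standing in for the kernel's cmpxchg()/xchg(). All names (entry, pending_queue, pending_run) are illustrative, not kernel code, and the seq_cst atomics subsume the explicit smp_wmb()/smp_rmb() barriers the real patch needs. ]

#include <stdatomic.h>
#include <stdio.h>

struct entry {
	_Atomic(struct entry *) next;	/* NULL means "not queued" */
	int id;
};

#define PENDING_TAIL ((struct entry *)-1UL)

static _Atomic(struct entry *) head = PENDING_TAIL;

/* Producer side, mirroring perf_pending_queue(): push with CAS only,
 * so it is usable from contexts that cannot take locks. */
static void pending_queue(struct entry *e)
{
	struct entry *expected = NULL;

	/* Claim the entry; a non-NULL next means it is already queued. */
	if (!atomic_compare_exchange_strong(&e->next, &expected, PENDING_TAIL))
		return;

	struct entry *prev = atomic_load(&head);
	do {
		atomic_store(&e->next, prev);	/* link before publishing */
	} while (!atomic_compare_exchange_weak(&head, &prev, e));
}

/* Consumer side, mirroring __perf_pending_run(): detach the whole
 * list with a single exchange, then walk it at leisure. */
static int pending_run(void)
{
	struct entry *list = atomic_exchange(&head, PENDING_TAIL);
	int nr = 0;

	while (list != PENDING_TAIL) {
		struct entry *e = list;

		list = atomic_load(&e->next);
		atomic_store(&e->next, NULL);	/* re-arm: "not queued" */
		printf("wakeup for entry %d\n", e->id);
		nr++;
	}
	return nr;
}

int main(void)
{
	struct entry a = { .next = NULL, .id = 1 };
	struct entry b = { .next = NULL, .id = 2 };

	pending_queue(&a);
	pending_queue(&b);
	pending_queue(&a);	/* no-op: still queued */
	return pending_run() == 2 ? 0 : 1;
}

[ Note the design choice the -1UL tail sentinel buys: even the last queued element has a non-NULL next (the sentinel itself), so next == NULL unambiguously means "off the list", which is exactly what the patch's perf_not_pending() tests. ]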
Diffstat (limited to 'kernel/perf_counter.c')
-rw-r--r--	kernel/perf_counter.c	128
1 file changed, 120 insertions(+), 8 deletions(-)
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 3b862a7988cd..f70ff80e79d7 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1197,8 +1197,12 @@ static void free_counter_rcu(struct rcu_head *head)
 	kfree(counter);
 }
 
+static void perf_pending_sync(struct perf_counter *counter);
+
 static void free_counter(struct perf_counter *counter)
 {
+	perf_pending_sync(counter);
+
 	if (counter->destroy)
 		counter->destroy(counter);
 
@@ -1529,6 +1533,118 @@ static const struct file_operations perf_fops = {
 };
 
 /*
+ * Perf counter wakeup
+ *
+ * If there's data, ensure we set the poll() state and publish everything
+ * to user-space before waking everybody up.
+ */
+
+void perf_counter_wakeup(struct perf_counter *counter)
+{
+	struct perf_mmap_data *data;
+
+	rcu_read_lock();
+	data = rcu_dereference(counter->data);
+	if (data) {
+		(void)atomic_xchg(&data->wakeup, POLL_IN);
+		__perf_counter_update_userpage(counter, data);
+	}
+	rcu_read_unlock();
+
+	wake_up_all(&counter->waitq);
+}
+
+/*
+ * Pending wakeups
+ *
+ * Handle the case where we need to wake up from NMI (or rq->lock) context.
+ *
+ * The NMI bit means we cannot possibly take locks. Therefore, maintain a
+ * singly linked list and use cmpxchg() to add entries locklessly.
+ */
+
+#define PENDING_TAIL ((struct perf_wakeup_entry *)-1UL)
+
+static DEFINE_PER_CPU(struct perf_wakeup_entry *, perf_wakeup_head) = {
+	PENDING_TAIL,
+};
+
+static void perf_pending_queue(struct perf_counter *counter)
+{
+	struct perf_wakeup_entry **head;
+	struct perf_wakeup_entry *prev, *next;
+
+	if (cmpxchg(&counter->wakeup.next, NULL, PENDING_TAIL) != NULL)
+		return;
+
+	head = &get_cpu_var(perf_wakeup_head);
+
+	do {
+		prev = counter->wakeup.next = *head;
+		next = &counter->wakeup;
+	} while (cmpxchg(head, prev, next) != prev);
+
+	set_perf_counter_pending();
+
+	put_cpu_var(perf_wakeup_head);
+}
+
+static int __perf_pending_run(void)
+{
+	struct perf_wakeup_entry *list;
+	int nr = 0;
+
+	list = xchg(&__get_cpu_var(perf_wakeup_head), PENDING_TAIL);
+	while (list != PENDING_TAIL) {
+		struct perf_counter *counter = container_of(list,
+				struct perf_counter, wakeup);
+
+		list = list->next;
+
+		counter->wakeup.next = NULL;
+		/*
+		 * Ensure we observe the unqueue before we issue the wakeup,
+		 * so that we won't be waiting forever.
+		 * -- see perf_not_pending().
+		 */
+		smp_wmb();
+
+		perf_counter_wakeup(counter);
+		nr++;
+	}
+
+	return nr;
+}
+
+static inline int perf_not_pending(struct perf_counter *counter)
+{
+	/*
+	 * If we flush on whatever cpu we run, there is a chance we don't
+	 * need to wait.
+	 */
+	get_cpu();
+	__perf_pending_run();
+	put_cpu();
+
+	/*
+	 * Ensure we see the proper queue state before going to sleep
+	 * so that we do not miss the wakeup. -- see __perf_pending_run().
+	 */
+	smp_rmb();
+	return counter->wakeup.next == NULL;
+}
+
+static void perf_pending_sync(struct perf_counter *counter)
+{
+	wait_event(counter->waitq, perf_not_pending(counter));
+}
+
+void perf_counter_do_pending(void)
+{
+	__perf_pending_run();
+}
+
+/*
  * Output
  */
 
@@ -1611,13 +1727,10 @@ static void perf_output_copy(struct perf_output_handle *handle,
 static void perf_output_end(struct perf_output_handle *handle, int nmi)
 {
 	if (handle->wakeup) {
-		(void)atomic_xchg(&handle->data->wakeup, POLL_IN);
-		__perf_counter_update_userpage(handle->counter, handle->data);
-		if (nmi) {
-			handle->counter->wakeup_pending = 1;
-			set_perf_counter_pending();
-		} else
-			wake_up(&handle->counter->waitq);
+		if (nmi)
+			perf_pending_queue(handle->counter);
+		else
+			perf_counter_wakeup(handle->counter);
 	}
 	rcu_read_unlock();
 }
@@ -2211,7 +2324,6 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 
 	counter->cpu = cpu;
 	counter->hw_event = *hw_event;
-	counter->wakeup_pending = 0;
 	counter->group_leader = group_leader;
 	counter->hw_ops = NULL;
 	counter->ctx = ctx;
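
[ Editor's illustration: the patch leans on two hooks it does not define here. set_perf_counter_pending() must be callable from NMI context and merely requests a later callback; the architecture then calls perf_counter_do_pending() from a safe interrupt path to drain the per-CPU list. A hedged sketch of the per-CPU-flag scheme the changelog credits to powerpc follows; perf_pending_flag and arch_check_perf_pending() are illustrative names, not actual kernel symbols. ]

/*
 * Illustrative arch glue only -- assumes the per-CPU flag scheme the
 * changelog describes for powerpc. perf_pending_flag and
 * arch_check_perf_pending() do not exist under these names.
 */
static DEFINE_PER_CPU(int, perf_pending_flag);

void set_perf_counter_pending(void)
{
	/* A plain per-CPU store is NMI-safe: no locks, no cross-CPU
	 * read-modify-write, and the idle-task TIF problem cannot occur. */
	__get_cpu_var(perf_pending_flag) = 1;
}

/* Called from a safe interrupt path, e.g. the timer tick. */
void arch_check_perf_pending(void)
{
	if (__get_cpu_var(perf_pending_flag)) {
		__get_cpu_var(perf_pending_flag) = 0;
		perf_counter_do_pending();	/* drain the per-CPU list */
	}
}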