1 files changed, 73 insertions, 0 deletions
diff --git a/tools/include/linux/ring_buffer.h b/tools/include/linux/ring_buffer.h
new file mode 100644
index 000000000000..9a083ae60473
--- /dev/null
+++ b/tools/include/linux/ring_buffer.h
@@ -0,0 +1,73 @@
+#ifndef _TOOLS_LINUX_RING_BUFFER_H_
+#define _TOOLS_LINUX_RING_BUFFER_H_
+#include <asm/barrier.h>
+/*
+ * Contract with kernel for walking the perf ring buffer from
+ * user space requires the following barrier pairing (quote
+ * from kernel/events/ring_buffer.c):
+ *
+ *   Since the mmap() consumer (userspace) can run on a
+ *   different CPU:
+ *
+ *   kernel                             user
+ *
+ *   if (LOAD ->data_tail) {            LOAD ->data_head
+ *                      (A)             smp_rmb()       (C)
+ *      STORE $data                     LOAD $data
+ *      smp_wmb()       (B)             smp_mb()        (D)
+ *      STORE ->data_head               STORE ->data_tail
+ *   }
+ *
+ *   Where A pairs with D, and B pairs with C.
+ *
+ *   In our case A is a control dependency that separates the
+ *   load of the ->data_tail and the stores of $data. In case
+ *   ->data_tail indicates there is no room in the buffer to
+ *   store $data we do not.
+ *
+ *   D needs to be a full barrier since it separates the data
+ *   READ from the tail WRITE.
+ *
+ *   For B a WMB is sufficient since it separates two WRITEs,
+ *   and for C an RMB is sufficient since it separates two READs.
+ *
+ * Note, instead of B, C, D we could also use smp_store_release()
+ * in B and D as well as smp_load_acquire() in C.
+ *
+ * However, this optimization does not make sense for all kernel
+ * supported architectures since for a fair number it would
+ * resolve into READ_ONCE() + smp_mb() pair for smp_load_acquire(),
+ * and smp_mb() + WRITE_ONCE() pair for smp_store_release().
+ *
+ * Thus, for those architectures, smp_wmb() in B and smp_rmb() in C
+ * would still be less expensive. For the case of D, a store-release
+ * has either the same cost or is less expensive; for example, due
+ * to TSO, x86 can avoid the CPU barrier entirely.
+ */
+static inline u64 ring_buffer_read_head(struct perf_event_mmap_page *base)
+{
+        u64 head;
+
+/*
+ * Barrier (C) from the table above: fetch ->data_head so that the
+ * subsequent loads of $data cannot be reordered before it.
+ *
+ * Use smp_load_acquire() only on the architectures listed here, where
+ * it does not fall back to a READ_ONCE() + smp_mb() pair; everywhere
+ * else READ_ONCE() followed by smp_rmb() is the less expensive choice.
+ */
+#if defined(__x86_64__) || defined(__aarch64__) || \
+    defined(__powerpc64__) || defined(__ia64__) || \
+    (defined(__sparc__) && defined(__arch64__))
+        head = smp_load_acquire(&base->data_head);
+#else
+        head = READ_ONCE(base->data_head);
+        smp_rmb();
+#endif
+        return head;
+}
+/*
+ * Barrier (D) from the table above: publish the consumer position in
+ * ->data_tail with release semantics, so that the preceding reads of
+ * $data are ordered before the store of the new tail. A release store
+ * is used on every architecture because, for D, it has either the
+ * same cost as or is less expensive than a full smp_mb().
+ */
+static inline void
+ring_buffer_write_tail(struct perf_event_mmap_page *base, u64 tail)
+{
+        smp_store_release(&base->data_tail, tail);
+}
+#endif /* _TOOLS_LINUX_RING_BUFFER_H_ */

diff --git a/tools/include/linux/ring_buffer.h b/tools/include/linux/ring_buffer.h new file mode 100644 index 000000000000..9a083ae60473 --- /dev/null +++ b/tools/include/linux/ring_buffer.h
@@ -0,0 +1,73 @@
	1	#ifndef _TOOLS_LINUX_RING_BUFFER_H_
	2	#define _TOOLS_LINUX_RING_BUFFER_H_
	3
	4	#include <asm/barrier.h>
	5
	6	/*
	7	* Contract with kernel for walking the perf ring buffer from
	8	* user space requires the following barrier pairing (quote
	9	* from kernel/events/ring_buffer.c):
	10	*
	11	* Since the mmap() consumer (userspace) can run on a
	12	* different CPU:
	13	*
	14	* kernel user
	15	*
	16	* if (LOAD ->data_tail) { LOAD ->data_head
	17	* (A) smp_rmb() (C)
	18	* STORE $data LOAD $data
	19	* smp_wmb() (B) smp_mb() (D)
	20	* STORE ->data_head STORE ->data_tail
	21	* }
	22	*
	23	* Where A pairs with D, and B pairs with C.
	24	*
	25	* In our case A is a control dependency that separates the
	26	* load of the ->data_tail and the stores of $data. In case
	27	* ->data_tail indicates there is no room in the buffer to
	28	* store $data we do not.
	29	*
	30	* D needs to be a full barrier since it separates the data
	31	* READ from the tail WRITE.
	32	*
	33	* For B a WMB is sufficient since it separates two WRITEs,
	34	* and for C an RMB is sufficient since it separates two READs.
	35	*
	36	* Note, instead of B, C, D we could also use smp_store_release()
	37	* in B and D as well as smp_load_acquire() in C.
	38	*
	39	* However, this optimization does not make sense for all kernel
	40	* supported architectures since for a fair number it would
	41	* resolve into READ_ONCE() + smp_mb() pair for smp_load_acquire(),
	42	* and smp_mb() + WRITE_ONCE() pair for smp_store_release().
	43	*
	44	* Thus for those smp_wmb() in B and smp_rmb() in C would still
	45	* be less expensive. For the case of D this has either the same
	46	* cost or is less expensive, for example, due to TSO x86 can
	47	* avoid the CPU barrier entirely.
	48	*/
	49
	50	static inline u64 ring_buffer_read_head(struct perf_event_mmap_page *base)
	51	{
	52	/*
	53	* Architectures where smp_load_acquire() does not fallback to
	54	* READ_ONCE() + smp_mb() pair.
	55	*/
	56	#if defined(__x86_64__) || defined(__aarch64__) || defined(__powerpc64__) || \
	57	defined(__ia64__) || defined(__sparc__) && defined(__arch64__)
	58	return smp_load_acquire(&base->data_head);
	59	#else
	60	u64 head = READ_ONCE(base->data_head);
	61
	62	smp_rmb();
	63	return head;
	64	#endif
	65	}
	66
	67	static inline void ring_buffer_write_tail(struct perf_event_mmap_page *base,
	68	u64 tail)
	69	{
	70	smp_store_release(&base->data_tail, tail);
	71	}
	72
	73	#endif /* _TOOLS_LINUX_RING_BUFFER_H_ */