From 19df0c2fef010e94e90df514aaf4e73f6b80145c Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 25 Jan 2011 14:26:50 +0100 Subject: percpu: align percpu readmostly subsection to cacheline Currently percpu readmostly subsection may share cachelines with other percpu subsections which may result in unnecessary cacheline bounce and performance degradation. This patch adds @cacheline parameter to PERCPU() and PERCPU_VADDR() linker macros, makes each arch linker scripts specify its cacheline size and use it to align percpu subsections. This is based on Shaohua's x86 only patch. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Shaohua Li <shaohua.li@intel.com> --- include/asm-generic/vmlinux.lds.h | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 6ebb81030d2d..439df587c12c 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -15,7 +15,7 @@ * HEAD_TEXT_SECTION * INIT_TEXT_SECTION(PAGE_SIZE) * INIT_DATA_SECTION(...) - * PERCPU(PAGE_SIZE) + * PERCPU(CACHELINE_SIZE, PAGE_SIZE) * __init_end = .; * * _stext = .; @@ -683,13 +683,18 @@ /** * PERCPU_VADDR - define output section for percpu area + * @cacheline: cacheline size * @vaddr: explicit base address (optional) * @phdr: destination PHDR (optional) * - * Macro which expands to output section for percpu area. If @vaddr - * is not blank, it specifies explicit base address and all percpu - * symbols will be offset from the given address. If blank, @vaddr - * always equals @laddr + LOAD_OFFSET. + * Macro which expands to output section for percpu area. + * + * @cacheline is used to align subsections to avoid false cacheline + * sharing between subsections for different purposes. + * + * If @vaddr is not blank, it specifies explicit base address and all + * percpu symbols will be offset from the given address. If blank, + * @vaddr always equals @laddr + LOAD_OFFSET. * * @phdr defines the output PHDR to use if not blank. Be warned that * output PHDR is sticky. If @phdr is specified, the next output @@ -700,7 +705,7 @@ * If there is no need to put the percpu section at a predetermined * address, use PERCPU(). */ -#define PERCPU_VADDR(vaddr, phdr) \ +#define PERCPU_VADDR(cacheline, vaddr, phdr) \ VMLINUX_SYMBOL(__per_cpu_load) = .; \ .data..percpu vaddr : AT(VMLINUX_SYMBOL(__per_cpu_load) \ - LOAD_OFFSET) { \ @@ -708,7 +713,9 @@ *(.data..percpu..first) \ . = ALIGN(PAGE_SIZE); \ *(.data..percpu..page_aligned) \ + . = ALIGN(cacheline); \ *(.data..percpu..readmostly) \ + . = ALIGN(cacheline); \ *(.data..percpu) \ *(.data..percpu..shared_aligned) \ VMLINUX_SYMBOL(__per_cpu_end) = .; \ @@ -717,18 +724,18 @@ /** * PERCPU - define output section for percpu area, simple version + * @cacheline: cacheline size * @align: required alignment * - * Align to @align and outputs output section for percpu area. This - * macro doesn't maniuplate @vaddr or @phdr and __per_cpu_load and + * Align to @align and outputs output section for percpu area. This macro + * doesn't manipulate @vaddr or @phdr and __per_cpu_load and * __per_cpu_start will be identical. * - * This macro is equivalent to ALIGN(align); PERCPU_VADDR( , ) except - * that __per_cpu_load is defined as a relative symbol against - * .data..percpu which is required for relocatable x86_32 - * configuration. 
+ * This macro is equivalent to ALIGN(@align); PERCPU_VADDR(@cacheline,,) + * except that __per_cpu_load is defined as a relative symbol against + * .data..percpu which is required for relocatable x86_32 configuration. */ -#define PERCPU(align) \ +#define PERCPU(cacheline, align) \ . = ALIGN(align); \ .data..percpu : AT(ADDR(.data..percpu) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__per_cpu_load) = .; \ @@ -736,7 +743,9 @@ *(.data..percpu..first) \ . = ALIGN(PAGE_SIZE); \ *(.data..percpu..page_aligned) \ + . = ALIGN(cacheline); \ *(.data..percpu..readmostly) \ + . = ALIGN(cacheline); \ *(.data..percpu) \ *(.data..percpu..shared_aligned) \ VMLINUX_SYMBOL(__per_cpu_end) = .; \ -- cgit v1.2.2 From 7c3343392172ba98d9d90a83edcc4c2e80897009 Mon Sep 17 00:00:00 2001 From: Christoph Lameter <cl@linux.com> Date: Mon, 28 Feb 2011 11:02:24 +0100 Subject: percpu: Generic support for this_cpu_cmpxchg_double() Introduce this_cpu_cmpxchg_double(). this_cpu_cmpxchg_double() allows the comparison between two consecutive words and replaces them if there is a match. bool this_cpu_cmpxchg_double(pcp1, pcp2, old_word1, old_word2, new_word1, new_word2) this_cpu_cmpxchg_double does not return the old value (difficult since there are two words) but a boolean indicating if the operation was successful. The first percpu variable must be double word aligned! -tj: Updated to return bool instead of int, converted size check to BUILD_BUG_ON() instead of VM_BUG_ON() and other cosmetic changes. Signed-off-by: Christoph Lameter <cl@linux.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- include/linux/percpu.h | 128 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) (limited to 'include') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 27c3c6fcfad3..3a5c4449fd36 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -255,6 +255,30 @@ extern void __bad_size_call_parameter(void); pscr2_ret__; \ }) +/* + * Special handling for cmpxchg_double. cmpxchg_double is passed two + * percpu variables. The first has to be aligned to a double word + * boundary and the second has to follow directly thereafter. + */ +#define __pcpu_double_call_return_bool(stem, pcp1, pcp2, ...) \ +({ \ + bool pdcrb_ret__; \ + __verify_pcpu_ptr(&pcp1); \ + BUILD_BUG_ON(sizeof(pcp1) != sizeof(pcp2)); \ + VM_BUG_ON((unsigned long)(&pcp1) % (2 * sizeof(pcp1))); \ + VM_BUG_ON((unsigned long)(&pcp2) != \ + (unsigned long)(&pcp1) + sizeof(pcp1)); \ + switch(sizeof(pcp1)) { \ + case 1: pdcrb_ret__ = stem##1(pcp1, pcp2, __VA_ARGS__); break; \ + case 2: pdcrb_ret__ = stem##2(pcp1, pcp2, __VA_ARGS__); break; \ + case 4: pdcrb_ret__ = stem##4(pcp1, pcp2, __VA_ARGS__); break; \ + case 8: pdcrb_ret__ = stem##8(pcp1, pcp2, __VA_ARGS__); break; \ + default: \ + __bad_size_call_parameter(); break; \ + } \ + pdcrb_ret__; \ +}) + #define __pcpu_size_call(stem, variable, ...) \ do { \ __verify_pcpu_ptr(&(variable)); \ @@ -500,6 +524,45 @@ do { \ __pcpu_size_call_return2(this_cpu_cmpxchg_, pcp, oval, nval) #endif +/* + * cmpxchg_double replaces two adjacent scalars at once. The first + * two parameters are per cpu variables which have to be of the same + * size. A truth value is returned to indicate success or failure + * (since a double register result is difficult to handle). There is + * very limited hardware support for these operations, so only certain + * sizes may work. 
+ */ +#define _this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \ +({ \ + int ret__; \ + preempt_disable(); \ + ret__ = __this_cpu_generic_cmpxchg_double(pcp1, pcp2, \ + oval1, oval2, nval1, nval2); \ + preempt_enable(); \ + ret__; \ +}) + +#ifndef this_cpu_cmpxchg_double +# ifndef this_cpu_cmpxchg_double_1 +# define this_cpu_cmpxchg_double_1(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + _this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# endif +# ifndef this_cpu_cmpxchg_double_2 +# define this_cpu_cmpxchg_double_2(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + _this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# endif +# ifndef this_cpu_cmpxchg_double_4 +# define this_cpu_cmpxchg_double_4(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + _this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# endif +# ifndef this_cpu_cmpxchg_double_8 +# define this_cpu_cmpxchg_double_8(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + _this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# endif +# define this_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + __pcpu_double_call_return_bool(this_cpu_cmpxchg_double_, (pcp1), (pcp2), (oval1), (oval2), (nval1), (nval2)) +#endif + /* * Generic percpu operations that do not require preemption handling. * Either we do not care about races or the caller has the @@ -703,6 +766,39 @@ do { \ __pcpu_size_call_return2(__this_cpu_cmpxchg_, pcp, oval, nval) #endif +#define __this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \ +({ \ + int __ret = 0; \ + if (__this_cpu_read(pcp1) == (oval1) && \ + __this_cpu_read(pcp2) == (oval2)) { \ + __this_cpu_write(pcp1, (nval1)); \ + __this_cpu_write(pcp2, (nval2)); \ + __ret = 1; \ + } \ + (__ret); \ +}) + +#ifndef __this_cpu_cmpxchg_double +# ifndef __this_cpu_cmpxchg_double_1 +# define __this_cpu_cmpxchg_double_1(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + __this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# endif +# ifndef __this_cpu_cmpxchg_double_2 +# define __this_cpu_cmpxchg_double_2(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + __this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# endif +# ifndef __this_cpu_cmpxchg_double_4 +# define __this_cpu_cmpxchg_double_4(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + __this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# endif +# ifndef __this_cpu_cmpxchg_double_8 +# define __this_cpu_cmpxchg_double_8(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + __this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# endif +# define __this_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + __pcpu_double_call_return_bool(__this_cpu_cmpxchg_double_, (pcp1), (pcp2), (oval1), (oval2), (nval1), (nval2)) +#endif + /* * IRQ safe versions of the per cpu RMW operations. 
Note that these operations * are *not* safe against modification of the same variable from another @@ -823,4 +919,36 @@ do { \ __pcpu_size_call_return2(irqsafe_cpu_cmpxchg_, (pcp), oval, nval) #endif +#define irqsafe_generic_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \ +({ \ + int ret__; \ + unsigned long flags; \ + local_irq_save(flags); \ + ret__ = __this_cpu_generic_cmpxchg_double(pcp1, pcp2, \ + oval1, oval2, nval1, nval2); \ + local_irq_restore(flags); \ + ret__; \ +}) + +#ifndef irqsafe_cpu_cmpxchg_double +# ifndef irqsafe_cpu_cmpxchg_double_1 +# define irqsafe_cpu_cmpxchg_double_1(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + irqsafe_generic_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# endif +# ifndef irqsafe_cpu_cmpxchg_double_2 +# define irqsafe_cpu_cmpxchg_double_2(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + irqsafe_generic_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# endif +# ifndef irqsafe_cpu_cmpxchg_double_4 +# define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + irqsafe_generic_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# endif +# ifndef irqsafe_cpu_cmpxchg_double_8 +# define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + irqsafe_generic_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# endif +# define irqsafe_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + __pcpu_double_call_return_int(irqsafe_cpu_cmpxchg_double_, (pcp1), (pcp2), (oval1), (oval2), (nval1), (nval2)) +#endif + #endif /* __LINUX_PERCPU_H */ -- cgit v1.2.2 From 1a757fe5d4234293d6a3acccd7196f1386443956 Mon Sep 17 00:00:00 2001 From: Christoph Lameter <cl@linux.com> Date: Fri, 25 Feb 2011 11:38:51 -0600 Subject: slub: min_partial needs to be in first cacheline It is used in unfreeze_slab() which is a performance critical function. Signed-off-by: Christoph Lameter <cl@linux.com> Signed-off-by: Pekka Enberg <penberg@kernel.org> --- include/linux/slub_def.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 8b6e8ae5d5ca..875df55ab36d 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -70,6 +70,7 @@ struct kmem_cache { struct kmem_cache_cpu __percpu *cpu_slab; /* Used for retriving partial slabs etc */ unsigned long flags; + unsigned long min_partial; int size; /* The size of an object including meta data */ int objsize; /* The size of an object without meta data */ int offset; /* Free pointer offset. */ @@ -83,7 +84,6 @@ struct kmem_cache { void (*ctor)(void *); int inuse; /* Offset to metadata */ int align; /* Alignment */ - unsigned long min_partial; const char *name; /* Name (only for display!) */ struct list_head list; /* List of slab caches */ #ifdef CONFIG_SYSFS -- cgit v1.2.2 From 8a5ec0ba42c4919e2d8f4c3138cc8b987fdb0b79 Mon Sep 17 00:00:00 2001 From: Christoph Lameter <cl@linux.com> Date: Fri, 25 Feb 2011 11:38:54 -0600 Subject: Lockless (and preemptless) fastpaths for slub Use the this_cpu_cmpxchg_double functionality to implement a lockless allocation algorithm on arches that support fast this_cpu_ops. Each of the per cpu pointers is paired with a transaction id that ensures that updates of the per cpu information can only occur in sequence on a certain cpu. A transaction id is a "long" integer that is comprised of an event number and the cpu number. The event number is incremented for every change to the per cpu state. 
This means that the cmpxchg instruction can verify for an update that nothing interfered and that we are updating the percpu structure for the processor where we picked up the information and that we are also currently on that processor when we update the information. This results in a significant decrease of the overhead in the fastpaths. It also makes it easy to adopt the fast path for realtime kernels since this is lockless and does not require the use of the current per cpu area over the critical section. It is only important that the per cpu area is current at the beginning of the critical section and at the end. So there is no need even to disable preemption. Test results show that the fastpath cycle count is reduced by up to ~ 40% (alloc/free test goes from ~140 cycles down to ~80). The slowpath for kfree adds a few cycles. Sadly this does nothing for the slowpath which is where the main issues with performance in slub are but the best case performance rises significantly. (For that see the more complex slub patches that require cmpxchg_double) Kmalloc: alloc/free test Before: 10000 times kmalloc(8)/kfree -> 134 cycles 10000 times kmalloc(16)/kfree -> 152 cycles 10000 times kmalloc(32)/kfree -> 144 cycles 10000 times kmalloc(64)/kfree -> 142 cycles 10000 times kmalloc(128)/kfree -> 142 cycles 10000 times kmalloc(256)/kfree -> 132 cycles 10000 times kmalloc(512)/kfree -> 132 cycles 10000 times kmalloc(1024)/kfree -> 135 cycles 10000 times kmalloc(2048)/kfree -> 135 cycles 10000 times kmalloc(4096)/kfree -> 135 cycles 10000 times kmalloc(8192)/kfree -> 144 cycles 10000 times kmalloc(16384)/kfree -> 754 cycles After: 10000 times kmalloc(8)/kfree -> 78 cycles 10000 times kmalloc(16)/kfree -> 78 cycles 10000 times kmalloc(32)/kfree -> 82 cycles 10000 times kmalloc(64)/kfree -> 88 cycles 10000 times kmalloc(128)/kfree -> 79 cycles 10000 times kmalloc(256)/kfree -> 79 cycles 10000 times kmalloc(512)/kfree -> 85 cycles 10000 times kmalloc(1024)/kfree -> 82 cycles 10000 times kmalloc(2048)/kfree -> 82 cycles 10000 times kmalloc(4096)/kfree -> 85 cycles 10000 times kmalloc(8192)/kfree -> 82 cycles 10000 times kmalloc(16384)/kfree -> 706 cycles Kmalloc: Repeatedly allocate then free test Before: 10000 times kmalloc(8) -> 211 cycles kfree -> 113 cycles 10000 times kmalloc(16) -> 174 cycles kfree -> 115 cycles 10000 times kmalloc(32) -> 235 cycles kfree -> 129 cycles 10000 times kmalloc(64) -> 222 cycles kfree -> 120 cycles 10000 times kmalloc(128) -> 343 cycles kfree -> 139 cycles 10000 times kmalloc(256) -> 827 cycles kfree -> 147 cycles 10000 times kmalloc(512) -> 1048 cycles kfree -> 272 cycles 10000 times kmalloc(1024) -> 2043 cycles kfree -> 528 cycles 10000 times kmalloc(2048) -> 4002 cycles kfree -> 571 cycles 10000 times kmalloc(4096) -> 7740 cycles kfree -> 628 cycles 10000 times kmalloc(8192) -> 8062 cycles kfree -> 850 cycles 10000 times kmalloc(16384) -> 8895 cycles kfree -> 1249 cycles After: 10000 times kmalloc(8) -> 190 cycles kfree -> 129 cycles 10000 times kmalloc(16) -> 76 cycles kfree -> 123 cycles 10000 times kmalloc(32) -> 126 cycles kfree -> 124 cycles 10000 times kmalloc(64) -> 181 cycles kfree -> 128 cycles 10000 times kmalloc(128) -> 310 cycles kfree -> 140 cycles 10000 times kmalloc(256) -> 809 cycles kfree -> 165 cycles 10000 times kmalloc(512) -> 1005 cycles kfree -> 269 cycles 10000 times kmalloc(1024) -> 1999 cycles kfree -> 527 cycles 10000 times kmalloc(2048) -> 3967 cycles kfree -> 570 cycles 10000 times kmalloc(4096) -> 7658 cycles kfree -> 637 cycles 10000 
times kmalloc(8192) -> 8111 cycles kfree -> 859 cycles 10000 times kmalloc(16384) -> 8791 cycles kfree -> 1173 cycles Signed-off-by: Christoph Lameter <cl@linux.com> Signed-off-by: Pekka Enberg <penberg@kernel.org> --- include/linux/slub_def.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 875df55ab36d..009b0020079d 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -35,7 +35,10 @@ enum stat_item { NR_SLUB_STAT_ITEMS }; struct kmem_cache_cpu { - void **freelist; /* Pointer to first free per cpu object */ + void **freelist; /* Pointer to next available object */ +#ifdef CONFIG_CMPXCHG_LOCAL + unsigned long tid; /* Globally unique transaction id */ +#endif struct page *page; /* The slab from which we are allocating */ int node; /* The node of the page (or -1 for debug) */ #ifdef CONFIG_SLUB_STATS -- cgit v1.2.2
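As a rough usage sketch for the @cacheline parameter added to PERCPU() and PERCPU_VADDR() in the first patch above: an architecture linker script now passes its own cacheline-size constant alongside the existing alignment. The file name and the L1_CACHE_BYTES constant below are placeholders; each architecture supplies whatever cacheline constant it actually exports:

/* arch/xxx/kernel/vmlinux.lds.S -- hypothetical excerpt */
#include <asm-generic/vmlinux.lds.h>
#include <asm/cache.h>			/* L1_CACHE_BYTES (placeholder name)	*/
#include <asm/page.h>			/* PAGE_SIZE				*/

SECTIONS
{
	/* ... init sections ... */

	/*
	 * Was: PERCPU(PAGE_SIZE).  The extra argument lets the generic macro
	 * emit ALIGN(cacheline) before and after .data..percpu..readmostly,
	 * so the readmostly subsection cannot share a cacheline with the
	 * other percpu subsections.
	 */
	PERCPU(L1_CACHE_BYTES, PAGE_SIZE)

	/* ... remaining sections ... */
}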
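The calling convention of this_cpu_cmpxchg_double() from the second patch can be illustrated with a minimal sketch. The struct and function names are invented for illustration; what it demonstrates are the documented constraints: both per-cpu words have the same size, the second directly follows the first, the first is double-word aligned, and the return value is a success/failure boolean rather than the old contents:

#include <linux/percpu.h>
#include <linux/types.h>

struct pcp_pair {
	unsigned long val;	/* first word				*/
	unsigned long gen;	/* second word, directly follows val	*/
} __aligned(2 * sizeof(unsigned long));	/* double-word alignment	*/

static DEFINE_PER_CPU(struct pcp_pair, pcp_pair);

/*
 * Replace (val, gen) with (new_val, old_gen + 1) only if both words still
 * hold the expected old values; returns true on success.
 */
static bool pcp_pair_update(unsigned long old_val, unsigned long old_gen,
			    unsigned long new_val)
{
	return this_cpu_cmpxchg_double(pcp_pair.val, pcp_pair.gen,
				       old_val, old_gen,
				       new_val, old_gen + 1);
}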
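The transaction-id scheme described in the lockless-fastpath patch can be sketched in the same spirit. This is a simplified illustration of the idea, not the actual mm/slub.c code: the names pcp_stack, pcp_pop and TID_STEP are invented, the next-object pointer is assumed to live in the first word of each free object, and a user who also updates the pair from interrupt context would use the irqsafe_cpu_cmpxchg_double() variant instead:

#include <linux/percpu.h>
#include <linux/compiler.h>	/* barrier()			*/
#include <linux/log2.h>		/* roundup_pow_of_two()		*/
#include <linux/types.h>

/*
 * tid layout: the low bits identify the cpu, the high bits count events on
 * that cpu.  Adding TID_STEP preserves the cpu bits and bumps the event
 * count, so a given tid value can only recur on the cpu that created it.
 */
#define TID_STEP	roundup_pow_of_two(CONFIG_NR_CPUS)

struct pcp_stack {
	void **freelist;	/* next free object, or NULL		*/
	unsigned long tid;	/* must directly follow freelist	*/
} __aligned(2 * sizeof(void *));

static DEFINE_PER_CPU(struct pcp_stack, pcp_stack);

static void *pcp_pop(void)
{
	void **object;
	unsigned long tid;

	do {
		/* Snapshot this cpu's state without disabling preemption. */
		tid = this_cpu_read(pcp_stack.tid);
		barrier();	/* read tid before freelist */
		object = this_cpu_read(pcp_stack.freelist);
		if (!object)
			return NULL;
		/*
		 * Publish the new head only if neither word changed, i.e. we
		 * were not migrated to another cpu and nothing else updated
		 * this cpu's state between the reads and the cmpxchg.
		 */
	} while (!this_cpu_cmpxchg_double(pcp_stack.freelist, pcp_stack.tid,
					  object, tid,
					  (void **)*object, tid + TID_STEP));
	return object;
}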