author		Anton Blanchard <anton@samba.org>	2013-10-14 06:03:58 -0400
committer	Benjamin Herrenschmidt <benh@kernel.crashing.org>	2013-10-30 01:02:28 -0400
commit		ef1313deafb7baa6d3382044e962d5ad5e8c8dd6 (patch)
tree		30584552d8b2907bb8360a7d4e1cab28e3215585 /arch/powerpc/lib
parent		07fb41a7525539d7ad37c25f2a2689fd95a6ab68 (diff)
powerpc: Add VMX optimised xor for RAID5
Add a VMX optimised xor, used primarily for RAID5. On a POWER7 blade
this is a decent win:
32regs : 17932.800 MB/sec
altivec : 19724.800 MB/sec
The bigger gain comes when the same test is run in SMT4 mode, as it
would be if there were a lot of work going on:
8regs : 8377.600 MB/sec
altivec : 15801.600 MB/sec
I tested this against an array created without the patch, and also
verified it worked as expected on a little endian kernel.
[ Fix !CONFIG_ALTIVEC build -- BenH ]
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc/lib')
-rw-r--r--	arch/powerpc/lib/Makefile	|   3
-rw-r--r--	arch/powerpc/lib/xor_vmx.c	| 177
2 files changed, 180 insertions, 0 deletions
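For context: the "8regs", "32regs" and "altivec" names in the benchmark above are the names of xor_block_template entries that the kernel's generic xor_blocks() code benchmarks at boot, picking the fastest one for RAID5 parity work. The actual registration of the new routines lives in the powerpc asm/xor.h header, which falls outside this diffstat's 'arch/powerpc/lib' filter; the following is only a minimal sketch of what that hookup would look like, assuming the prototypes exported by xor_vmx.c below:

#include <linux/raid/xor.h>	/* struct xor_block_template */

/* Prototypes of the routines added by this patch (see xor_vmx.c below). */
void xor_altivec_2(unsigned long bytes, unsigned long *v1_in,
		   unsigned long *v2_in);
void xor_altivec_3(unsigned long bytes, unsigned long *v1_in,
		   unsigned long *v2_in, unsigned long *v3_in);
void xor_altivec_4(unsigned long bytes, unsigned long *v1_in,
		   unsigned long *v2_in, unsigned long *v3_in,
		   unsigned long *v4_in);
void xor_altivec_5(unsigned long bytes, unsigned long *v1_in,
		   unsigned long *v2_in, unsigned long *v3_in,
		   unsigned long *v4_in, unsigned long *v5_in);

/*
 * Sketch only (not part of this diff): registering the VMX routines as an
 * xor template lets the boot-time benchmark compare "altivec" against the
 * generic "8regs"/"32regs" implementations and keep the fastest.
 */
static struct xor_block_template xor_block_altivec = {
	.name	= "altivec",
	.do_2	= xor_altivec_2,
	.do_3	= xor_altivec_3,
	.do_4	= xor_altivec_4,
	.do_5	= xor_altivec_5,
};

When the CPU has Altivec, the arch's XOR_TRY_TEMPLATES would presumably add this entry to the candidates that xor_blocks() later dispatches to.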
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 5310132856c1..95a20e17dbff 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -39,3 +39,6 @@ obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o
 obj-y			+= code-patching.o
 obj-y			+= feature-fixups.o
 obj-$(CONFIG_FTR_FIXUP_SELFTEST) += feature-fixups-test.o
+
+obj-$(CONFIG_ALTIVEC)	+= xor_vmx.o
+CFLAGS_xor_vmx.o += -maltivec -mabi=altivec
diff --git a/arch/powerpc/lib/xor_vmx.c b/arch/powerpc/lib/xor_vmx.c
new file mode 100644
index 000000000000..e905f7c2ea7b
--- /dev/null
+++ b/arch/powerpc/lib/xor_vmx.c
@@ -0,0 +1,177 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2012
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <altivec.h>
+
+#include <linux/preempt.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <asm/switch_to.h>
+
+typedef vector signed char unative_t;
+
+#define DEFINE(V)				\
+	unative_t *V = (unative_t *)V##_in;	\
+	unative_t V##_0, V##_1, V##_2, V##_3
+
+#define LOAD(V)			\
+	do {			\
+		V##_0 = V[0];	\
+		V##_1 = V[1];	\
+		V##_2 = V[2];	\
+		V##_3 = V[3];	\
+	} while (0)
+
+#define STORE(V)		\
+	do {			\
+		V[0] = V##_0;	\
+		V[1] = V##_1;	\
+		V[2] = V##_2;	\
+		V[3] = V##_3;	\
+	} while (0)
+
+#define XOR(V1, V2)					\
+	do {						\
+		V1##_0 = vec_xor(V1##_0, V2##_0);	\
+		V1##_1 = vec_xor(V1##_1, V2##_1);	\
+		V1##_2 = vec_xor(V1##_2, V2##_2);	\
+		V1##_3 = vec_xor(V1##_3, V2##_3);	\
+	} while (0)
+
+void xor_altivec_2(unsigned long bytes, unsigned long *v1_in,
+		   unsigned long *v2_in)
+{
+	DEFINE(v1);
+	DEFINE(v2);
+	unsigned long lines = bytes / (sizeof(unative_t)) / 4;
+
+	preempt_disable();
+	enable_kernel_altivec();
+
+	do {
+		LOAD(v1);
+		LOAD(v2);
+		XOR(v1, v2);
+		STORE(v1);
+
+		v1 += 4;
+		v2 += 4;
+	} while (--lines > 0);
+
+	preempt_enable();
+}
+EXPORT_SYMBOL(xor_altivec_2);
+
+void xor_altivec_3(unsigned long bytes, unsigned long *v1_in,
+		   unsigned long *v2_in, unsigned long *v3_in)
+{
+	DEFINE(v1);
+	DEFINE(v2);
+	DEFINE(v3);
+	unsigned long lines = bytes / (sizeof(unative_t)) / 4;
+
+	preempt_disable();
+	enable_kernel_altivec();
+
+	do {
+		LOAD(v1);
+		LOAD(v2);
+		LOAD(v3);
+		XOR(v1, v2);
+		XOR(v1, v3);
+		STORE(v1);
+
+		v1 += 4;
+		v2 += 4;
+		v3 += 4;
+	} while (--lines > 0);
+
+	preempt_enable();
+}
+EXPORT_SYMBOL(xor_altivec_3);
+
+void xor_altivec_4(unsigned long bytes, unsigned long *v1_in,
+		   unsigned long *v2_in, unsigned long *v3_in,
+		   unsigned long *v4_in)
+{
+	DEFINE(v1);
+	DEFINE(v2);
+	DEFINE(v3);
+	DEFINE(v4);
+	unsigned long lines = bytes / (sizeof(unative_t)) / 4;
+
+	preempt_disable();
+	enable_kernel_altivec();
+
+	do {
+		LOAD(v1);
+		LOAD(v2);
+		LOAD(v3);
+		LOAD(v4);
+		XOR(v1, v2);
+		XOR(v3, v4);
+		XOR(v1, v3);
+		STORE(v1);
+
+		v1 += 4;
+		v2 += 4;
+		v3 += 4;
+		v4 += 4;
+	} while (--lines > 0);
+
+	preempt_enable();
+}
+EXPORT_SYMBOL(xor_altivec_4);
+
+void xor_altivec_5(unsigned long bytes, unsigned long *v1_in,
+		   unsigned long *v2_in, unsigned long *v3_in,
+		   unsigned long *v4_in, unsigned long *v5_in)
+{
+	DEFINE(v1);
+	DEFINE(v2);
+	DEFINE(v3);
+	DEFINE(v4);
+	DEFINE(v5);
+	unsigned long lines = bytes / (sizeof(unative_t)) / 4;
+
+	preempt_disable();
+	enable_kernel_altivec();
+
+	do {
+		LOAD(v1);
+		LOAD(v2);
+		LOAD(v3);
+		LOAD(v4);
+		LOAD(v5);
+		XOR(v1, v2);
+		XOR(v3, v4);
+		XOR(v1, v5);
+		XOR(v1, v3);
+		STORE(v1);
+
+		v1 += 4;
+		v2 += 4;
+		v3 += 4;
+		v4 += 4;
+		v5 += 4;
+	} while (--lines > 0);
+
+	preempt_enable();
+}
+EXPORT_SYMBOL(xor_altivec_5);
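Because the implementation is built almost entirely from token-pasting macros, the generated code can be hard to see at a glance. The following standalone sketch (illustration only, not part of the patch, with the kernel preempt/Altivec-state calls omitted) shows roughly what xor_altivec_2() looks like once DEFINE/LOAD/XOR/STORE are expanded; each loop iteration XORs 4 x 16-byte vectors, i.e. 64 bytes, from each buffer:

#include <altivec.h>

typedef vector signed char unative_t;

/* Expanded form of xor_altivec_2(), for illustration only. */
void xor_2_expanded(unsigned long bytes, unsigned long *v1_in,
		    unsigned long *v2_in)
{
	unative_t *v1 = (unative_t *)v1_in;	/* DEFINE(v1) */
	unative_t *v2 = (unative_t *)v2_in;	/* DEFINE(v2) */
	unsigned long lines = bytes / sizeof(unative_t) / 4;

	do {
		/* LOAD(v1), LOAD(v2): read 4 vectors (64 bytes) from each */
		unative_t v1_0 = v1[0], v1_1 = v1[1], v1_2 = v1[2], v1_3 = v1[3];
		unative_t v2_0 = v2[0], v2_1 = v2[1], v2_2 = v2[2], v2_3 = v2[3];

		/* XOR(v1, v2): four independent vec_xor operations */
		v1_0 = vec_xor(v1_0, v2_0);
		v1_1 = vec_xor(v1_1, v2_1);
		v1_2 = vec_xor(v1_2, v2_2);
		v1_3 = vec_xor(v1_3, v2_3);

		/* STORE(v1): write the result back over the first buffer */
		v1[0] = v1_0;
		v1[1] = v1_1;
		v1[2] = v1_2;
		v1[3] = v1_3;

		v1 += 4;
		v2 += 4;
	} while (--lines > 0);
}

In the real routines, enable_kernel_altivec() makes the VMX unit usable from kernel context, and the surrounding preempt_disable()/preempt_enable() pair keeps the kernel's vector state from being lost to a context switch mid-loop.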