author     Jim Kukunas <james.t.kukunas@linux.intel.com>    2012-05-21 23:54:04 -0400
committer  NeilBrown <neilb@suse.de>                        2012-05-21 23:54:04 -0400
commit     ea4d26ae24e58fbd2c61de9242adab053cb982d8 (patch)
tree       3115dd168f0cf1eb1eb5dd6aecc385cfa0e8bc05 /arch/x86/include/asm/xor_avx.h
parent     56a519913eeba2bdae4d7ee39e80fab442c3836c (diff)
raid5: add AVX optimized RAID5 checksumming
Optimize RAID5 xor checksumming by taking advantage of
256-bit YMM registers introduced in AVX.
Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
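
For reference, each xor_avx_N() routine added below XORs N-1 source buffers into the first buffer, which is how RAID5 parity is accumulated; the AVX versions do this 32 bytes at a time in YMM registers, 512 bytes per loop iteration. A plain C equivalent of the three-buffer case might look like the sketch below (an illustration only, not part of this patch; xor_scalar_3 is a hypothetical name):

static void xor_scalar_3(unsigned long bytes, unsigned long *p0,
        unsigned long *p1, unsigned long *p2)
{
        unsigned long i;

        /* bytes is assumed to be a multiple of sizeof(unsigned long) */
        for (i = 0; i < bytes / sizeof(*p0); i++)
                p0[i] ^= p1[i] ^ p2[i];
}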
Diffstat (limited to 'arch/x86/include/asm/xor_avx.h')
-rw-r--r--   arch/x86/include/asm/xor_avx.h   214
1 file changed, 214 insertions, 0 deletions
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
new file mode 100644
index 000000000000..2510d35f480e
--- /dev/null
+++ b/arch/x86/include/asm/xor_avx.h
@@ -0,0 +1,214 @@
#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; version 2
 * of the License.
 */

#ifdef CONFIG_AS_AVX

#include <linux/compiler.h>
#include <asm/i387.h>

#define ALIGN32 __aligned(32)

#define YMM_SAVED_REGS 4

#define YMMS_SAVE \
do { \
        preempt_disable(); \
        cr0 = read_cr0(); \
        clts(); \
        asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \
        asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \
        asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \
        asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \
} while (0);

#define YMMS_RESTORE \
do { \
        asm volatile("sfence" : : : "memory"); \
        asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \
        asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \
        asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \
        asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \
        write_cr0(cr0); \
        preempt_enable(); \
} while (0);

#define BLOCK4(i) \
        BLOCK(32 * i, 0) \
        BLOCK(32 * (i + 1), 1) \
        BLOCK(32 * (i + 2), 2) \
        BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
        BLOCK4(0) \
        BLOCK4(4) \
        BLOCK4(8) \
        BLOCK4(12)

static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
{
        unsigned long cr0, lines = bytes >> 9;
        char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;

        YMMS_SAVE

        while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
        asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p0[i / sizeof(*p0)])); \
        asm volatile("vmovdqa %%ymm" #reg ", %0" : \
                "=m" (p0[i / sizeof(*p0)])); \
} while (0);

                BLOCK16()

                p0 = (unsigned long *)((uintptr_t)p0 + 512);
                p1 = (unsigned long *)((uintptr_t)p1 + 512);
        }

        YMMS_RESTORE
}

static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
        unsigned long *p2)
{
        unsigned long cr0, lines = bytes >> 9;
        char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;

        YMMS_SAVE

        while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
        asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p1[i / sizeof(*p1)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p0[i / sizeof(*p0)])); \
        asm volatile("vmovdqa %%ymm" #reg ", %0" : \
                "=m" (p0[i / sizeof(*p0)])); \
} while (0);

                BLOCK16()

                p0 = (unsigned long *)((uintptr_t)p0 + 512);
                p1 = (unsigned long *)((uintptr_t)p1 + 512);
                p2 = (unsigned long *)((uintptr_t)p2 + 512);
        }

        YMMS_RESTORE
}

static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
        unsigned long *p2, unsigned long *p3)
{
        unsigned long cr0, lines = bytes >> 9;
        char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;

        YMMS_SAVE

        while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
        asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p2[i / sizeof(*p2)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p1[i / sizeof(*p1)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p0[i / sizeof(*p0)])); \
        asm volatile("vmovdqa %%ymm" #reg ", %0" : \
                "=m" (p0[i / sizeof(*p0)])); \
} while (0);

                BLOCK16();

                p0 = (unsigned long *)((uintptr_t)p0 + 512);
                p1 = (unsigned long *)((uintptr_t)p1 + 512);
                p2 = (unsigned long *)((uintptr_t)p2 + 512);
                p3 = (unsigned long *)((uintptr_t)p3 + 512);
        }

        YMMS_RESTORE
}

static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
        unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
        unsigned long cr0, lines = bytes >> 9;
        char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;

        YMMS_SAVE

        while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
        asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p3[i / sizeof(*p3)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p2[i / sizeof(*p2)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p1[i / sizeof(*p1)])); \
        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
                "m" (p0[i / sizeof(*p0)])); \
        asm volatile("vmovdqa %%ymm" #reg ", %0" : \
                "=m" (p0[i / sizeof(*p0)])); \
} while (0);

                BLOCK16()

                p0 = (unsigned long *)((uintptr_t)p0 + 512);
                p1 = (unsigned long *)((uintptr_t)p1 + 512);
                p2 = (unsigned long *)((uintptr_t)p2 + 512);
                p3 = (unsigned long *)((uintptr_t)p3 + 512);
                p4 = (unsigned long *)((uintptr_t)p4 + 512);
        }

        YMMS_RESTORE
}

static struct xor_block_template xor_block_avx = {
        .name = "avx",
        .do_2 = xor_avx_2,
        .do_3 = xor_avx_3,
        .do_4 = xor_avx_4,
        .do_5 = xor_avx_5,
};

#define AVX_XOR_SPEED \
do { \
        if (cpu_has_avx) \
                xor_speed(&xor_block_avx); \
} while (0)

#define AVX_SELECT(FASTEST) \
        (cpu_has_avx ? &xor_block_avx : FASTEST)

#else

#define AVX_XOR_SPEED {}

#define AVX_SELECT(FASTEST) (FASTEST)

#endif
#endif
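
The xor_block_avx template and the AVX_XOR_SPEED / AVX_SELECT macros are consumed by the x86 xor template selection in arch/x86/include/asm/xor_32.h and xor_64.h, which lie outside this filtered diff. A rough sketch of that hook-up, under the assumption that it follows the XOR_TRY_TEMPLATES / XOR_SELECT_TEMPLATE convention used by crypto/xor.c, could look like:

/* Illustrative sketch only -- the real wiring lives in the arch xor headers
 * and benchmarks several other candidate templates as well. */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES                       \
do {                                            \
        AVX_XOR_SPEED;                          \
        /* ... xor_speed() on the other candidate templates ... */ \
} while (0)

/* Prefer the AVX template when the CPU supports it, otherwise keep the
 * benchmark winner. */
#define XOR_SELECT_TEMPLATE(FASTEST)            \
        AVX_SELECT(FASTEST)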