author     Jim Kukunas <james.t.kukunas@linux.intel.com>   2012-05-21 23:54:04 -0400
committer  NeilBrown <neilb@suse.de>                       2012-05-21 23:54:04 -0400
commit     ea4d26ae24e58fbd2c61de9242adab053cb982d8
tree       3115dd168f0cf1eb1eb5dd6aecc385cfa0e8bc05
parent     56a519913eeba2bdae4d7ee39e80fab442c3836c
raid5: add AVX optimized RAID5 checksumming
Optimize RAID5 xor checksumming by taking advantage of
256-bit YMM registers introduced in AVX.
Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
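
[Editor's note] The core idea of the patch is a plain bitwise XOR carried out 32 bytes at a time, so each vxorps operates on a full YMM register; vxorps is nominally a packed-single instruction, but its effect is a bitwise XOR, which is why baseline AVX (without AVX2 integer ops) is enough. Below is a minimal userspace sketch of that idea using AVX intrinsics. It is not the kernel code added by this patch (which must use inline asm, save/restore YMM state by hand, and disable preemption); the function name xor_buf_avx, the buffer-alignment assumption, and the length restriction are illustrative only.

/* Hypothetical userspace illustration; build with: gcc -mavx -c xor_sketch.c */
#include <immintrin.h>
#include <stddef.h>

/*
 * XOR 'src' into 'dst', 32 bytes (one YMM register) per step.
 * Assumes both buffers are 32-byte aligned and len is a multiple of 32,
 * mirroring the fixed-size blocks the kernel routines walk through.
 */
static void xor_buf_avx(void *dst, const void *src, size_t len)
{
	float *d = dst;
	const float *s = src;

	for (size_t i = 0; i < len / sizeof(float); i += 8) {
		__m256 a = _mm256_load_ps(d + i);
		__m256 b = _mm256_load_ps(s + i);
		/* _mm256_xor_ps compiles to vxorps: a bitwise XOR of two 256-bit values */
		_mm256_store_ps(d + i, _mm256_xor_ps(a, b));
	}
}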
-rw-r--r--  arch/x86/Makefile              |   5
-rw-r--r--  arch/x86/include/asm/xor_32.h  |   6
-rw-r--r--  arch/x86/include/asm/xor_64.h  |   8
-rw-r--r--  arch/x86/include/asm/xor_avx.h | 214
4 files changed, 229 insertions, 4 deletions
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 41a7237606a3..7a1cc9ee5c8a 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -115,9 +115,10 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI
 
 # does binutils support specific instructions?
 asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
+avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
 
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
 
 LDFLAGS := -m elf_$(UTS_MACHINE)
 
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index 133b40a0f495..454570891bdc 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -861,6 +861,9 @@ static struct xor_block_template xor_block_pIII_sse = {
 	.do_5 = xor_sse_5,
 };
 
+/* Also try the AVX routines */
+#include "xor_avx.h"
+
 /* Also try the generic routines. */
 #include <asm-generic/xor.h>
 
@@ -871,6 +874,7 @@ do { \
 	xor_speed(&xor_block_8regs_p); \
 	xor_speed(&xor_block_32regs); \
 	xor_speed(&xor_block_32regs_p); \
+	AVX_XOR_SPEED; \
 	if (cpu_has_xmm) \
 		xor_speed(&xor_block_pIII_sse); \
 	if (cpu_has_mmx) { \
@@ -883,6 +887,6 @@ do { \
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched. */
 #define XOR_SELECT_TEMPLATE(FASTEST) \
-	(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
+	AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
 
 #endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 1549b5e261f6..b9b2323e90fe 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -347,15 +347,21 @@ static struct xor_block_template xor_block_sse = {
 	.do_5 = xor_sse_5,
 };
 
+
+/* Also try the AVX routines */
+#include "xor_avx.h"
+
 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES \
 do { \
+	AVX_XOR_SPEED; \
 	xor_speed(&xor_block_sse); \
 } while (0)
 
 /* We force the use of the SSE xor block because it can write around L2.
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched. */
-#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+	AVX_SELECT(&xor_block_sse)
 
 #endif /* _ASM_X86_XOR_64_H */
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
new file mode 100644
index 000000000000..2510d35f480e
--- /dev/null
+++ b/arch/x86/include/asm/xor_avx.h
@@ -0,0 +1,214 @@
+#ifndef _ASM_X86_XOR_AVX_H
+#define _ASM_X86_XOR_AVX_H
+
+/*
+ * Optimized RAID-5 checksumming functions for AVX
+ *
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#ifdef CONFIG_AS_AVX
+
+#include <linux/compiler.h>
+#include <asm/i387.h>
+
+#define ALIGN32 __aligned(32)
+
+#define YMM_SAVED_REGS 4
+
+#define YMMS_SAVE \
+do { \
+	preempt_disable(); \
+	cr0 = read_cr0(); \
+	clts(); \
+	asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \
+	asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \
+	asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \
+	asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \
+} while (0);
+
+#define YMMS_RESTORE \
+do { \
+	asm volatile("sfence" : : : "memory"); \
+	asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \
+	asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \
+	asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \
+	asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \
+	write_cr0(cr0); \
+	preempt_enable(); \
+} while (0);
+
+#define BLOCK4(i) \
+	BLOCK(32 * i, 0) \
+	BLOCK(32 * (i + 1), 1) \
+	BLOCK(32 * (i + 2), 2) \
+	BLOCK(32 * (i + 3), 3)
+
+#define BLOCK16() \
+	BLOCK4(0) \
+	BLOCK4(4) \
+	BLOCK4(8) \
+	BLOCK4(12)
+
+static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2, unsigned long *p3)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16();
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+		p3 = (unsigned long *)((uintptr_t)p3 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2, unsigned long *p3, unsigned long *p4)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p3[i / sizeof(*p3)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+		p3 = (unsigned long *)((uintptr_t)p3 + 512);
+		p4 = (unsigned long *)((uintptr_t)p4 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static struct xor_block_template xor_block_avx = {
+	.name = "avx",
+	.do_2 = xor_avx_2,
+	.do_3 = xor_avx_3,
+	.do_4 = xor_avx_4,
+	.do_5 = xor_avx_5,
+};
+
+#define AVX_XOR_SPEED \
+do { \
+	if (cpu_has_avx) \
+		xor_speed(&xor_block_avx); \
+} while (0)
+
+#define AVX_SELECT(FASTEST) \
+	(cpu_has_avx ? &xor_block_avx : FASTEST)
+
+#else
+
+#define AVX_XOR_SPEED {}
+
+#define AVX_SELECT(FASTEST) (FASTEST)
+
+#endif
+#endif