author     Linus Torvalds <torvalds@linux-foundation.org>   2012-05-23 20:08:40 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2012-05-23 20:08:40 -0400
commit     c80ddb526331a72c9e9d1480f85f6fd7c74e3d2d (patch)
tree       0212803a009f171990032abb94fad84156baa153 /arch
parent     2c13bc0f8f0d3e13b42be70bf74fec8e56b58324 (diff)
parent     1dff2b87a34a1ac1d1898ea109bf97ed396aca53 (diff)
Merge tag 'md-3.5' of git://neil.brown.name/md
Pull md updates from NeilBrown:
 "It's been a busy cycle for md - lots of fun stuff here.. if you like
  this kind of thing :-)

  Main features:
   - RAID10 arrays can be reshaped - adding and removing devices and
     changing chunks (not 'far' array though)
   - allow RAID5 arrays to be reshaped with a backup file (not tested
     yet, but the principle works fine for RAID10).
   - arrays can be reshaped while a bitmap is present - you no longer
     need to remove it first
   - SSSE3 support for RAID6 syndrome calculations

  and of course a number of minor fixes etc."

* tag 'md-3.5' of git://neil.brown.name/md: (56 commits)
  md/bitmap: record the space available for the bitmap in the superblock.
  md/raid10: Remove extras after reshape to smaller number of devices.
  md/raid5: improve removal of extra devices after reshape.
  md: check the return of mddev_find()
  MD RAID1: Further conditionalize 'fullsync'
  DM RAID: Use md_error() in place of simply setting Faulty bit
  DM RAID: Record and handle missing devices
  DM RAID: Set recovery flags on resume
  md/raid5: Allow reshape while a bitmap is present.
  md/raid10: resize bitmap when required during reshape.
  md: allow array to be resized while bitmap is present.
  md/bitmap: make sure reshape request are reflected in superblock.
  md/bitmap: add bitmap_resize function to allow bitmap resizing.
  md/bitmap: use DIV_ROUND_UP instead of open-code
  md/bitmap: create a 'struct bitmap_counts' substructure of 'struct bitmap'
  md/bitmap: make bitmap bitops atomic.
  md/bitmap: make _page_attr bitops atomic.
  md/bitmap: merge bitmap_file_unmap and bitmap_file_put.
  md/bitmap: remove async freeing of bitmap file.
  md/bitmap: convert some spin_lock_irqsave to spin_lock_irq
  ...
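One of the bitmap cleanups in the shortlog above ("md/bitmap: use DIV_ROUND_UP instead of open-code") replaces hand-written rounding-up division with the kernel's DIV_ROUND_UP helper. As a reminder of what that idiom does, here is a minimal user-space sketch; the macro definition matches the kernel's, but the bitmap sizes used are invented for illustration:

#include <stdio.h>

/* Same definition as the kernel helper in <linux/kernel.h>. */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	/* e.g. how many 4096-byte pages are needed to hold 10000 bitmap bits */
	unsigned long bits = 10000, bits_per_page = 4096 * 8;

	printf("%lu page(s)\n", DIV_ROUND_UP(bits, bits_per_page));
	return 0;
}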
Diffstat (limited to 'arch')
-rw-r--r--   arch/x86/Makefile                  5
-rw-r--r--   arch/x86/include/asm/xor_32.h      6
-rw-r--r--   arch/x86/include/asm/xor_64.h      8
-rw-r--r--   arch/x86/include/asm/xor_avx.h   214
4 files changed, 229 insertions(+), 4 deletions(-)
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index dc611a40a336..1f2521434554 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -115,9 +115,10 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI
 
 # does binutils support specific instructions?
 asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
+avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
 
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
 
 LDFLAGS := -m elf_$(UTS_MACHINE)
 
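The avx_instr probe only adds -DCONFIG_AS_AVX=1 when the assembler accepts an AVX instruction; the new xor_avx.h below compiles its AVX paths only under that define. A rough user-space analogue of what is being tested (a sketch, not part of the patch): a translation unit that builds only if the toolchain's assembler understands AVX encodings.

/* Builds (e.g. `gcc -c avx_probe.c`) only when the assembler knows the
 * vxorps encoding; this mirrors the as-instr check above, which defines
 * CONFIG_AS_AVX=1 on success and leaves it undefined otherwise. */
void avx_probe(void)
{
	asm volatile("vxorps %ymm0, %ymm1, %ymm2");
}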
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index 133b40a0f495..454570891bdc 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -861,6 +861,9 @@ static struct xor_block_template xor_block_pIII_sse = {
 	.do_5 = xor_sse_5,
 };
 
+/* Also try the AVX routines */
+#include "xor_avx.h"
+
 /* Also try the generic routines. */
 #include <asm-generic/xor.h>
 
@@ -871,6 +874,7 @@ do { \
 	xor_speed(&xor_block_8regs_p); \
 	xor_speed(&xor_block_32regs); \
 	xor_speed(&xor_block_32regs_p); \
+	AVX_XOR_SPEED; \
 	if (cpu_has_xmm) \
 		xor_speed(&xor_block_pIII_sse); \
 	if (cpu_has_mmx) { \
@@ -883,6 +887,6 @@ do { \
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched. */
 #define XOR_SELECT_TEMPLATE(FASTEST) \
-	(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
+	AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
 
 #endif /* _ASM_X86_XOR_32_H */
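For clarity, the new 32-bit selector composes as follows once xor_avx.h is included: AVX_SELECT wraps the old expression, so an AVX-capable CPU picks the avx template, an SSE-only CPU keeps pIII_sse, and anything else keeps the benchmarked FASTEST. A small user-space sketch with stubbed-out feature flags and template structs (the stubs are invented for illustration; only the two macros follow the patch):

#include <stdio.h>

struct xor_block_template { const char *name; };

static struct xor_block_template xor_block_avx      = { "avx" };
static struct xor_block_template xor_block_pIII_sse = { "pIII_sse" };
static struct xor_block_template xor_block_8regs    = { "8regs" };

/* The two macros as they compose in xor_32.h after this patch;
 * cpu_has_avx/cpu_has_xmm are plain ints here instead of cpufeature tests. */
#define AVX_SELECT(FASTEST) (cpu_has_avx ? &xor_block_avx : (FASTEST))
#define XOR_SELECT_TEMPLATE(FASTEST) \
	AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : (FASTEST))

int main(void)
{
	int cpu_has_avx = 1, cpu_has_xmm = 1;	/* pretend CPU flags */

	printf("selected: %s\n", XOR_SELECT_TEMPLATE(&xor_block_8regs)->name);
	return 0;
}

With both flags set this prints "selected: avx"; clearing cpu_has_avx falls back to pIII_sse, and clearing both leaves the generic 8regs template.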
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 1549b5e261f6..b9b2323e90fe 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -347,15 +347,21 @@ static struct xor_block_template xor_block_sse = {
 	.do_5 = xor_sse_5,
 };
 
+
+/* Also try the AVX routines */
+#include "xor_avx.h"
+
 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES \
 do { \
+	AVX_XOR_SPEED; \
 	xor_speed(&xor_block_sse); \
 } while (0)
 
 /* We force the use of the SSE xor block because it can write around L2.
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched. */
-#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+	AVX_SELECT(&xor_block_sse)
 
 #endif /* _ASM_X86_XOR_64_H */
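Both AVX_XOR_SPEED and AVX_SELECT additionally gate on cpu_has_avx at runtime, on top of the build-time CONFIG_AS_AVX check. As a user-space analogue only (the kernel uses its own cpufeature flags, not this builtin), the same runtime decision can be sketched with the compiler's CPU-feature test:

#include <stdio.h>

int main(void)
{
	/* GCC/Clang builtin probing CPUID at runtime; stands in for the
	 * kernel's cpu_has_avx check that decides whether the avx xor
	 * template is benchmarked and selectable at all. */
	if (__builtin_cpu_supports("avx"))
		printf("AVX present: avx template would compete in xor_speed()\n");
	else
		printf("no AVX: SSE/MMX/generic templates only\n");
	return 0;
}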
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
new file mode 100644
index 000000000000..2510d35f480e
--- /dev/null
+++ b/arch/x86/include/asm/xor_avx.h
@@ -0,0 +1,214 @@
+#ifndef _ASM_X86_XOR_AVX_H
+#define _ASM_X86_XOR_AVX_H
+
+/*
+ * Optimized RAID-5 checksumming functions for AVX
+ *
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#ifdef CONFIG_AS_AVX
+
+#include <linux/compiler.h>
+#include <asm/i387.h>
+
+#define ALIGN32 __aligned(32)
+
+#define YMM_SAVED_REGS 4
+
+#define YMMS_SAVE \
+do { \
+	preempt_disable(); \
+	cr0 = read_cr0(); \
+	clts(); \
+	asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \
+	asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \
+	asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \
+	asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \
+} while (0);
+
+#define YMMS_RESTORE \
+do { \
+	asm volatile("sfence" : : : "memory"); \
+	asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \
+	asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \
+	asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \
+	asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \
+	write_cr0(cr0); \
+	preempt_enable(); \
+} while (0);
+
+#define BLOCK4(i) \
+		BLOCK(32 * i, 0) \
+		BLOCK(32 * (i + 1), 1) \
+		BLOCK(32 * (i + 2), 2) \
+		BLOCK(32 * (i + 3), 3)
+
+#define BLOCK16() \
+		BLOCK4(0) \
+		BLOCK4(4) \
+		BLOCK4(8) \
+		BLOCK4(12)
+
+static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2, unsigned long *p3)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16();
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+		p3 = (unsigned long *)((uintptr_t)p3 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2, unsigned long *p3, unsigned long *p4)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p3[i / sizeof(*p3)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+		p3 = (unsigned long *)((uintptr_t)p3 + 512);
+		p4 = (unsigned long *)((uintptr_t)p4 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static struct xor_block_template xor_block_avx = {
+	.name = "avx",
+	.do_2 = xor_avx_2,
+	.do_3 = xor_avx_3,
+	.do_4 = xor_avx_4,
+	.do_5 = xor_avx_5,
+};
+
+#define AVX_XOR_SPEED \
+do { \
+	if (cpu_has_avx) \
+		xor_speed(&xor_block_avx); \
+} while (0)
+
+#define AVX_SELECT(FASTEST) \
+	(cpu_has_avx ? &xor_block_avx : FASTEST)
+
+#else
+
+#define AVX_XOR_SPEED {}
+
+#define AVX_SELECT(FASTEST) (FASTEST)
+
+#endif
+#endif
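For reference, what xor_avx_2 computes once the AVX and FPU-save machinery is stripped away: p0 ^= p1 over `bytes` bytes, walked in the same 512-byte lines (bytes >> 9) that the BLOCK16 unrolling covers as sixteen 32-byte ymm blocks. A plain-C sketch, for illustration only and not part of the patch:

#include <stddef.h>

/* Scalar equivalent of xor_avx_2: XOR p1 into p0, 512 bytes per
 * iteration, matching the lines = bytes >> 9 blocking above.  The AVX
 * version additionally requires 32-byte-aligned buffers (vmovdqa) and
 * saves/restores ymm0-ymm3 around the loop via YMMS_SAVE/YMMS_RESTORE. */
void xor_ref_2(unsigned long bytes, unsigned long *p0,
	       const unsigned long *p1)
{
	unsigned long lines = bytes >> 9;
	size_t words = 512 / sizeof(*p0);
	size_t i;

	while (lines--) {
		for (i = 0; i < words; i++)
			p0[i] ^= p1[i];
		p0 += words;
		p1 += words;
	}
}

The 3-, 4- and 5-source variants follow the same pattern, folding one extra source buffer per additional vxorps in each BLOCK.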