RAID/s390: add SIMD implementation for raid6 gen/xor

Using vector registers is slightly faster:

    raid6: vx128x8 gen() 19705 MB/s
    raid6: vx128x8 xor() 11886 MB/s
    raid6: using algorithm vx128x8 gen() 19705 MB/s
    raid6: .... xor() 11886 MB/s, rmw enabled

vs the software algorithms:

    raid6: int64x1 gen() 3018 MB/s
    raid6: int64x1 xor() 1429 MB/s
    raid6: int64x2 gen() 4661 MB/s
    raid6: int64x2 xor() 3143 MB/s
    raid6: int64x4 gen() 5392 MB/s
    raid6: int64x4 xor() 3509 MB/s
    raid6: int64x8 gen() 4441 MB/s
    raid6: int64x8 xor() 3207 MB/s
    raid6: using algorithm int64x4 gen() 5392 MB/s
    raid6: .... xor() 3509 MB/s, rmw enabled

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
author: Martin Schwidefsky <schwidefsky@de.ibm.com> 2016-08-23 07:30:24 -0400
committer: Martin Schwidefsky <schwidefsky@de.ibm.com> 2016-08-29 05:05:04 -0400
commit: 474fd6e80fe529e9adeeb7ea9d4e5d6c4da0b7fe (patch)
tree: a5294d90e6e7205f26c4271cee49199be2a8addb
parent: 8f149ea6e91534c3e0b4cfcd843323bd94273087 (diff)
6 files changed, 265 insertions, 0 deletions
diff --git a/arch/s390/include/asm/vx-insn.h b/arch/s390/include/asm/vx-insn.h
index b61846dff70f..49c24a2afce0 100644
--- a/arch/s390/include/asm/vx-insn.h
+++ b/arch/s390/include/asm/vx-insn.h
@@ -278,6 +278,15 @@
        VLVG    \v, \gr, \index, 3
 .endm
+/*
+ * VECTOR LOAD REGISTER
+ *
+ * Operands \v1 and \v2 are vector registers.  Going by its use in
+ * COPY_VEC() in lib/raid6/s390vx.uc, \v1 is the target and \v2 the
+ * source.
+ *
+ * The instruction is emitted as raw data: the low four bits of each
+ * register number are packed into the first halfword, the second
+ * halfword is zero, and MRXBOPC (defined elsewhere in this header) is
+ * given the full register numbers together with opcode 0x56.
+ */
+.macro  VLR     v1, v2
+        VX_NUM  v1, \v1
+        VX_NUM  v2, \v2
+        .word   0xE700 | ((v1&15) << 4) | (v2&15)
+        .word   0
+        MRXBOPC 0, 0x56, v1, v2
+.endm
 /* VECTOR LOAD */
 .macro  VL      v, disp, index="%r0", base
        VX_NUM  v1, \v
@@ -404,6 +413,16 @@
 /* Vector integer instructions */
+/*
+ * VECTOR AND
+ *
+ * All three operands are vector registers (each is run through
+ * VX_NUM).  Same layout as the VX macro below: \vr1 and \vr2 share the
+ * first halfword, \vr3 sits in the top four bits of the second, and
+ * MRXBOPC is given the full register numbers with opcode 0x68.
+ */
+.macro  VN      vr1, vr2, vr3
+        VX_NUM  v1, \vr1
+        VX_NUM  v2, \vr2
+        VX_NUM  v3, \vr3
+        .word   0xE700 | ((v1&15) << 4) | (v2&15)
+        .word   ((v3&15) << 12)
+        MRXBOPC 0, 0x68, v1, v2, v3
+.endm
 /* VECTOR EXCLUSIVE OR */
 .macro  VX      vr1, vr2, vr3
        VX_NUM  v1, \vr1
@@ -469,6 +488,73 @@
        MRXBOPC 0, 0x7D, v1, v2, v3
 .endm
+/*
+ * VECTOR REPLICATE IMMEDIATE
+ *
+ * \vr1 is the only vector register operand.  \imm2 is emitted verbatim
+ * as the second halfword and \m3 is handed to MRXBOPC as the mask
+ * field.  The suffixed wrappers fix \m3: VREPIB = 0, VREPIH = 1,
+ * VREPIF = 2 (byte, halfword and word elements going by the mnemonic
+ * suffixes).
+ */
+.macro  VREPI   vr1, imm2, m3
+        VX_NUM  v1, \vr1
+        .word   0xE700 | ((v1&15) << 4)
+        .word   \imm2
+        MRXBOPC \m3, 0x45, v1
+.endm
+.macro  VREPIB  vr1, imm2
+        VREPI   \vr1, \imm2, 0
+.endm
+.macro  VREPIH  vr1, imm2
+        VREPI   \vr1, \imm2, 1
+.endm
+.macro  VREPIF  vr1, imm2
+        VREPI   \vr1, \imm2, 2
+.endm
+/*
+ * VREPIG: VECTOR REPLICATE IMMEDIATE with m3 = 3, continuing the
+ * VREPIB/VREPIH/VREPIF = 0/1/2 series.  It has to expand to VREPI;
+ * VREP (no trailing I) is not the immediate form and does not take
+ * this operand list.
+ */
+.macro  VREPIG  vr1, imm2
+        VREPI   \vr1, \imm2, 3
+.endm
+/*
+ * VECTOR ADD
+ *
+ * \vr1, \vr2 and \vr3 are vector registers (each is run through
+ * VX_NUM); \m4 is handed to MRXBOPC as the mask field.  The suffixed
+ * wrappers below fix \m4 to 0, 1, 2, 3 and 4 for VAB, VAH, VAF, VAG
+ * and VAQ respectively.
+ */
+.macro  VA      vr1, vr2, vr3, m4
+        VX_NUM  v1, \vr1
+        VX_NUM  v2, \vr2
+        VX_NUM  v3, \vr3
+        .word   0xE700 | ((v1&15) << 4) | (v2&15)
+        .word   ((v3&15) << 12)
+        MRXBOPC \m4, 0xF3, v1, v2, v3
+.endm
+.macro  VAB     vr1, vr2, vr3
+        VA      \vr1, \vr2, \vr3, 0
+.endm
+.macro  VAH     vr1, vr2, vr3
+        VA      \vr1, \vr2, \vr3, 1
+.endm
+.macro  VAF     vr1, vr2, vr3
+        VA      \vr1, \vr2, \vr3, 2
+.endm
+.macro  VAG     vr1, vr2, vr3
+        VA      \vr1, \vr2, \vr3, 3
+.endm
+.macro  VAQ     vr1, vr2, vr3
+        VA      \vr1, \vr2, \vr3, 4
+.endm
+/*
+ * VECTOR ELEMENT SHIFT RIGHT ARITHMETIC
+ *
+ * Note that \vr3 is a vector register, not an immediate: it is run
+ * through VX_NUM like \vr1 and \vr2, so the shift counts come from a
+ * register.  \m4 is handed to MRXBOPC as the mask field; the suffixed
+ * wrappers below fix it to 0, 1, 2 and 3 for VESRAVB, VESRAVH,
+ * VESRAVF and VESRAVG respectively.
+ */
+.macro  VESRAV  vr1, vr2, vr3, m4
+        VX_NUM  v1, \vr1
+        VX_NUM  v2, \vr2
+        VX_NUM  v3, \vr3
+        .word   0xE700 | ((v1&15) << 4) | (v2&15)
+        .word   ((v3&15) << 12)
+        MRXBOPC \m4, 0x7A, v1, v2, v3
+.endm
+.macro  VESRAVB vr1, vr2, vr3
+        VESRAV  \vr1, \vr2, \vr3, 0
+.endm
+.macro  VESRAVH vr1, vr2, vr3
+        VESRAV  \vr1, \vr2, \vr3, 1
+.endm
+.macro  VESRAVF vr1, vr2, vr3
+        VESRAV  \vr1, \vr2, \vr3, 2
+.endm
+.macro  VESRAVG vr1, vr2, vr3
+        VESRAV  \vr1, \vr2, \vr3, 3
+.endm
 #endif  /* __ASSEMBLY__ */
 #endif  /* __ASM_S390_VX_INSN_H */
diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index a0118d5929a9..c032a6a408a6 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -103,6 +103,7 @@ extern const struct raid6_calls raid6_avx2x1;
 extern const struct raid6_calls raid6_avx2x2;
 extern const struct raid6_calls raid6_avx2x4;
 extern const struct raid6_calls raid6_tilegx8;
+extern const struct raid6_calls raid6_s390vx8;
 struct raid6_recov_calls {
        void (*data2)(int, size_t, int, int, void **);
diff --git a/lib/raid6/.gitignore b/lib/raid6/.gitignore
index 0a7e494b2bcd..f01b1cb04f91 100644
--- a/lib/raid6/.gitignore
+++ b/lib/raid6/.gitignore
@@ -3,3 +3,4 @@ altivec*.c
 int*.c
 tables.c
 neon?.c
+s390vx?.c
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 3b10a48fa040..667b9607f8c0 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -7,6 +7,7 @@ raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o
 raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
 raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
 raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
+raid6_pq-$(CONFIG_S390) += s390vx8.o
 hostprogs-y     += mktables
@@ -116,6 +117,11 @@ $(obj)/tilegx8.c:   UNROLL := 8
 $(obj)/tilegx8.c:   $(src)/tilegx.uc $(src)/unroll.awk FORCE
        $(call if_changed,unroll)
+targets += s390vx8.c
+$(obj)/s390vx8.c:   UNROLL := 8
+$(obj)/s390vx8.c:   $(src)/s390vx.uc $(src)/unroll.awk FORCE
+        $(call if_changed,unroll)
 quiet_cmd_mktable = TABLE   $@
      cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 )
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 975c6e0434bd..e1923b602bbc 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -69,6 +69,9 @@ const struct raid6_calls * const raid6_algos[] = {
 #if defined(CONFIG_TILEGX)
        &raid6_tilegx8,
 #endif
+#if defined(CONFIG_S390)
+        &raid6_s390vx8,
+#endif
        &raid6_intx1,
        &raid6_intx2,
        &raid6_intx4,
diff --git a/lib/raid6/s390vx.uc b/lib/raid6/s390vx.uc
new file mode 100644
index 000000000000..7b45191a655f
--- /dev/null
+++ b/lib/raid6/s390vx.uc
@@ -0,0 +1,168 @@
+/*
+ * raid6_vx$#.c
+ *
+ * $#-way unrolled RAID6 gen/xor functions for s390
+ * based on the vector facility
+ *
+ * Copyright IBM Corp. 2016
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ *
+ * This file is postprocessed using unroll.awk.
+ */
+#include <linux/raid/pq.h>
+#include <asm/fpu/api.h>
+asm(".include \"asm/vx-insn.h\"\n");
+#define NSIZE 16
+/*
+ * Preload the two constant vectors used by the syndrome loops:
+ *   v24 = 7    replicated per byte, the shift-count register MASK() names
+ *   v25 = 0x1d replicated per byte, the register the AND(..., 25) calls name
+ * Both registers are named literally here, so the helpers and loops
+ * below must keep referring to them as 24 and 25.
+ */
+static inline void LOAD_CONST(void)
+{
+        asm volatile("VREPIB %v24,7");
+        asm volatile("VREPIB %v25,0x1d");
+}
+/*
+ * The SHLBYTE() operation shifts each of the 16 bytes in
+ * vector register y left by 1 bit and stores the result in
+ * vector register x.
+ *
+ * It is implemented as a byte-wise add of y to itself (VAB x,y,y).
+ * x and y are vector register numbers passed through "i" constraints,
+ * so they must be compile-time constants at every call site.
+ */
+static inline void SHLBYTE(int x, int y)
+{
+        asm volatile ("VAB %0,%1,%1" : : "i" (x), "i" (y));
+}
+/*
+ * For each of the 16 bytes in the vector register y the MASK()
+ * operation returns 0xFF if the high bit of the byte is 1,
+ * or 0x00 if the high bit is 0. The result is stored in vector
+ * register x.
+ *
+ * The literal 24 in the template is vector register v24, not an
+ * immediate: the third VESRAVB operand is a register, and LOAD_CONST()
+ * fills v24 with 7.  An arithmetic right shift by 7 copies each byte's
+ * top bit into all eight bit positions.  LOAD_CONST() must therefore
+ * have run before MASK() is used.
+ */
+static inline void MASK(int x, int y)
+{
+        asm volatile ("VESRAVB  %0,%1,24" : : "i" (x), "i" (y));
+}
+/*
+ * Issue VN (VECTOR AND) on vector registers x, y and z; the callers
+ * below use x as the destination and y, z as the sources.  All three
+ * are register numbers passed as "i" operands and so must be
+ * compile-time constants.
+ */
+static inline void AND(int x, int y, int z)
+{
+        asm volatile ("VN %0,%1,%2" : : "i" (x), "i" (y), "i" (z));
+}
+/*
+ * Issue VX (VECTOR EXCLUSIVE OR) on vector registers x, y and z; the
+ * callers below use x as the destination and y, z as the sources.  All
+ * three are register numbers passed as "i" operands and so must be
+ * compile-time constants.
+ */
+static inline void XOR(int x, int y, int z)
+{
+        asm volatile ("VX %0,%1,%2" : : "i" (x), "i" (y), "i" (z));
+}
+/*
+ * Load the n consecutive vector registers x .. x + n - 1 from the
+ * 16*n bytes at ptr with a single VLM.
+ *
+ * ptr is pinned to general register 1, and that register is named
+ * literally as the VLM base, exactly as STORE_DATA() does for VSTM.
+ * It is written as a bare 1 rather than %r1: inside an extended asm
+ * template "%r1" is parsed as operand 1 with an 'r' modifier, not as a
+ * register name.
+ *
+ * The "m" operand, typed as a 16*n byte struct, tells the compiler
+ * which memory the asm reads; the "a" operand keeps the pointer live
+ * in the register.  x and n must be compile-time constants because
+ * they feed "i" constraints.
+ */
+static inline void LOAD_DATA(int x, int n, u8 *ptr)
+{
+        typedef struct { u8 _[16*n]; } addrtype;
+        register addrtype *__ptr asm("1") = (addrtype *) ptr;
+        asm volatile ("VLM %2,%3,0,1"
+                      : : "m" (*__ptr), "a" (__ptr), "i" (x), "i" (x + n - 1));
+}
+/*
+ * Store the n consecutive vector registers x .. x + n - 1 to the
+ * 16*n bytes at ptr with a single VSTM.
+ *
+ * ptr is pinned to general register 1, which the template names
+ * literally as the base register.  The "=m" output, typed as a 16*n
+ * byte struct, tells the compiler which memory the asm writes.  x and
+ * n must be compile-time constants because they feed "i" constraints.
+ */
+static inline void STORE_DATA(int x, int n, u8 *ptr)
+{
+        typedef struct { u8 _[16*n]; } addrtype;
+        register addrtype *__ptr asm("1") = (addrtype *) ptr;
+        asm volatile ("VSTM %2,%3,0,1"
+                      : "=m" (*__ptr) : "a" (__ptr), "i" (x), "i" (x + n - 1));
+}
+/*
+ * Issue VLR (VECTOR LOAD REGISTER) with x as first and y as second
+ * operand; the callers below use it to copy register y into register
+ * x.  Both are register numbers passed as "i" operands and so must be
+ * compile-time constants.
+ */
+static inline void COPY_VEC(int x, int y)
+{
+        asm volatile ("VLR %0,%1" : : "i" (x), "i" (y));
+}
+/*
+ * Compute both syndromes for a full stripe.
+ *
+ * ptrs[0 .. disks-3] are read as data blocks, ptrs[disks-2] is written
+ * with the XOR parity (P) and ptrs[disks-1] with the RS syndrome (Q).
+ * bytes is the length of each block.
+ *
+ * Fixed vector register layout, one register per unrolled lane:
+ *   v0  and up   running P
+ *   v8  and up   running Q
+ *   v16 and up   scratch: high-bit mask, then the next data block
+ *   v24, v25     constants from LOAD_CONST()
+ * Because the bases are hard-coded 8 apart, this only works for an
+ * unroll factor of at most 8.
+ */
+static void raid6_s390vx$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+        struct kernel_fpu vxstate;
+        u8 **dptr, *p, *q;
+        int d, z, z0;
+        /* Vector registers are only touched between begin and end. */
+        kernel_fpu_begin(&vxstate, KERNEL_VXR);
+        LOAD_CONST();
+        dptr = (u8 **) ptrs;
+        z0 = disks - 3;         /* Highest data disk */
+        p = dptr[z0 + 1];       /* XOR parity */
+        q = dptr[z0 + 2];       /* RS syndrome */
+        /*
+         * Each pass handles one chunk of unroll-factor * NSIZE bytes.
+         * There is no tail handling, so bytes is presumably always a
+         * multiple of that chunk size - confirm against the callers.
+         * NOTE(review): d is int while bytes is size_t.
+         */
+        for (d = 0; d < bytes; d += $#*NSIZE) {
+                /* P and Q both start out as the highest data disk. */
+                LOAD_DATA(0,$#,&dptr[z0][d]);
+                COPY_VEC(8+$$,0+$$);
+                for (z = z0 - 1; z >= 0; z--) {
+                        /*
+                         * Q = (Q << 1) ^ (0x1d where Q's old high bit
+                         * was set), per byte; then fold disk z into
+                         * both P and Q.  The mask must be taken before
+                         * the shift, and the scratch registers are
+                         * reused for the data load afterwards.
+                         */
+                        MASK(16+$$,8+$$);
+                        AND(16+$$,16+$$,25);
+                        SHLBYTE(8+$$,8+$$);
+                        XOR(8+$$,8+$$,16+$$);
+                        LOAD_DATA(16,$#,&dptr[z][d]);
+                        XOR(0+$$,0+$$,16+$$);
+                        XOR(8+$$,8+$$,16+$$);
+                }
+                STORE_DATA(0,$#,&p[d]);
+                STORE_DATA(8,$#,&q[d]);
+        }
+        kernel_fpu_end(&vxstate, KERNEL_VXR);
+}
+/*
+ * Update existing syndromes in place for the data disks start..stop.
+ *
+ * The blocks ptrs[start .. stop] are read as data; ptrs[disks-2] (P)
+ * and ptrs[disks-1] (Q) are read, XORed with the newly computed
+ * contribution and written back.  bytes is the length of each block.
+ *
+ * Vector register layout is the same as in the gen_syndrome function:
+ * v0 and up hold the P contribution, v8 and up the Q contribution,
+ * v16 and up are scratch, and v24/v25 are the LOAD_CONST() constants.
+ */
+static void raid6_s390vx$#_xor_syndrome(int disks, int start, int stop,
+                                        size_t bytes, void **ptrs)
+{
+        struct kernel_fpu vxstate;
+        u8 **dptr, *p, *q;
+        int d, z, z0;
+        dptr = (u8 **) ptrs;
+        z0 = stop;              /* P/Q right side optimization */
+        p = dptr[disks - 2];    /* XOR parity */
+        q = dptr[disks - 1];    /* RS syndrome */
+        /* Vector registers are only touched between begin and end. */
+        kernel_fpu_begin(&vxstate, KERNEL_VXR);
+        LOAD_CONST();
+        /*
+         * Each pass handles one chunk of unroll-factor * NSIZE bytes.
+         * There is no tail handling, so bytes is presumably always a
+         * multiple of that chunk size - confirm against the callers.
+         */
+        for (d = 0; d < bytes; d += $#*NSIZE) {
+                /* P/Q data pages */
+                LOAD_DATA(0,$#,&dptr[z0][d]);
+                COPY_VEC(8+$$,0+$$);
+                for (z = z0 - 1; z >= start; z--) {
+                        /*
+                         * Same step as in gen_syndrome: shift Q left by
+                         * one bit per byte with the 0x1d correction,
+                         * then fold disk z into both P and Q.
+                         */
+                        MASK(16+$$,8+$$);
+                        AND(16+$$,16+$$,25);
+                        SHLBYTE(8+$$,8+$$);
+                        XOR(8+$$,8+$$,16+$$);
+                        LOAD_DATA(16,$#,&dptr[z][d]);
+                        XOR(0+$$,0+$$,16+$$);
+                        XOR(8+$$,8+$$,16+$$);
+                }
+                /* P/Q left side optimization */
+                /*
+                 * Disks below start contribute no data: only the Q
+                 * shift step is applied, once per skipped disk.
+                 */
+                for (z = start - 1; z >= 0; z--) {
+                        MASK(16+$$,8+$$);
+                        AND(16+$$,16+$$,25);
+                        SHLBYTE(8+$$,8+$$);
+                        XOR(8+$$,8+$$,16+$$);
+                }
+                /* Read-modify-write P, then Q, via the scratch registers. */
+                LOAD_DATA(16,$#,&p[d]);
+                XOR(16+$$,16+$$,0+$$);
+                STORE_DATA(16,$#,&p[d]);
+                LOAD_DATA(16,$#,&q[d]);
+                XOR(16+$$,16+$$,8+$$);
+                STORE_DATA(16,$#,&q[d]);
+        }
+        kernel_fpu_end(&vxstate, KERNEL_VXR);
+}
+/*
+ * Availability check stored in the raid6_calls descriptor below:
+ * returns the value of MACHINE_HAS_VX (defined outside this file),
+ * i.e. non-zero when the machine reports the vector facility.
+ */
+static int raid6_s390vx$#_valid(void)
+{
+        return MACHINE_HAS_VX;
+}
+/*
+ * Algorithm descriptor, referenced from raid6_algos[] in
+ * lib/raid6/algos.c.  The initializer is positional, so the entries
+ * must stay in the member order of struct raid6_calls.
+ */
+const struct raid6_calls raid6_s390vx$# = {
+        raid6_s390vx$#_gen_syndrome,
+        raid6_s390vx$#_xor_syndrome,
+        raid6_s390vx$#_valid,
+        "vx128x$#",     /* name printed in the "raid6: ..." benchmark lines */
+        1               /* NOTE(review): presumably the "prefer" flag - confirm against struct raid6_calls */
+};
author	Martin Schwidefsky <schwidefsky@de.ibm.com>	2016-08-23 07:30:24 -0400
committer	Martin Schwidefsky <schwidefsky@de.ibm.com>	2016-08-29 05:05:04 -0400
commit	474fd6e80fe529e9adeeb7ea9d4e5d6c4da0b7fe (patch)
tree	a5294d90e6e7205f26c4271cee49199be2a8addb
parent	8f149ea6e91534c3e0b4cfcd843323bd94273087 (diff)

diff --git a/arch/s390/include/asm/vx-insn.h b/arch/s390/include/asm/vx-insn.h index b61846dff70f..49c24a2afce0 100644 --- a/arch/s390/include/asm/vx-insn.h +++ b/arch/s390/include/asm/vx-insn.h
@@ -278,6 +278,15 @@
278	VLVG \v, \gr, \index, 3	278	VLVG \v, \gr, \index, 3
279	.endm	279	.endm
280		280
		281	/* VECTOR LOAD REGISTER */
		282	.macro VLR v1, v2
		283	VX_NUM v1, \v1
		284	VX_NUM v2, \v2
		285	.word 0xE700 | ((v1&15) << 4) | (v2&15)
		286	.word 0
		287	MRXBOPC 0, 0x56, v1, v2
		288	.endm
		289
281	/* VECTOR LOAD */	290	/* VECTOR LOAD */
282	.macro VL v, disp, index="%r0", base	291	.macro VL v, disp, index="%r0", base
283	VX_NUM v1, \v	292	VX_NUM v1, \v
@@ -404,6 +413,16 @@
404		413
405	/* Vector integer instructions */	414	/* Vector integer instructions */
406		415
		416	/* VECTOR AND */
		417	.macro VN vr1, vr2, vr3
		418	VX_NUM v1, \vr1
		419	VX_NUM v2, \vr2
		420	VX_NUM v3, \vr3
		421	.word 0xE700 | ((v1&15) << 4) | (v2&15)
		422	.word ((v3&15) << 12)
		423	MRXBOPC 0, 0x68, v1, v2, v3
		424	.endm
		425
407	/* VECTOR EXCLUSIVE OR */	426	/* VECTOR EXCLUSIVE OR */
408	.macro VX vr1, vr2, vr3	427	.macro VX vr1, vr2, vr3
409	VX_NUM v1, \vr1	428	VX_NUM v1, \vr1
@@ -469,6 +488,73 @@
469	MRXBOPC 0, 0x7D, v1, v2, v3	488	MRXBOPC 0, 0x7D, v1, v2, v3
470	.endm	489	.endm
471		490
		491	/* VECTOR REPLICATE IMMEDIATE */
		492	.macro VREPI vr1, imm2, m3
		493	VX_NUM v1, \vr1
		494	.word 0xE700 | ((v1&15) << 4)
		495	.word \imm2
		496	MRXBOPC \m3, 0x45, v1
		497	.endm
		498	.macro VREPIB vr1, imm2
		499	VREPI \vr1, \imm2, 0
		500	.endm
		501	.macro VREPIH vr1, imm2
		502	VREPI \vr1, \imm2, 1
		503	.endm
		504	.macro VREPIF vr1, imm2
		505	VREPI \vr1, \imm2, 2
		506	.endm
		507	.macro VREPIG vr1, imm2
		508	VREPI \vr1, \imm2, 3 /* VREPI like the B/H/F variants (m3 = 3); VREP is not the immediate form */
		509	.endm
		510
		511	/* VECTOR ADD */
		512	.macro VA vr1, vr2, vr3, m4
		513	VX_NUM v1, \vr1
		514	VX_NUM v2, \vr2
		515	VX_NUM v3, \vr3
		516	.word 0xE700 | ((v1&15) << 4) | (v2&15)
		517	.word ((v3&15) << 12)
		518	MRXBOPC \m4, 0xF3, v1, v2, v3
		519	.endm
		520	.macro VAB vr1, vr2, vr3
		521	VA \vr1, \vr2, \vr3, 0
		522	.endm
		523	.macro VAH vr1, vr2, vr3
		524	VA \vr1, \vr2, \vr3, 1
		525	.endm
		526	.macro VAF vr1, vr2, vr3
		527	VA \vr1, \vr2, \vr3, 2
		528	.endm
		529	.macro VAG vr1, vr2, vr3
		530	VA \vr1, \vr2, \vr3, 3
		531	.endm
		532	.macro VAQ vr1, vr2, vr3
		533	VA \vr1, \vr2, \vr3, 4
		534	.endm
		535
		536	/* VECTOR ELEMENT SHIFT RIGHT ARITHMETIC */
		537	.macro VESRAV vr1, vr2, vr3, m4
		538	VX_NUM v1, \vr1
		539	VX_NUM v2, \vr2
		540	VX_NUM v3, \vr3
		541	.word 0xE700 | ((v1&15) << 4) | (v2&15)
		542	.word ((v3&15) << 12)
		543	MRXBOPC \m4, 0x7A, v1, v2, v3
		544	.endm
		545
		546	.macro VESRAVB vr1, vr2, vr3
		547	VESRAV \vr1, \vr2, \vr3, 0
		548	.endm
		549	.macro VESRAVH vr1, vr2, vr3
		550	VESRAV \vr1, \vr2, \vr3, 1
		551	.endm
		552	.macro VESRAVF vr1, vr2, vr3
		553	VESRAV \vr1, \vr2, \vr3, 2
		554	.endm
		555	.macro VESRAVG vr1, vr2, vr3
		556	VESRAV \vr1, \vr2, \vr3, 3
		557	.endm
472		558
473	#endif /* __ASSEMBLY__ */	559	#endif /* __ASSEMBLY__ */
474	#endif /* __ASM_S390_VX_INSN_H */	560	#endif /* __ASM_S390_VX_INSN_H */


diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index a0118d5929a9..c032a6a408a6 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h
@@ -103,6 +103,7 @@ extern const struct raid6_calls raid6_avx2x1;
103	extern const struct raid6_calls raid6_avx2x2;	103	extern const struct raid6_calls raid6_avx2x2;
104	extern const struct raid6_calls raid6_avx2x4;	104	extern const struct raid6_calls raid6_avx2x4;
105	extern const struct raid6_calls raid6_tilegx8;	105	extern const struct raid6_calls raid6_tilegx8;
		106	extern const struct raid6_calls raid6_s390vx8;
106		107
107	struct raid6_recov_calls {	108	struct raid6_recov_calls {
108	void (*data2)(int, size_t, int, int, void **);	109	void (*data2)(int, size_t, int, int, void **);


diff --git a/lib/raid6/.gitignore b/lib/raid6/.gitignore index 0a7e494b2bcd..f01b1cb04f91 100644 --- a/lib/raid6/.gitignore +++ b/lib/raid6/.gitignore
@@ -3,3 +3,4 @@ altivec*.c
3	int*.c	3	int*.c
4	tables.c	4	tables.c
5	neon?.c	5	neon?.c
		6	s390vx?.c


diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 3b10a48fa040..667b9607f8c0 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile
@@ -7,6 +7,7 @@ raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o
7	raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o	7	raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
8	raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o	8	raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
9	raid6_pq-$(CONFIG_TILEGX) += tilegx8.o	9	raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
		10	raid6_pq-$(CONFIG_S390) += s390vx8.o
10		11
11	hostprogs-y += mktables	12	hostprogs-y += mktables
12		13
@@ -116,6 +117,11 @@ $(obj)/tilegx8.c: UNROLL := 8
116	$(obj)/tilegx8.c: $(src)/tilegx.uc $(src)/unroll.awk FORCE	117	$(obj)/tilegx8.c: $(src)/tilegx.uc $(src)/unroll.awk FORCE
117	$(call if_changed,unroll)	118	$(call if_changed,unroll)
118		119
		120	targets += s390vx8.c
		121	$(obj)/s390vx8.c: UNROLL := 8
		122	$(obj)/s390vx8.c: $(src)/s390vx.uc $(src)/unroll.awk FORCE
		123	$(call if_changed,unroll)
		124
119	quiet_cmd_mktable = TABLE $@	125	quiet_cmd_mktable = TABLE $@
120	cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 )	126	cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 )
121		127


diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 975c6e0434bd..e1923b602bbc 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c
@@ -69,6 +69,9 @@ const struct raid6_calls * const raid6_algos[] = {
69	#if defined(CONFIG_TILEGX)	69	#if defined(CONFIG_TILEGX)
70	&raid6_tilegx8,	70	&raid6_tilegx8,
71	#endif	71	#endif
		72	#if defined(CONFIG_S390)
		73	&raid6_s390vx8,
		74	#endif
72	&raid6_intx1,	75	&raid6_intx1,
73	&raid6_intx2,	76	&raid6_intx2,
74	&raid6_intx4,	77	&raid6_intx4,


diff --git a/lib/raid6/s390vx.uc b/lib/raid6/s390vx.uc new file mode 100644 index 000000000000..7b45191a655f --- /dev/null +++ b/lib/raid6/s390vx.uc
@@ -0,0 +1,168 @@
		1	/*
		2	* raid6_vx$#.c
		3	*
		4	* $#-way unrolled RAID6 gen/xor functions for s390
		5	* based on the vector facility
		6	*
		7	* Copyright IBM Corp. 2016
		8	* Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
		9	*
		10	* This file is postprocessed using unroll.awk.
		11	*/
		12
		13	#include <linux/raid/pq.h>
		14	#include <asm/fpu/api.h>
		15
		16	asm(".include \"asm/vx-insn.h\"\n");
		17
		18	#define NSIZE 16
		19
		20	static inline void LOAD_CONST(void)
		21	{
		22	asm volatile("VREPIB %v24,7");
		23	asm volatile("VREPIB %v25,0x1d");
		24	}
		25
		26	/*
		27	* The SHLBYTE() operation shifts each of the 16 bytes in
		28	* vector register y left by 1 bit and stores the result in
		29	* vector register x.
		30	*/
		31	static inline void SHLBYTE(int x, int y)
		32	{
		33	asm volatile ("VAB %0,%1,%1" : : "i" (x), "i" (y));
		34	}
		35
		36	/*
		37	* For each of the 16 bytes in the vector register y the MASK()
		38	* operation returns 0xFF if the high bit of the byte is 1,
		39	* or 0x00 if the high bit is 0. The result is stored in vector
		40	* register x.
		41	*/
		42	static inline void MASK(int x, int y)
		43	{
		44	asm volatile ("VESRAVB %0,%1,24" : : "i" (x), "i" (y));
		45	}
		46
		47	static inline void AND(int x, int y, int z)
		48	{
		49	asm volatile ("VN %0,%1,%2" : : "i" (x), "i" (y), "i" (z));
		50	}
		51
		52	static inline void XOR(int x, int y, int z)
		53	{
		54	asm volatile ("VX %0,%1,%2" : : "i" (x), "i" (y), "i" (z));
		55	}
		56
		57	static inline void LOAD_DATA(int x, int n, u8 *ptr)
		58	{
		59	typedef struct { u8 _[16*n]; } addrtype;
		60	register addrtype *__ptr asm("1") = (addrtype *) ptr;
		61
		62	asm volatile ("VLM %2,%3,0,1" /* base is the literal register 1 that __ptr is pinned to, as in STORE_DATA */
		63	: : "m" (*__ptr), "a" (__ptr), "i" (x), "i" (x + n - 1));
		64	}
		65
		66	static inline void STORE_DATA(int x, int n, u8 *ptr)
		67	{
		68	typedef struct { u8 _[16*n]; } addrtype;
		69	register addrtype *__ptr asm("1") = (addrtype *) ptr;
		70
		71	asm volatile ("VSTM %2,%3,0,1"
		72	: "=m" (*__ptr) : "a" (__ptr), "i" (x), "i" (x + n - 1));
		73	}
		74
		75	static inline void COPY_VEC(int x, int y)
		76	{
		77	asm volatile ("VLR %0,%1" : : "i" (x), "i" (y));
		78	}
		79
		80	static void raid6_s390vx$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
		81	{
		82	struct kernel_fpu vxstate;
		83	u8 **dptr, *p, *q;
		84	int d, z, z0;
		85
		86	kernel_fpu_begin(&vxstate, KERNEL_VXR);
		87	LOAD_CONST();
		88
		89	dptr = (u8 **) ptrs;
		90	z0 = disks - 3; /* Highest data disk */
		91	p = dptr[z0 + 1]; /* XOR parity */
		92	q = dptr[z0 + 2]; /* RS syndrome */
		93
		94	for (d = 0; d < bytes; d += $#*NSIZE) {
		95	LOAD_DATA(0,$#,&dptr[z0][d]);
		96	COPY_VEC(8+$$,0+$$);
		97	for (z = z0 - 1; z >= 0; z--) {
		98	MASK(16+$$,8+$$);
		99	AND(16+$$,16+$$,25);
		100	SHLBYTE(8+$$,8+$$);
		101	XOR(8+$$,8+$$,16+$$);
		102	LOAD_DATA(16,$#,&dptr[z][d]);
		103	XOR(0+$$,0+$$,16+$$);
		104	XOR(8+$$,8+$$,16+$$);
		105	}
		106	STORE_DATA(0,$#,&p[d]);
		107	STORE_DATA(8,$#,&q[d]);
		108	}
		109	kernel_fpu_end(&vxstate, KERNEL_VXR);
		110	}
		111
		112	static void raid6_s390vx$#_xor_syndrome(int disks, int start, int stop,
		113	size_t bytes, void **ptrs)
		114	{
		115	struct kernel_fpu vxstate;
		116	u8 **dptr, *p, *q;
		117	int d, z, z0;
		118
		119	dptr = (u8 **) ptrs;
		120	z0 = stop; /* P/Q right side optimization */
		121	p = dptr[disks - 2]; /* XOR parity */
		122	q = dptr[disks - 1]; /* RS syndrome */
		123
		124	kernel_fpu_begin(&vxstate, KERNEL_VXR);
		125	LOAD_CONST();
		126
		127	for (d = 0; d < bytes; d += $#*NSIZE) {
		128	/* P/Q data pages */
		129	LOAD_DATA(0,$#,&dptr[z0][d]);
		130	COPY_VEC(8+$$,0+$$);
		131	for (z = z0 - 1; z >= start; z--) {
		132	MASK(16+$$,8+$$);
		133	AND(16+$$,16+$$,25);
		134	SHLBYTE(8+$$,8+$$);
		135	XOR(8+$$,8+$$,16+$$);
		136	LOAD_DATA(16,$#,&dptr[z][d]);
		137	XOR(0+$$,0+$$,16+$$);
		138	XOR(8+$$,8+$$,16+$$);
		139	}
		140	/* P/Q left side optimization */
		141	for (z = start - 1; z >= 0; z--) {
		142	MASK(16+$$,8+$$);
		143	AND(16+$$,16+$$,25);
		144	SHLBYTE(8+$$,8+$$);
		145	XOR(8+$$,8+$$,16+$$);
		146	}
		147	LOAD_DATA(16,$#,&p[d]);
		148	XOR(16+$$,16+$$,0+$$);
		149	STORE_DATA(16,$#,&p[d]);
		150	LOAD_DATA(16,$#,&q[d]);
		151	XOR(16+$$,16+$$,8+$$);
		152	STORE_DATA(16,$#,&q[d]);
		153	}
		154	kernel_fpu_end(&vxstate, KERNEL_VXR);
		155	}
		156
		157	static int raid6_s390vx$#_valid(void)
		158	{
		159	return MACHINE_HAS_VX;
		160	}
		161
		162	const struct raid6_calls raid6_s390vx$# = {
		163	raid6_s390vx$#_gen_syndrome,
		164	raid6_s390vx$#_xor_syndrome,
		165	raid6_s390vx$#_valid,
		166	"vx128x$#",
		167	1
		168	};