diff options
author | Ard Biesheuvel <ard.biesheuvel@linaro.org> | 2013-05-16 11:20:32 -0400 |
---|---|---|
committer | Ard Biesheuvel <ard.biesheuvel@linaro.org> | 2013-07-08 17:09:18 -0400 |
commit | 7d11965ddb9b9b1e0a5d13c58345ada1ccbc663b (patch) | |
tree | 9aec7ff11372f0194f288c13c7c83ff6ef40c87b /lib | |
parent | 01956597cbc46df072f20f90a40eebe356200c38 (diff) |
lib/raid6: add ARM-NEON accelerated syndrome calculation
Rebased/reworked a patch contributed by Rob Herring that uses
NEON intrinsics to perform the RAID-6 syndrome calculations.
It uses the existing unroll.awk code to generate several
unrolled versions of which the best performing one is selected
at boot time.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Nicolas Pitre <nico@linaro.org>
Cc: hpa@linux.intel.com
Diffstat (limited to 'lib')
-rw-r--r-- | lib/raid6/.gitignore | 1 | ||||
-rw-r--r-- | lib/raid6/Makefile | 40 | ||||
-rw-r--r-- | lib/raid6/algos.c | 6 | ||||
-rw-r--r-- | lib/raid6/neon.c | 58 | ||||
-rw-r--r-- | lib/raid6/neon.uc | 80 | ||||
-rw-r--r-- | lib/raid6/test/Makefile | 26 |
6 files changed, 210 insertions, 1 deletions
diff --git a/lib/raid6/.gitignore b/lib/raid6/.gitignore index 162becacf97c..0a7e494b2bcd 100644 --- a/lib/raid6/.gitignore +++ b/lib/raid6/.gitignore | |||
@@ -2,3 +2,4 @@ mktables | |||
2 | altivec*.c | 2 | altivec*.c |
3 | int*.c | 3 | int*.c |
4 | tables.c | 4 | tables.c |
5 | neon?.c | ||
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 9f7c184725d7..b4625787c7ee 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile | |||
@@ -5,6 +5,7 @@ raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ | |||
5 | 5 | ||
6 | raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o | 6 | raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o |
7 | raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o | 7 | raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o |
8 | raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o | ||
8 | 9 | ||
9 | hostprogs-y += mktables | 10 | hostprogs-y += mktables |
10 | 11 | ||
@@ -16,6 +17,21 @@ ifeq ($(CONFIG_ALTIVEC),y) | |||
16 | altivec_flags := -maltivec -mabi=altivec | 17 | altivec_flags := -maltivec -mabi=altivec |
17 | endif | 18 | endif |
18 | 19 | ||
20 | # The GCC option -ffreestanding is required in order to compile code containing | ||
21 | # ARM/NEON intrinsics in a non C99-compliant environment (such as the kernel) | ||
22 | ifeq ($(CONFIG_KERNEL_MODE_NEON),y) | ||
23 | NEON_FLAGS := -ffreestanding | ||
24 | ifeq ($(ARCH),arm) | ||
25 | NEON_FLAGS += -mfloat-abi=softfp -mfpu=neon | ||
26 | endif | ||
27 | ifeq ($(ARCH),arm64) | ||
28 | CFLAGS_REMOVE_neon1.o += -mgeneral-regs-only | ||
29 | CFLAGS_REMOVE_neon2.o += -mgeneral-regs-only | ||
30 | CFLAGS_REMOVE_neon4.o += -mgeneral-regs-only | ||
31 | CFLAGS_REMOVE_neon8.o += -mgeneral-regs-only | ||
32 | endif | ||
33 | endif | ||
34 | |||
19 | targets += int1.c | 35 | targets += int1.c |
20 | $(obj)/int1.c: UNROLL := 1 | 36 | $(obj)/int1.c: UNROLL := 1 |
21 | $(obj)/int1.c: $(src)/int.uc $(src)/unroll.awk FORCE | 37 | $(obj)/int1.c: $(src)/int.uc $(src)/unroll.awk FORCE |
@@ -70,6 +86,30 @@ $(obj)/altivec8.c: UNROLL := 8 | |||
70 | $(obj)/altivec8.c: $(src)/altivec.uc $(src)/unroll.awk FORCE | 86 | $(obj)/altivec8.c: $(src)/altivec.uc $(src)/unroll.awk FORCE |
71 | $(call if_changed,unroll) | 87 | $(call if_changed,unroll) |
72 | 88 | ||
89 | CFLAGS_neon1.o += $(NEON_FLAGS) | ||
90 | targets += neon1.c | ||
91 | $(obj)/neon1.c: UNROLL := 1 | ||
92 | $(obj)/neon1.c: $(src)/neon.uc $(src)/unroll.awk FORCE | ||
93 | $(call if_changed,unroll) | ||
94 | |||
95 | CFLAGS_neon2.o += $(NEON_FLAGS) | ||
96 | targets += neon2.c | ||
97 | $(obj)/neon2.c: UNROLL := 2 | ||
98 | $(obj)/neon2.c: $(src)/neon.uc $(src)/unroll.awk FORCE | ||
99 | $(call if_changed,unroll) | ||
100 | |||
101 | CFLAGS_neon4.o += $(NEON_FLAGS) | ||
102 | targets += neon4.c | ||
103 | $(obj)/neon4.c: UNROLL := 4 | ||
104 | $(obj)/neon4.c: $(src)/neon.uc $(src)/unroll.awk FORCE | ||
105 | $(call if_changed,unroll) | ||
106 | |||
107 | CFLAGS_neon8.o += $(NEON_FLAGS) | ||
108 | targets += neon8.c | ||
109 | $(obj)/neon8.c: UNROLL := 8 | ||
110 | $(obj)/neon8.c: $(src)/neon.uc $(src)/unroll.awk FORCE | ||
111 | $(call if_changed,unroll) | ||
112 | |||
73 | quiet_cmd_mktable = TABLE $@ | 113 | quiet_cmd_mktable = TABLE $@ |
74 | cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 ) | 114 | cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 ) |
75 | 115 | ||
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 6d7316fe9f30..74e6f5629dbc 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c | |||
@@ -70,6 +70,12 @@ const struct raid6_calls * const raid6_algos[] = { | |||
70 | &raid6_intx2, | 70 | &raid6_intx2, |
71 | &raid6_intx4, | 71 | &raid6_intx4, |
72 | &raid6_intx8, | 72 | &raid6_intx8, |
73 | #ifdef CONFIG_KERNEL_MODE_NEON | ||
74 | &raid6_neonx1, | ||
75 | &raid6_neonx2, | ||
76 | &raid6_neonx4, | ||
77 | &raid6_neonx8, | ||
78 | #endif | ||
73 | NULL | 79 | NULL |
74 | }; | 80 | }; |
75 | 81 | ||
diff --git a/lib/raid6/neon.c b/lib/raid6/neon.c new file mode 100644 index 000000000000..36ad4705df1a --- /dev/null +++ b/lib/raid6/neon.c | |||
@@ -0,0 +1,58 @@ | |||
1 | /* | ||
2 | * linux/lib/raid6/neon.c - RAID6 syndrome calculation using ARM NEON intrinsics | ||
3 | * | ||
4 | * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <linux/raid/pq.h> | ||
12 | |||
13 | #ifdef __KERNEL__ | ||
14 | #include <asm/neon.h> | ||
15 | #else | ||
16 | #define kernel_neon_begin() | ||
17 | #define kernel_neon_end() | ||
18 | #define cpu_has_neon() (1) | ||
19 | #endif | ||
20 | |||
21 | /* | ||
22 | * There are 2 reasons these wrappers are kept in a separate compilation unit | ||
23 | * from the actual implementations in neonN.c (generated from neon.uc by | ||
24 | * unroll.awk): | ||
25 | * - the actual implementations use NEON intrinsics, and the GCC support header | ||
26 | * (arm_neon.h) is not fully compatible (type wise) with the kernel; | ||
27 | * - the neonN.c files are compiled with -mfpu=neon and optimization enabled, | ||
28 | * and we have to make sure that we never use *any* NEON/VFP instructions | ||
29 | * outside a kernel_neon_begin()/kernel_neon_end() pair. | ||
30 | */ | ||
31 | |||
32 | #define RAID6_NEON_WRAPPER(_n) \ | ||
33 | static void raid6_neon ## _n ## _gen_syndrome(int disks, \ | ||
34 | size_t bytes, void **ptrs) \ | ||
35 | { \ | ||
36 | void raid6_neon ## _n ## _gen_syndrome_real(int, \ | ||
37 | unsigned long, void**); \ | ||
38 | kernel_neon_begin(); \ | ||
39 | raid6_neon ## _n ## _gen_syndrome_real(disks, \ | ||
40 | (unsigned long)bytes, ptrs); \ | ||
41 | kernel_neon_end(); \ | ||
42 | } \ | ||
43 | struct raid6_calls const raid6_neonx ## _n = { \ | ||
44 | raid6_neon ## _n ## _gen_syndrome, \ | ||
45 | raid6_have_neon, \ | ||
46 | "neonx" #_n, \ | ||
47 | 0 \ | ||
48 | } | ||
49 | |||
50 | static int raid6_have_neon(void) | ||
51 | { | ||
52 | return cpu_has_neon(); | ||
53 | } | ||
54 | |||
55 | RAID6_NEON_WRAPPER(1); | ||
56 | RAID6_NEON_WRAPPER(2); | ||
57 | RAID6_NEON_WRAPPER(4); | ||
58 | RAID6_NEON_WRAPPER(8); | ||
diff --git a/lib/raid6/neon.uc b/lib/raid6/neon.uc new file mode 100644 index 000000000000..1b9ed793342d --- /dev/null +++ b/lib/raid6/neon.uc | |||
@@ -0,0 +1,80 @@ | |||
1 | /* ----------------------------------------------------------------------- | ||
2 | * | ||
3 | * neon.uc - RAID-6 syndrome calculation using ARM NEON instructions | ||
4 | * | ||
5 | * Copyright (C) 2012 Rob Herring | ||
6 | * | ||
7 | * Based on altivec.uc: | ||
8 | * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or modify | ||
11 | * it under the terms of the GNU General Public License as published by | ||
12 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, | ||
13 | * Boston MA 02111-1307, USA; either version 2 of the License, or | ||
14 | * (at your option) any later version; incorporated herein by reference. | ||
15 | * | ||
16 | * ----------------------------------------------------------------------- */ | ||
17 | |||
18 | /* | ||
19 | * neon$#.c | ||
20 | * | ||
21 | * $#-way unrolled NEON intrinsics math RAID-6 instruction set | ||
22 | * | ||
23 | * This file is postprocessed using unroll.awk | ||
24 | */ | ||
25 | |||
26 | #include <arm_neon.h> | ||
27 | |||
28 | typedef uint8x16_t unative_t; | ||
29 | |||
30 | #define NBYTES(x) ((unative_t){x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x}) | ||
31 | #define NSIZE sizeof(unative_t) | ||
32 | |||
33 | /* | ||
34 | * The SHLBYTE() operation shifts each byte left by 1, *not* | ||
35 | * rolling over into the next byte | ||
36 | */ | ||
37 | static inline unative_t SHLBYTE(unative_t v) | ||
38 | { | ||
39 | return vshlq_n_u8(v, 1); | ||
40 | } | ||
41 | |||
42 | /* | ||
43 | * The MASK() operation returns 0xFF in any byte for which the high | ||
44 | * bit is 1, 0x00 for any byte for which the high bit is 0. | ||
45 | */ | ||
46 | static inline unative_t MASK(unative_t v) | ||
47 | { | ||
48 | const uint8x16_t temp = NBYTES(0); | ||
49 | return (unative_t)vcltq_s8((int8x16_t)v, (int8x16_t)temp); | ||
50 | } | ||
51 | |||
52 | void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) | ||
53 | { | ||
54 | uint8_t **dptr = (uint8_t **)ptrs; | ||
55 | uint8_t *p, *q; | ||
56 | int d, z, z0; | ||
57 | |||
58 | register unative_t wd$$, wq$$, wp$$, w1$$, w2$$; | ||
59 | const unative_t x1d = NBYTES(0x1d); | ||
60 | |||
61 | z0 = disks - 3; /* Highest data disk */ | ||
62 | p = dptr[z0+1]; /* XOR parity */ | ||
63 | q = dptr[z0+2]; /* RS syndrome */ | ||
64 | |||
65 | for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { | ||
66 | wq$$ = wp$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]); | ||
67 | for ( z = z0-1 ; z >= 0 ; z-- ) { | ||
68 | wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]); | ||
69 | wp$$ = veorq_u8(wp$$, wd$$); | ||
70 | w2$$ = MASK(wq$$); | ||
71 | w1$$ = SHLBYTE(wq$$); | ||
72 | |||
73 | w2$$ = vandq_u8(w2$$, x1d); | ||
74 | w1$$ = veorq_u8(w1$$, w2$$); | ||
75 | wq$$ = veorq_u8(w1$$, wd$$); | ||
76 | } | ||
77 | vst1q_u8(&p[d+NSIZE*$$], wp$$); | ||
78 | vst1q_u8(&q[d+NSIZE*$$], wq$$); | ||
79 | } | ||
80 | } | ||
diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index 087332dbf8aa..28afa1a06e03 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile | |||
@@ -22,11 +22,23 @@ ifeq ($(ARCH),x86_64) | |||
22 | IS_X86 = yes | 22 | IS_X86 = yes |
23 | endif | 23 | endif |
24 | 24 | ||
25 | ifeq ($(ARCH),arm) | ||
26 | CFLAGS += -I../../../arch/arm/include -mfpu=neon | ||
27 | HAS_NEON = yes | ||
28 | endif | ||
29 | ifeq ($(ARCH),arm64) | ||
30 | CFLAGS += -I../../../arch/arm64/include | ||
31 | HAS_NEON = yes | ||
32 | endif | ||
33 | |||
25 | ifeq ($(IS_X86),yes) | 34 | ifeq ($(IS_X86),yes) |
26 | OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o | 35 | OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o |
27 | CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1" | \ | 36 | CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1" | \ |
28 | gcc -c -x assembler - >&/dev/null && \ | 37 | gcc -c -x assembler - >&/dev/null && \ |
29 | rm ./-.o && echo -DCONFIG_AS_AVX2=1) | 38 | rm ./-.o && echo -DCONFIG_AS_AVX2=1) |
39 | else ifeq ($(HAS_NEON),yes) | ||
40 | OBJS += neon.o neon1.o neon2.o neon4.o neon8.o | ||
41 | CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1 | ||
30 | else | 42 | else |
31 | HAS_ALTIVEC := $(shell echo -e '\#include <altivec.h>\nvector int a;' |\ | 43 | HAS_ALTIVEC := $(shell echo -e '\#include <altivec.h>\nvector int a;' |\ |
32 | gcc -c -x c - >&/dev/null && \ | 44 | gcc -c -x c - >&/dev/null && \ |
@@ -55,6 +67,18 @@ raid6.a: $(OBJS) | |||
55 | raid6test: test.c raid6.a | 67 | raid6test: test.c raid6.a |
56 | $(CC) $(CFLAGS) -o raid6test $^ | 68 | $(CC) $(CFLAGS) -o raid6test $^ |
57 | 69 | ||
70 | neon1.c: neon.uc ../unroll.awk | ||
71 | $(AWK) ../unroll.awk -vN=1 < neon.uc > $@ | ||
72 | |||
73 | neon2.c: neon.uc ../unroll.awk | ||
74 | $(AWK) ../unroll.awk -vN=2 < neon.uc > $@ | ||
75 | |||
76 | neon4.c: neon.uc ../unroll.awk | ||
77 | $(AWK) ../unroll.awk -vN=4 < neon.uc > $@ | ||
78 | |||
79 | neon8.c: neon.uc ../unroll.awk | ||
80 | $(AWK) ../unroll.awk -vN=8 < neon.uc > $@ | ||
81 | |||
58 | altivec1.c: altivec.uc ../unroll.awk | 82 | altivec1.c: altivec.uc ../unroll.awk |
59 | $(AWK) ../unroll.awk -vN=1 < altivec.uc > $@ | 83 | $(AWK) ../unroll.awk -vN=1 < altivec.uc > $@ |
60 | 84 | ||
@@ -89,7 +113,7 @@ tables.c: mktables | |||
89 | ./mktables > tables.c | 113 | ./mktables > tables.c |
90 | 114 | ||
91 | clean: | 115 | clean: |
92 | rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c tables.c raid6test | 116 | rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c neon*.c tables.c raid6test |
93 | 117 | ||
94 | spotless: clean | 118 | spotless: clean |
95 | rm -f *~ | 119 | rm -f *~ |