diff options
-rw-r--r-- | arch/arm/Kconfig | 7 | ||||
-rw-r--r-- | arch/arm/include/asm/neon.h | 36 | ||||
-rw-r--r-- | arch/arm/include/asm/xor.h | 73 | ||||
-rw-r--r-- | arch/arm/lib/Makefile | 6 | ||||
-rw-r--r-- | arch/arm/lib/xor-neon.c | 42 | ||||
-rw-r--r-- | arch/arm/vfp/vfphw.S | 5 | ||||
-rw-r--r-- | arch/arm/vfp/vfpmodule.c | 69 | ||||
-rw-r--r-- | include/linux/raid/pq.h | 5 | ||||
-rw-r--r-- | lib/raid6/.gitignore | 1 | ||||
-rw-r--r-- | lib/raid6/Makefile | 40 | ||||
-rw-r--r-- | lib/raid6/algos.c | 6 | ||||
-rw-r--r-- | lib/raid6/neon.c | 58 | ||||
-rw-r--r-- | lib/raid6/neon.uc | 80 | ||||
-rw-r--r-- | lib/raid6/test/Makefile | 26 |
14 files changed, 452 insertions, 2 deletions
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index ba412e02ec0c..ccc388d388be 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig | |||
@@ -2176,6 +2176,13 @@ config NEON | |||
2176 | Say Y to include support code for NEON, the ARMv7 Advanced SIMD | 2176 | Say Y to include support code for NEON, the ARMv7 Advanced SIMD |
2177 | Extension. | 2177 | Extension. |
2178 | 2178 | ||
2179 | config KERNEL_MODE_NEON | ||
2180 | bool "Support for NEON in kernel mode" | ||
2181 | default n | ||
2182 | depends on NEON | ||
2183 | help | ||
2184 | Say Y to include support for NEON in kernel mode. | ||
2185 | |||
2179 | endmenu | 2186 | endmenu |
2180 | 2187 | ||
2181 | menu "Userspace binary formats" | 2188 | menu "Userspace binary formats" |
diff --git a/arch/arm/include/asm/neon.h b/arch/arm/include/asm/neon.h new file mode 100644 index 000000000000..8f730fe70093 --- /dev/null +++ b/arch/arm/include/asm/neon.h | |||
@@ -0,0 +1,36 @@ | |||
1 | /* | ||
2 | * linux/arch/arm/include/asm/neon.h | ||
3 | * | ||
4 | * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <asm/hwcap.h> | ||
12 | |||
13 | #define cpu_has_neon() (!!(elf_hwcap & HWCAP_NEON)) | ||
14 | |||
15 | #ifdef __ARM_NEON__ | ||
16 | |||
17 | /* | ||
18 | * If you are affected by the BUILD_BUG below, it probably means that you are | ||
19 | * using NEON code /and/ calling the kernel_neon_begin() function from the same | ||
20 | * compilation unit. To prevent issues that may arise from GCC reordering or | ||
21 | * generating(1) NEON instructions outside of these begin/end functions, the | ||
22 | * only supported way of using NEON code in the kernel is by isolating it in a | ||
23 | * separate compilation unit, and calling it from another unit from inside a | ||
24 | * kernel_neon_begin/kernel_neon_end pair. | ||
25 | * | ||
26 | * (1) Current GCC (4.7) might generate NEON instructions at O3 level if | ||
27 | * -mpfu=neon is set. | ||
28 | */ | ||
29 | |||
30 | #define kernel_neon_begin() \ | ||
31 | BUILD_BUG_ON_MSG(1, "kernel_neon_begin() called from NEON code") | ||
32 | |||
33 | #else | ||
34 | void kernel_neon_begin(void); | ||
35 | #endif | ||
36 | void kernel_neon_end(void); | ||
diff --git a/arch/arm/include/asm/xor.h b/arch/arm/include/asm/xor.h index 7604673dc427..4ffb26d4cad8 100644 --- a/arch/arm/include/asm/xor.h +++ b/arch/arm/include/asm/xor.h | |||
@@ -7,7 +7,10 @@ | |||
7 | * it under the terms of the GNU General Public License version 2 as | 7 | * it under the terms of the GNU General Public License version 2 as |
8 | * published by the Free Software Foundation. | 8 | * published by the Free Software Foundation. |
9 | */ | 9 | */ |
10 | #include <linux/hardirq.h> | ||
10 | #include <asm-generic/xor.h> | 11 | #include <asm-generic/xor.h> |
12 | #include <asm/hwcap.h> | ||
13 | #include <asm/neon.h> | ||
11 | 14 | ||
12 | #define __XOR(a1, a2) a1 ^= a2 | 15 | #define __XOR(a1, a2) a1 ^= a2 |
13 | 16 | ||
@@ -138,4 +141,74 @@ static struct xor_block_template xor_block_arm4regs = { | |||
138 | xor_speed(&xor_block_arm4regs); \ | 141 | xor_speed(&xor_block_arm4regs); \ |
139 | xor_speed(&xor_block_8regs); \ | 142 | xor_speed(&xor_block_8regs); \ |
140 | xor_speed(&xor_block_32regs); \ | 143 | xor_speed(&xor_block_32regs); \ |
144 | NEON_TEMPLATES; \ | ||
141 | } while (0) | 145 | } while (0) |
146 | |||
147 | #ifdef CONFIG_KERNEL_MODE_NEON | ||
148 | |||
149 | extern struct xor_block_template const xor_block_neon_inner; | ||
150 | |||
151 | static void | ||
152 | xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) | ||
153 | { | ||
154 | if (in_interrupt()) { | ||
155 | xor_arm4regs_2(bytes, p1, p2); | ||
156 | } else { | ||
157 | kernel_neon_begin(); | ||
158 | xor_block_neon_inner.do_2(bytes, p1, p2); | ||
159 | kernel_neon_end(); | ||
160 | } | ||
161 | } | ||
162 | |||
163 | static void | ||
164 | xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
165 | unsigned long *p3) | ||
166 | { | ||
167 | if (in_interrupt()) { | ||
168 | xor_arm4regs_3(bytes, p1, p2, p3); | ||
169 | } else { | ||
170 | kernel_neon_begin(); | ||
171 | xor_block_neon_inner.do_3(bytes, p1, p2, p3); | ||
172 | kernel_neon_end(); | ||
173 | } | ||
174 | } | ||
175 | |||
176 | static void | ||
177 | xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
178 | unsigned long *p3, unsigned long *p4) | ||
179 | { | ||
180 | if (in_interrupt()) { | ||
181 | xor_arm4regs_4(bytes, p1, p2, p3, p4); | ||
182 | } else { | ||
183 | kernel_neon_begin(); | ||
184 | xor_block_neon_inner.do_4(bytes, p1, p2, p3, p4); | ||
185 | kernel_neon_end(); | ||
186 | } | ||
187 | } | ||
188 | |||
189 | static void | ||
190 | xor_neon_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
191 | unsigned long *p3, unsigned long *p4, unsigned long *p5) | ||
192 | { | ||
193 | if (in_interrupt()) { | ||
194 | xor_arm4regs_5(bytes, p1, p2, p3, p4, p5); | ||
195 | } else { | ||
196 | kernel_neon_begin(); | ||
197 | xor_block_neon_inner.do_5(bytes, p1, p2, p3, p4, p5); | ||
198 | kernel_neon_end(); | ||
199 | } | ||
200 | } | ||
201 | |||
202 | static struct xor_block_template xor_block_neon = { | ||
203 | .name = "neon", | ||
204 | .do_2 = xor_neon_2, | ||
205 | .do_3 = xor_neon_3, | ||
206 | .do_4 = xor_neon_4, | ||
207 | .do_5 = xor_neon_5 | ||
208 | }; | ||
209 | |||
210 | #define NEON_TEMPLATES \ | ||
211 | do { if (cpu_has_neon()) xor_speed(&xor_block_neon); } while (0) | ||
212 | #else | ||
213 | #define NEON_TEMPLATES | ||
214 | #endif | ||
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile index af72969820b4..aaf3a8731136 100644 --- a/arch/arm/lib/Makefile +++ b/arch/arm/lib/Makefile | |||
@@ -45,3 +45,9 @@ lib-$(CONFIG_ARCH_SHARK) += io-shark.o | |||
45 | 45 | ||
46 | $(obj)/csumpartialcopy.o: $(obj)/csumpartialcopygeneric.S | 46 | $(obj)/csumpartialcopy.o: $(obj)/csumpartialcopygeneric.S |
47 | $(obj)/csumpartialcopyuser.o: $(obj)/csumpartialcopygeneric.S | 47 | $(obj)/csumpartialcopyuser.o: $(obj)/csumpartialcopygeneric.S |
48 | |||
49 | ifeq ($(CONFIG_KERNEL_MODE_NEON),y) | ||
50 | NEON_FLAGS := -mfloat-abi=softfp -mfpu=neon | ||
51 | CFLAGS_xor-neon.o += $(NEON_FLAGS) | ||
52 | lib-$(CONFIG_XOR_BLOCKS) += xor-neon.o | ||
53 | endif | ||
diff --git a/arch/arm/lib/xor-neon.c b/arch/arm/lib/xor-neon.c new file mode 100644 index 000000000000..f485e5a2af4b --- /dev/null +++ b/arch/arm/lib/xor-neon.c | |||
@@ -0,0 +1,42 @@ | |||
1 | /* | ||
2 | * linux/arch/arm/lib/xor-neon.c | ||
3 | * | ||
4 | * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <linux/raid/xor.h> | ||
12 | |||
13 | #ifndef __ARM_NEON__ | ||
14 | #error You should compile this file with '-mfloat-abi=softfp -mfpu=neon' | ||
15 | #endif | ||
16 | |||
17 | /* | ||
18 | * Pull in the reference implementations while instructing GCC (through | ||
19 | * -ftree-vectorize) to attempt to exploit implicit parallelism and emit | ||
20 | * NEON instructions. | ||
21 | */ | ||
22 | #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) | ||
23 | #pragma GCC optimize "tree-vectorize" | ||
24 | #else | ||
25 | /* | ||
26 | * While older versions of GCC do not generate incorrect code, they fail to | ||
27 | * recognize the parallel nature of these functions, and emit plain ARM code, | ||
28 | * which is known to be slower than the optimized ARM code in asm-arm/xor.h. | ||
29 | */ | ||
30 | #warning This code requires at least version 4.6 of GCC | ||
31 | #endif | ||
32 | |||
33 | #pragma GCC diagnostic ignored "-Wunused-variable" | ||
34 | #include <asm-generic/xor.h> | ||
35 | |||
36 | struct xor_block_template const xor_block_neon_inner = { | ||
37 | .name = "__inner_neon__", | ||
38 | .do_2 = xor_8regs_2, | ||
39 | .do_3 = xor_8regs_3, | ||
40 | .do_4 = xor_8regs_4, | ||
41 | .do_5 = xor_8regs_5, | ||
42 | }; | ||
diff --git a/arch/arm/vfp/vfphw.S b/arch/arm/vfp/vfphw.S index 8d10dc8a1e17..3e5d3115a2a6 100644 --- a/arch/arm/vfp/vfphw.S +++ b/arch/arm/vfp/vfphw.S | |||
@@ -78,6 +78,11 @@ | |||
78 | ENTRY(vfp_support_entry) | 78 | ENTRY(vfp_support_entry) |
79 | DBGSTR3 "instr %08x pc %08x state %p", r0, r2, r10 | 79 | DBGSTR3 "instr %08x pc %08x state %p", r0, r2, r10 |
80 | 80 | ||
81 | ldr r3, [sp, #S_PSR] @ Neither lazy restore nor FP exceptions | ||
82 | and r3, r3, #MODE_MASK @ are supported in kernel mode | ||
83 | teq r3, #USR_MODE | ||
84 | bne vfp_kmode_exception @ Returns through lr | ||
85 | |||
81 | VFPFMRX r1, FPEXC @ Is the VFP enabled? | 86 | VFPFMRX r1, FPEXC @ Is the VFP enabled? |
82 | DBGSTR1 "fpexc %08x", r1 | 87 | DBGSTR1 "fpexc %08x", r1 |
83 | tst r1, #FPEXC_EN | 88 | tst r1, #FPEXC_EN |
diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c index 5dfbb0b8e7f4..52b8f40b1c73 100644 --- a/arch/arm/vfp/vfpmodule.c +++ b/arch/arm/vfp/vfpmodule.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/init.h> | 20 | #include <linux/init.h> |
21 | #include <linux/uaccess.h> | 21 | #include <linux/uaccess.h> |
22 | #include <linux/user.h> | 22 | #include <linux/user.h> |
23 | #include <linux/export.h> | ||
23 | 24 | ||
24 | #include <asm/cp15.h> | 25 | #include <asm/cp15.h> |
25 | #include <asm/cputype.h> | 26 | #include <asm/cputype.h> |
@@ -648,6 +649,72 @@ static int vfp_hotplug(struct notifier_block *b, unsigned long action, | |||
648 | return NOTIFY_OK; | 649 | return NOTIFY_OK; |
649 | } | 650 | } |
650 | 651 | ||
652 | void vfp_kmode_exception(void) | ||
653 | { | ||
654 | /* | ||
655 | * If we reach this point, a floating point exception has been raised | ||
656 | * while running in kernel mode. If the NEON/VFP unit was enabled at the | ||
657 | * time, it means a VFP instruction has been issued that requires | ||
658 | * software assistance to complete, something which is not currently | ||
659 | * supported in kernel mode. | ||
660 | * If the NEON/VFP unit was disabled, and the location pointed to below | ||
661 | * is properly preceded by a call to kernel_neon_begin(), something has | ||
662 | * caused the task to be scheduled out and back in again. In this case, | ||
663 | * rebuilding and running with CONFIG_DEBUG_ATOMIC_SLEEP enabled should | ||
664 | * be helpful in localizing the problem. | ||
665 | */ | ||
666 | if (fmrx(FPEXC) & FPEXC_EN) | ||
667 | pr_crit("BUG: unsupported FP instruction in kernel mode\n"); | ||
668 | else | ||
669 | pr_crit("BUG: FP instruction issued in kernel mode with FP unit disabled\n"); | ||
670 | } | ||
671 | |||
672 | #ifdef CONFIG_KERNEL_MODE_NEON | ||
673 | |||
674 | /* | ||
675 | * Kernel-side NEON support functions | ||
676 | */ | ||
677 | void kernel_neon_begin(void) | ||
678 | { | ||
679 | struct thread_info *thread = current_thread_info(); | ||
680 | unsigned int cpu; | ||
681 | u32 fpexc; | ||
682 | |||
683 | /* | ||
684 | * Kernel mode NEON is only allowed outside of interrupt context | ||
685 | * with preemption disabled. This will make sure that the kernel | ||
686 | * mode NEON register contents never need to be preserved. | ||
687 | */ | ||
688 | BUG_ON(in_interrupt()); | ||
689 | cpu = get_cpu(); | ||
690 | |||
691 | fpexc = fmrx(FPEXC) | FPEXC_EN; | ||
692 | fmxr(FPEXC, fpexc); | ||
693 | |||
694 | /* | ||
695 | * Save the userland NEON/VFP state. Under UP, | ||
696 | * the owner could be a task other than 'current' | ||
697 | */ | ||
698 | if (vfp_state_in_hw(cpu, thread)) | ||
699 | vfp_save_state(&thread->vfpstate, fpexc); | ||
700 | #ifndef CONFIG_SMP | ||
701 | else if (vfp_current_hw_state[cpu] != NULL) | ||
702 | vfp_save_state(vfp_current_hw_state[cpu], fpexc); | ||
703 | #endif | ||
704 | vfp_current_hw_state[cpu] = NULL; | ||
705 | } | ||
706 | EXPORT_SYMBOL(kernel_neon_begin); | ||
707 | |||
708 | void kernel_neon_end(void) | ||
709 | { | ||
710 | /* Disable the NEON/VFP unit. */ | ||
711 | fmxr(FPEXC, fmrx(FPEXC) & ~FPEXC_EN); | ||
712 | put_cpu(); | ||
713 | } | ||
714 | EXPORT_SYMBOL(kernel_neon_end); | ||
715 | |||
716 | #endif /* CONFIG_KERNEL_MODE_NEON */ | ||
717 | |||
651 | /* | 718 | /* |
652 | * VFP support code initialisation. | 719 | * VFP support code initialisation. |
653 | */ | 720 | */ |
@@ -731,4 +798,4 @@ static int __init vfp_init(void) | |||
731 | return 0; | 798 | return 0; |
732 | } | 799 | } |
733 | 800 | ||
734 | late_initcall(vfp_init); | 801 | core_initcall(vfp_init); |
diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 8dfaa2ce2e95..0f424698064f 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h | |||
@@ -114,6 +114,11 @@ extern const struct raid6_recov_calls raid6_recov_intx1; | |||
114 | extern const struct raid6_recov_calls raid6_recov_ssse3; | 114 | extern const struct raid6_recov_calls raid6_recov_ssse3; |
115 | extern const struct raid6_recov_calls raid6_recov_avx2; | 115 | extern const struct raid6_recov_calls raid6_recov_avx2; |
116 | 116 | ||
117 | extern const struct raid6_calls raid6_neonx1; | ||
118 | extern const struct raid6_calls raid6_neonx2; | ||
119 | extern const struct raid6_calls raid6_neonx4; | ||
120 | extern const struct raid6_calls raid6_neonx8; | ||
121 | |||
117 | /* Algorithm list */ | 122 | /* Algorithm list */ |
118 | extern const struct raid6_calls * const raid6_algos[]; | 123 | extern const struct raid6_calls * const raid6_algos[]; |
119 | extern const struct raid6_recov_calls *const raid6_recov_algos[]; | 124 | extern const struct raid6_recov_calls *const raid6_recov_algos[]; |
diff --git a/lib/raid6/.gitignore b/lib/raid6/.gitignore index 162becacf97c..0a7e494b2bcd 100644 --- a/lib/raid6/.gitignore +++ b/lib/raid6/.gitignore | |||
@@ -2,3 +2,4 @@ mktables | |||
2 | altivec*.c | 2 | altivec*.c |
3 | int*.c | 3 | int*.c |
4 | tables.c | 4 | tables.c |
5 | neon?.c | ||
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 9f7c184725d7..b4625787c7ee 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile | |||
@@ -5,6 +5,7 @@ raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ | |||
5 | 5 | ||
6 | raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o | 6 | raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o |
7 | raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o | 7 | raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o |
8 | raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o | ||
8 | 9 | ||
9 | hostprogs-y += mktables | 10 | hostprogs-y += mktables |
10 | 11 | ||
@@ -16,6 +17,21 @@ ifeq ($(CONFIG_ALTIVEC),y) | |||
16 | altivec_flags := -maltivec -mabi=altivec | 17 | altivec_flags := -maltivec -mabi=altivec |
17 | endif | 18 | endif |
18 | 19 | ||
20 | # The GCC option -ffreestanding is required in order to compile code containing | ||
21 | # ARM/NEON intrinsics in a non C99-compliant environment (such as the kernel) | ||
22 | ifeq ($(CONFIG_KERNEL_MODE_NEON),y) | ||
23 | NEON_FLAGS := -ffreestanding | ||
24 | ifeq ($(ARCH),arm) | ||
25 | NEON_FLAGS += -mfloat-abi=softfp -mfpu=neon | ||
26 | endif | ||
27 | ifeq ($(ARCH),arm64) | ||
28 | CFLAGS_REMOVE_neon1.o += -mgeneral-regs-only | ||
29 | CFLAGS_REMOVE_neon2.o += -mgeneral-regs-only | ||
30 | CFLAGS_REMOVE_neon4.o += -mgeneral-regs-only | ||
31 | CFLAGS_REMOVE_neon8.o += -mgeneral-regs-only | ||
32 | endif | ||
33 | endif | ||
34 | |||
19 | targets += int1.c | 35 | targets += int1.c |
20 | $(obj)/int1.c: UNROLL := 1 | 36 | $(obj)/int1.c: UNROLL := 1 |
21 | $(obj)/int1.c: $(src)/int.uc $(src)/unroll.awk FORCE | 37 | $(obj)/int1.c: $(src)/int.uc $(src)/unroll.awk FORCE |
@@ -70,6 +86,30 @@ $(obj)/altivec8.c: UNROLL := 8 | |||
70 | $(obj)/altivec8.c: $(src)/altivec.uc $(src)/unroll.awk FORCE | 86 | $(obj)/altivec8.c: $(src)/altivec.uc $(src)/unroll.awk FORCE |
71 | $(call if_changed,unroll) | 87 | $(call if_changed,unroll) |
72 | 88 | ||
89 | CFLAGS_neon1.o += $(NEON_FLAGS) | ||
90 | targets += neon1.c | ||
91 | $(obj)/neon1.c: UNROLL := 1 | ||
92 | $(obj)/neon1.c: $(src)/neon.uc $(src)/unroll.awk FORCE | ||
93 | $(call if_changed,unroll) | ||
94 | |||
95 | CFLAGS_neon2.o += $(NEON_FLAGS) | ||
96 | targets += neon2.c | ||
97 | $(obj)/neon2.c: UNROLL := 2 | ||
98 | $(obj)/neon2.c: $(src)/neon.uc $(src)/unroll.awk FORCE | ||
99 | $(call if_changed,unroll) | ||
100 | |||
101 | CFLAGS_neon4.o += $(NEON_FLAGS) | ||
102 | targets += neon4.c | ||
103 | $(obj)/neon4.c: UNROLL := 4 | ||
104 | $(obj)/neon4.c: $(src)/neon.uc $(src)/unroll.awk FORCE | ||
105 | $(call if_changed,unroll) | ||
106 | |||
107 | CFLAGS_neon8.o += $(NEON_FLAGS) | ||
108 | targets += neon8.c | ||
109 | $(obj)/neon8.c: UNROLL := 8 | ||
110 | $(obj)/neon8.c: $(src)/neon.uc $(src)/unroll.awk FORCE | ||
111 | $(call if_changed,unroll) | ||
112 | |||
73 | quiet_cmd_mktable = TABLE $@ | 113 | quiet_cmd_mktable = TABLE $@ |
74 | cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 ) | 114 | cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 ) |
75 | 115 | ||
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 6d7316fe9f30..74e6f5629dbc 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c | |||
@@ -70,6 +70,12 @@ const struct raid6_calls * const raid6_algos[] = { | |||
70 | &raid6_intx2, | 70 | &raid6_intx2, |
71 | &raid6_intx4, | 71 | &raid6_intx4, |
72 | &raid6_intx8, | 72 | &raid6_intx8, |
73 | #ifdef CONFIG_KERNEL_MODE_NEON | ||
74 | &raid6_neonx1, | ||
75 | &raid6_neonx2, | ||
76 | &raid6_neonx4, | ||
77 | &raid6_neonx8, | ||
78 | #endif | ||
73 | NULL | 79 | NULL |
74 | }; | 80 | }; |
75 | 81 | ||
diff --git a/lib/raid6/neon.c b/lib/raid6/neon.c new file mode 100644 index 000000000000..36ad4705df1a --- /dev/null +++ b/lib/raid6/neon.c | |||
@@ -0,0 +1,58 @@ | |||
1 | /* | ||
2 | * linux/lib/raid6/neon.c - RAID6 syndrome calculation using ARM NEON intrinsics | ||
3 | * | ||
4 | * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <linux/raid/pq.h> | ||
12 | |||
13 | #ifdef __KERNEL__ | ||
14 | #include <asm/neon.h> | ||
15 | #else | ||
16 | #define kernel_neon_begin() | ||
17 | #define kernel_neon_end() | ||
18 | #define cpu_has_neon() (1) | ||
19 | #endif | ||
20 | |||
21 | /* | ||
22 | * There are 2 reasons these wrappers are kept in a separate compilation unit | ||
23 | * from the actual implementations in neonN.c (generated from neon.uc by | ||
24 | * unroll.awk): | ||
25 | * - the actual implementations use NEON intrinsics, and the GCC support header | ||
26 | * (arm_neon.h) is not fully compatible (type wise) with the kernel; | ||
27 | * - the neonN.c files are compiled with -mfpu=neon and optimization enabled, | ||
28 | * and we have to make sure that we never use *any* NEON/VFP instructions | ||
29 | * outside a kernel_neon_begin()/kernel_neon_end() pair. | ||
30 | */ | ||
31 | |||
32 | #define RAID6_NEON_WRAPPER(_n) \ | ||
33 | static void raid6_neon ## _n ## _gen_syndrome(int disks, \ | ||
34 | size_t bytes, void **ptrs) \ | ||
35 | { \ | ||
36 | void raid6_neon ## _n ## _gen_syndrome_real(int, \ | ||
37 | unsigned long, void**); \ | ||
38 | kernel_neon_begin(); \ | ||
39 | raid6_neon ## _n ## _gen_syndrome_real(disks, \ | ||
40 | (unsigned long)bytes, ptrs); \ | ||
41 | kernel_neon_end(); \ | ||
42 | } \ | ||
43 | struct raid6_calls const raid6_neonx ## _n = { \ | ||
44 | raid6_neon ## _n ## _gen_syndrome, \ | ||
45 | raid6_have_neon, \ | ||
46 | "neonx" #_n, \ | ||
47 | 0 \ | ||
48 | } | ||
49 | |||
50 | static int raid6_have_neon(void) | ||
51 | { | ||
52 | return cpu_has_neon(); | ||
53 | } | ||
54 | |||
55 | RAID6_NEON_WRAPPER(1); | ||
56 | RAID6_NEON_WRAPPER(2); | ||
57 | RAID6_NEON_WRAPPER(4); | ||
58 | RAID6_NEON_WRAPPER(8); | ||
diff --git a/lib/raid6/neon.uc b/lib/raid6/neon.uc new file mode 100644 index 000000000000..1b9ed793342d --- /dev/null +++ b/lib/raid6/neon.uc | |||
@@ -0,0 +1,80 @@ | |||
1 | /* ----------------------------------------------------------------------- | ||
2 | * | ||
3 | * neon.uc - RAID-6 syndrome calculation using ARM NEON instructions | ||
4 | * | ||
5 | * Copyright (C) 2012 Rob Herring | ||
6 | * | ||
7 | * Based on altivec.uc: | ||
8 | * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or modify | ||
11 | * it under the terms of the GNU General Public License as published by | ||
12 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, | ||
13 | * Boston MA 02111-1307, USA; either version 2 of the License, or | ||
14 | * (at your option) any later version; incorporated herein by reference. | ||
15 | * | ||
16 | * ----------------------------------------------------------------------- */ | ||
17 | |||
18 | /* | ||
19 | * neon$#.c | ||
20 | * | ||
21 | * $#-way unrolled NEON intrinsics math RAID-6 instruction set | ||
22 | * | ||
23 | * This file is postprocessed using unroll.awk | ||
24 | */ | ||
25 | |||
26 | #include <arm_neon.h> | ||
27 | |||
28 | typedef uint8x16_t unative_t; | ||
29 | |||
30 | #define NBYTES(x) ((unative_t){x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x}) | ||
31 | #define NSIZE sizeof(unative_t) | ||
32 | |||
33 | /* | ||
34 | * The SHLBYTE() operation shifts each byte left by 1, *not* | ||
35 | * rolling over into the next byte | ||
36 | */ | ||
37 | static inline unative_t SHLBYTE(unative_t v) | ||
38 | { | ||
39 | return vshlq_n_u8(v, 1); | ||
40 | } | ||
41 | |||
42 | /* | ||
43 | * The MASK() operation returns 0xFF in any byte for which the high | ||
44 | * bit is 1, 0x00 for any byte for which the high bit is 0. | ||
45 | */ | ||
46 | static inline unative_t MASK(unative_t v) | ||
47 | { | ||
48 | const uint8x16_t temp = NBYTES(0); | ||
49 | return (unative_t)vcltq_s8((int8x16_t)v, (int8x16_t)temp); | ||
50 | } | ||
51 | |||
52 | void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) | ||
53 | { | ||
54 | uint8_t **dptr = (uint8_t **)ptrs; | ||
55 | uint8_t *p, *q; | ||
56 | int d, z, z0; | ||
57 | |||
58 | register unative_t wd$$, wq$$, wp$$, w1$$, w2$$; | ||
59 | const unative_t x1d = NBYTES(0x1d); | ||
60 | |||
61 | z0 = disks - 3; /* Highest data disk */ | ||
62 | p = dptr[z0+1]; /* XOR parity */ | ||
63 | q = dptr[z0+2]; /* RS syndrome */ | ||
64 | |||
65 | for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { | ||
66 | wq$$ = wp$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]); | ||
67 | for ( z = z0-1 ; z >= 0 ; z-- ) { | ||
68 | wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]); | ||
69 | wp$$ = veorq_u8(wp$$, wd$$); | ||
70 | w2$$ = MASK(wq$$); | ||
71 | w1$$ = SHLBYTE(wq$$); | ||
72 | |||
73 | w2$$ = vandq_u8(w2$$, x1d); | ||
74 | w1$$ = veorq_u8(w1$$, w2$$); | ||
75 | wq$$ = veorq_u8(w1$$, wd$$); | ||
76 | } | ||
77 | vst1q_u8(&p[d+NSIZE*$$], wp$$); | ||
78 | vst1q_u8(&q[d+NSIZE*$$], wq$$); | ||
79 | } | ||
80 | } | ||
diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index 087332dbf8aa..28afa1a06e03 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile | |||
@@ -22,11 +22,23 @@ ifeq ($(ARCH),x86_64) | |||
22 | IS_X86 = yes | 22 | IS_X86 = yes |
23 | endif | 23 | endif |
24 | 24 | ||
25 | ifeq ($(ARCH),arm) | ||
26 | CFLAGS += -I../../../arch/arm/include -mfpu=neon | ||
27 | HAS_NEON = yes | ||
28 | endif | ||
29 | ifeq ($(ARCH),arm64) | ||
30 | CFLAGS += -I../../../arch/arm64/include | ||
31 | HAS_NEON = yes | ||
32 | endif | ||
33 | |||
25 | ifeq ($(IS_X86),yes) | 34 | ifeq ($(IS_X86),yes) |
26 | OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o | 35 | OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o |
27 | CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1" | \ | 36 | CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1" | \ |
28 | gcc -c -x assembler - >&/dev/null && \ | 37 | gcc -c -x assembler - >&/dev/null && \ |
29 | rm ./-.o && echo -DCONFIG_AS_AVX2=1) | 38 | rm ./-.o && echo -DCONFIG_AS_AVX2=1) |
39 | else ifeq ($(HAS_NEON),yes) | ||
40 | OBJS += neon.o neon1.o neon2.o neon4.o neon8.o | ||
41 | CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1 | ||
30 | else | 42 | else |
31 | HAS_ALTIVEC := $(shell echo -e '\#include <altivec.h>\nvector int a;' |\ | 43 | HAS_ALTIVEC := $(shell echo -e '\#include <altivec.h>\nvector int a;' |\ |
32 | gcc -c -x c - >&/dev/null && \ | 44 | gcc -c -x c - >&/dev/null && \ |
@@ -55,6 +67,18 @@ raid6.a: $(OBJS) | |||
55 | raid6test: test.c raid6.a | 67 | raid6test: test.c raid6.a |
56 | $(CC) $(CFLAGS) -o raid6test $^ | 68 | $(CC) $(CFLAGS) -o raid6test $^ |
57 | 69 | ||
70 | neon1.c: neon.uc ../unroll.awk | ||
71 | $(AWK) ../unroll.awk -vN=1 < neon.uc > $@ | ||
72 | |||
73 | neon2.c: neon.uc ../unroll.awk | ||
74 | $(AWK) ../unroll.awk -vN=2 < neon.uc > $@ | ||
75 | |||
76 | neon4.c: neon.uc ../unroll.awk | ||
77 | $(AWK) ../unroll.awk -vN=4 < neon.uc > $@ | ||
78 | |||
79 | neon8.c: neon.uc ../unroll.awk | ||
80 | $(AWK) ../unroll.awk -vN=8 < neon.uc > $@ | ||
81 | |||
58 | altivec1.c: altivec.uc ../unroll.awk | 82 | altivec1.c: altivec.uc ../unroll.awk |
59 | $(AWK) ../unroll.awk -vN=1 < altivec.uc > $@ | 83 | $(AWK) ../unroll.awk -vN=1 < altivec.uc > $@ |
60 | 84 | ||
@@ -89,7 +113,7 @@ tables.c: mktables | |||
89 | ./mktables > tables.c | 113 | ./mktables > tables.c |
90 | 114 | ||
91 | clean: | 115 | clean: |
92 | rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c tables.c raid6test | 116 | rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c neon*.c tables.c raid6test |
93 | 117 | ||
94 | spotless: clean | 118 | spotless: clean |
95 | rm -f *~ | 119 | rm -f *~ |