aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRussell King <rmk+kernel@arm.linux.org.uk>2013-07-22 12:26:27 -0400
committerRussell King <rmk+kernel@arm.linux.org.uk>2013-07-22 12:46:40 -0400
commitb4f656eea63376da79b0b5a17660c4ce14b71b74 (patch)
tree36731cf326eca19cd2164f886b3eaa31449fe8fa
parent3b2f64d00c46e1e4e9bd0bb9bb12619adac27a4b (diff)
parent7d11965ddb9b9b1e0a5d13c58345ada1ccbc663b (diff)
Pull branch 'for-rmk' of git://git.linaro.org/people/ardbiesheuvel/linux-arm into devel-stable
Comments from Ard Biesheuvel: I have included two use cases that I have been using, XOR and RAID-6 checksumming. The former gets a 60% performance boost on the NEON, the latter over 400%. ARM: add support for kernel mode NEON Adds kernel_neon_begin/end (renamed from kernel_vfp_begin/end in the previous version to de-emphasize the VFP part as VFP code that needs software assistance is not supported currently.) Introduces <asm/neon.h> and the Kconfig symbol KERNEL_MODE_NEON. This has been aligned with Catalin for arm64, so any NEON code that does not use assembly but intrinsics or the GCC vectorizer (such as my examples) can potentially be shared between arm and arm64 archs. ARM: move VFP init to an earlier boot stage This is needed so the NEON is enabled when the XOR and RAID-6 algo boot time benchmarks are run. ARM: be strict about FP exceptions in kernel mode This adds a check to vfp_support_entry() to flag unsupported uses of the NEON/VFP in kernel mode. FP exceptions (bounces) are flagged as a bug, this is because of their potentially intermittent nature. Exceptions caused by the fact that kernel_neon_begin has not been called are just routed through the undef handler. ARM: crypto: add NEON accelerated XOR implementation This is the xor_blocks() implementation built with -ftree-vectorize, 60% faster than optimized ARM code. It calls in_interrupt() to check whether the NEON flavor can be used: this should really not be necessary, but due to xor_blocks'squite generic nature, there is no telling how exactly people may be using it in the real world. lib/raid6: add ARM-NEON accelerated syndrome calculation This is a port of the RAID-6 checksumming code in altivec.uc ported to use NEON intrinsics. It is about 4x faster than the sequential code.
-rw-r--r--arch/arm/Kconfig7
-rw-r--r--arch/arm/include/asm/neon.h36
-rw-r--r--arch/arm/include/asm/xor.h73
-rw-r--r--arch/arm/lib/Makefile6
-rw-r--r--arch/arm/lib/xor-neon.c42
-rw-r--r--arch/arm/vfp/vfphw.S5
-rw-r--r--arch/arm/vfp/vfpmodule.c69
-rw-r--r--include/linux/raid/pq.h5
-rw-r--r--lib/raid6/.gitignore1
-rw-r--r--lib/raid6/Makefile40
-rw-r--r--lib/raid6/algos.c6
-rw-r--r--lib/raid6/neon.c58
-rw-r--r--lib/raid6/neon.uc80
-rw-r--r--lib/raid6/test/Makefile26
14 files changed, 452 insertions, 2 deletions
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index ba412e02ec0c..ccc388d388be 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -2176,6 +2176,13 @@ config NEON
2176 Say Y to include support code for NEON, the ARMv7 Advanced SIMD 2176 Say Y to include support code for NEON, the ARMv7 Advanced SIMD
2177 Extension. 2177 Extension.
2178 2178
2179config KERNEL_MODE_NEON
2180 bool "Support for NEON in kernel mode"
2181 default n
2182 depends on NEON
2183 help
2184 Say Y to include support for NEON in kernel mode.
2185
2179endmenu 2186endmenu
2180 2187
2181menu "Userspace binary formats" 2188menu "Userspace binary formats"
diff --git a/arch/arm/include/asm/neon.h b/arch/arm/include/asm/neon.h
new file mode 100644
index 000000000000..8f730fe70093
--- /dev/null
+++ b/arch/arm/include/asm/neon.h
@@ -0,0 +1,36 @@
1/*
2 * linux/arch/arm/include/asm/neon.h
3 *
4 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <asm/hwcap.h>
12
13#define cpu_has_neon() (!!(elf_hwcap & HWCAP_NEON))
14
15#ifdef __ARM_NEON__
16
17/*
18 * If you are affected by the BUILD_BUG below, it probably means that you are
19 * using NEON code /and/ calling the kernel_neon_begin() function from the same
20 * compilation unit. To prevent issues that may arise from GCC reordering or
21 * generating(1) NEON instructions outside of these begin/end functions, the
22 * only supported way of using NEON code in the kernel is by isolating it in a
23 * separate compilation unit, and calling it from another unit from inside a
24 * kernel_neon_begin/kernel_neon_end pair.
25 *
26 * (1) Current GCC (4.7) might generate NEON instructions at O3 level if
27 * -mpfu=neon is set.
28 */
29
30#define kernel_neon_begin() \
31 BUILD_BUG_ON_MSG(1, "kernel_neon_begin() called from NEON code")
32
33#else
34void kernel_neon_begin(void);
35#endif
36void kernel_neon_end(void);
diff --git a/arch/arm/include/asm/xor.h b/arch/arm/include/asm/xor.h
index 7604673dc427..4ffb26d4cad8 100644
--- a/arch/arm/include/asm/xor.h
+++ b/arch/arm/include/asm/xor.h
@@ -7,7 +7,10 @@
7 * it under the terms of the GNU General Public License version 2 as 7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation. 8 * published by the Free Software Foundation.
9 */ 9 */
10#include <linux/hardirq.h>
10#include <asm-generic/xor.h> 11#include <asm-generic/xor.h>
12#include <asm/hwcap.h>
13#include <asm/neon.h>
11 14
12#define __XOR(a1, a2) a1 ^= a2 15#define __XOR(a1, a2) a1 ^= a2
13 16
@@ -138,4 +141,74 @@ static struct xor_block_template xor_block_arm4regs = {
138 xor_speed(&xor_block_arm4regs); \ 141 xor_speed(&xor_block_arm4regs); \
139 xor_speed(&xor_block_8regs); \ 142 xor_speed(&xor_block_8regs); \
140 xor_speed(&xor_block_32regs); \ 143 xor_speed(&xor_block_32regs); \
144 NEON_TEMPLATES; \
141 } while (0) 145 } while (0)
146
147#ifdef CONFIG_KERNEL_MODE_NEON
148
149extern struct xor_block_template const xor_block_neon_inner;
150
151static void
152xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
153{
154 if (in_interrupt()) {
155 xor_arm4regs_2(bytes, p1, p2);
156 } else {
157 kernel_neon_begin();
158 xor_block_neon_inner.do_2(bytes, p1, p2);
159 kernel_neon_end();
160 }
161}
162
163static void
164xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
165 unsigned long *p3)
166{
167 if (in_interrupt()) {
168 xor_arm4regs_3(bytes, p1, p2, p3);
169 } else {
170 kernel_neon_begin();
171 xor_block_neon_inner.do_3(bytes, p1, p2, p3);
172 kernel_neon_end();
173 }
174}
175
176static void
177xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
178 unsigned long *p3, unsigned long *p4)
179{
180 if (in_interrupt()) {
181 xor_arm4regs_4(bytes, p1, p2, p3, p4);
182 } else {
183 kernel_neon_begin();
184 xor_block_neon_inner.do_4(bytes, p1, p2, p3, p4);
185 kernel_neon_end();
186 }
187}
188
189static void
190xor_neon_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
191 unsigned long *p3, unsigned long *p4, unsigned long *p5)
192{
193 if (in_interrupt()) {
194 xor_arm4regs_5(bytes, p1, p2, p3, p4, p5);
195 } else {
196 kernel_neon_begin();
197 xor_block_neon_inner.do_5(bytes, p1, p2, p3, p4, p5);
198 kernel_neon_end();
199 }
200}
201
202static struct xor_block_template xor_block_neon = {
203 .name = "neon",
204 .do_2 = xor_neon_2,
205 .do_3 = xor_neon_3,
206 .do_4 = xor_neon_4,
207 .do_5 = xor_neon_5
208};
209
210#define NEON_TEMPLATES \
211 do { if (cpu_has_neon()) xor_speed(&xor_block_neon); } while (0)
212#else
213#define NEON_TEMPLATES
214#endif
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index af72969820b4..aaf3a8731136 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -45,3 +45,9 @@ lib-$(CONFIG_ARCH_SHARK) += io-shark.o
45 45
46$(obj)/csumpartialcopy.o: $(obj)/csumpartialcopygeneric.S 46$(obj)/csumpartialcopy.o: $(obj)/csumpartialcopygeneric.S
47$(obj)/csumpartialcopyuser.o: $(obj)/csumpartialcopygeneric.S 47$(obj)/csumpartialcopyuser.o: $(obj)/csumpartialcopygeneric.S
48
49ifeq ($(CONFIG_KERNEL_MODE_NEON),y)
50 NEON_FLAGS := -mfloat-abi=softfp -mfpu=neon
51 CFLAGS_xor-neon.o += $(NEON_FLAGS)
52 lib-$(CONFIG_XOR_BLOCKS) += xor-neon.o
53endif
diff --git a/arch/arm/lib/xor-neon.c b/arch/arm/lib/xor-neon.c
new file mode 100644
index 000000000000..f485e5a2af4b
--- /dev/null
+++ b/arch/arm/lib/xor-neon.c
@@ -0,0 +1,42 @@
1/*
2 * linux/arch/arm/lib/xor-neon.c
3 *
4 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/raid/xor.h>
12
13#ifndef __ARM_NEON__
14#error You should compile this file with '-mfloat-abi=softfp -mfpu=neon'
15#endif
16
17/*
18 * Pull in the reference implementations while instructing GCC (through
19 * -ftree-vectorize) to attempt to exploit implicit parallelism and emit
20 * NEON instructions.
21 */
22#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
23#pragma GCC optimize "tree-vectorize"
24#else
25/*
26 * While older versions of GCC do not generate incorrect code, they fail to
27 * recognize the parallel nature of these functions, and emit plain ARM code,
28 * which is known to be slower than the optimized ARM code in asm-arm/xor.h.
29 */
30#warning This code requires at least version 4.6 of GCC
31#endif
32
33#pragma GCC diagnostic ignored "-Wunused-variable"
34#include <asm-generic/xor.h>
35
36struct xor_block_template const xor_block_neon_inner = {
37 .name = "__inner_neon__",
38 .do_2 = xor_8regs_2,
39 .do_3 = xor_8regs_3,
40 .do_4 = xor_8regs_4,
41 .do_5 = xor_8regs_5,
42};
diff --git a/arch/arm/vfp/vfphw.S b/arch/arm/vfp/vfphw.S
index 8d10dc8a1e17..3e5d3115a2a6 100644
--- a/arch/arm/vfp/vfphw.S
+++ b/arch/arm/vfp/vfphw.S
@@ -78,6 +78,11 @@
78ENTRY(vfp_support_entry) 78ENTRY(vfp_support_entry)
79 DBGSTR3 "instr %08x pc %08x state %p", r0, r2, r10 79 DBGSTR3 "instr %08x pc %08x state %p", r0, r2, r10
80 80
81 ldr r3, [sp, #S_PSR] @ Neither lazy restore nor FP exceptions
82 and r3, r3, #MODE_MASK @ are supported in kernel mode
83 teq r3, #USR_MODE
84 bne vfp_kmode_exception @ Returns through lr
85
81 VFPFMRX r1, FPEXC @ Is the VFP enabled? 86 VFPFMRX r1, FPEXC @ Is the VFP enabled?
82 DBGSTR1 "fpexc %08x", r1 87 DBGSTR1 "fpexc %08x", r1
83 tst r1, #FPEXC_EN 88 tst r1, #FPEXC_EN
diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c
index 5dfbb0b8e7f4..52b8f40b1c73 100644
--- a/arch/arm/vfp/vfpmodule.c
+++ b/arch/arm/vfp/vfpmodule.c
@@ -20,6 +20,7 @@
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/user.h> 22#include <linux/user.h>
23#include <linux/export.h>
23 24
24#include <asm/cp15.h> 25#include <asm/cp15.h>
25#include <asm/cputype.h> 26#include <asm/cputype.h>
@@ -648,6 +649,72 @@ static int vfp_hotplug(struct notifier_block *b, unsigned long action,
648 return NOTIFY_OK; 649 return NOTIFY_OK;
649} 650}
650 651
652void vfp_kmode_exception(void)
653{
654 /*
655 * If we reach this point, a floating point exception has been raised
656 * while running in kernel mode. If the NEON/VFP unit was enabled at the
657 * time, it means a VFP instruction has been issued that requires
658 * software assistance to complete, something which is not currently
659 * supported in kernel mode.
660 * If the NEON/VFP unit was disabled, and the location pointed to below
661 * is properly preceded by a call to kernel_neon_begin(), something has
662 * caused the task to be scheduled out and back in again. In this case,
663 * rebuilding and running with CONFIG_DEBUG_ATOMIC_SLEEP enabled should
664 * be helpful in localizing the problem.
665 */
666 if (fmrx(FPEXC) & FPEXC_EN)
667 pr_crit("BUG: unsupported FP instruction in kernel mode\n");
668 else
669 pr_crit("BUG: FP instruction issued in kernel mode with FP unit disabled\n");
670}
671
672#ifdef CONFIG_KERNEL_MODE_NEON
673
674/*
675 * Kernel-side NEON support functions
676 */
677void kernel_neon_begin(void)
678{
679 struct thread_info *thread = current_thread_info();
680 unsigned int cpu;
681 u32 fpexc;
682
683 /*
684 * Kernel mode NEON is only allowed outside of interrupt context
685 * with preemption disabled. This will make sure that the kernel
686 * mode NEON register contents never need to be preserved.
687 */
688 BUG_ON(in_interrupt());
689 cpu = get_cpu();
690
691 fpexc = fmrx(FPEXC) | FPEXC_EN;
692 fmxr(FPEXC, fpexc);
693
694 /*
695 * Save the userland NEON/VFP state. Under UP,
696 * the owner could be a task other than 'current'
697 */
698 if (vfp_state_in_hw(cpu, thread))
699 vfp_save_state(&thread->vfpstate, fpexc);
700#ifndef CONFIG_SMP
701 else if (vfp_current_hw_state[cpu] != NULL)
702 vfp_save_state(vfp_current_hw_state[cpu], fpexc);
703#endif
704 vfp_current_hw_state[cpu] = NULL;
705}
706EXPORT_SYMBOL(kernel_neon_begin);
707
708void kernel_neon_end(void)
709{
710 /* Disable the NEON/VFP unit. */
711 fmxr(FPEXC, fmrx(FPEXC) & ~FPEXC_EN);
712 put_cpu();
713}
714EXPORT_SYMBOL(kernel_neon_end);
715
716#endif /* CONFIG_KERNEL_MODE_NEON */
717
651/* 718/*
652 * VFP support code initialisation. 719 * VFP support code initialisation.
653 */ 720 */
@@ -731,4 +798,4 @@ static int __init vfp_init(void)
731 return 0; 798 return 0;
732} 799}
733 800
734late_initcall(vfp_init); 801core_initcall(vfp_init);
diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 8dfaa2ce2e95..0f424698064f 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -114,6 +114,11 @@ extern const struct raid6_recov_calls raid6_recov_intx1;
114extern const struct raid6_recov_calls raid6_recov_ssse3; 114extern const struct raid6_recov_calls raid6_recov_ssse3;
115extern const struct raid6_recov_calls raid6_recov_avx2; 115extern const struct raid6_recov_calls raid6_recov_avx2;
116 116
117extern const struct raid6_calls raid6_neonx1;
118extern const struct raid6_calls raid6_neonx2;
119extern const struct raid6_calls raid6_neonx4;
120extern const struct raid6_calls raid6_neonx8;
121
117/* Algorithm list */ 122/* Algorithm list */
118extern const struct raid6_calls * const raid6_algos[]; 123extern const struct raid6_calls * const raid6_algos[];
119extern const struct raid6_recov_calls *const raid6_recov_algos[]; 124extern const struct raid6_recov_calls *const raid6_recov_algos[];
diff --git a/lib/raid6/.gitignore b/lib/raid6/.gitignore
index 162becacf97c..0a7e494b2bcd 100644
--- a/lib/raid6/.gitignore
+++ b/lib/raid6/.gitignore
@@ -2,3 +2,4 @@ mktables
2altivec*.c 2altivec*.c
3int*.c 3int*.c
4tables.c 4tables.c
5neon?.c
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 9f7c184725d7..b4625787c7ee 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -5,6 +5,7 @@ raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \
5 5
6raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o 6raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o
7raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o 7raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
8raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
8 9
9hostprogs-y += mktables 10hostprogs-y += mktables
10 11
@@ -16,6 +17,21 @@ ifeq ($(CONFIG_ALTIVEC),y)
16altivec_flags := -maltivec -mabi=altivec 17altivec_flags := -maltivec -mabi=altivec
17endif 18endif
18 19
20# The GCC option -ffreestanding is required in order to compile code containing
21# ARM/NEON intrinsics in a non C99-compliant environment (such as the kernel)
22ifeq ($(CONFIG_KERNEL_MODE_NEON),y)
23NEON_FLAGS := -ffreestanding
24ifeq ($(ARCH),arm)
25NEON_FLAGS += -mfloat-abi=softfp -mfpu=neon
26endif
27ifeq ($(ARCH),arm64)
28CFLAGS_REMOVE_neon1.o += -mgeneral-regs-only
29CFLAGS_REMOVE_neon2.o += -mgeneral-regs-only
30CFLAGS_REMOVE_neon4.o += -mgeneral-regs-only
31CFLAGS_REMOVE_neon8.o += -mgeneral-regs-only
32endif
33endif
34
19targets += int1.c 35targets += int1.c
20$(obj)/int1.c: UNROLL := 1 36$(obj)/int1.c: UNROLL := 1
21$(obj)/int1.c: $(src)/int.uc $(src)/unroll.awk FORCE 37$(obj)/int1.c: $(src)/int.uc $(src)/unroll.awk FORCE
@@ -70,6 +86,30 @@ $(obj)/altivec8.c: UNROLL := 8
70$(obj)/altivec8.c: $(src)/altivec.uc $(src)/unroll.awk FORCE 86$(obj)/altivec8.c: $(src)/altivec.uc $(src)/unroll.awk FORCE
71 $(call if_changed,unroll) 87 $(call if_changed,unroll)
72 88
89CFLAGS_neon1.o += $(NEON_FLAGS)
90targets += neon1.c
91$(obj)/neon1.c: UNROLL := 1
92$(obj)/neon1.c: $(src)/neon.uc $(src)/unroll.awk FORCE
93 $(call if_changed,unroll)
94
95CFLAGS_neon2.o += $(NEON_FLAGS)
96targets += neon2.c
97$(obj)/neon2.c: UNROLL := 2
98$(obj)/neon2.c: $(src)/neon.uc $(src)/unroll.awk FORCE
99 $(call if_changed,unroll)
100
101CFLAGS_neon4.o += $(NEON_FLAGS)
102targets += neon4.c
103$(obj)/neon4.c: UNROLL := 4
104$(obj)/neon4.c: $(src)/neon.uc $(src)/unroll.awk FORCE
105 $(call if_changed,unroll)
106
107CFLAGS_neon8.o += $(NEON_FLAGS)
108targets += neon8.c
109$(obj)/neon8.c: UNROLL := 8
110$(obj)/neon8.c: $(src)/neon.uc $(src)/unroll.awk FORCE
111 $(call if_changed,unroll)
112
73quiet_cmd_mktable = TABLE $@ 113quiet_cmd_mktable = TABLE $@
74 cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 ) 114 cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 )
75 115
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 6d7316fe9f30..74e6f5629dbc 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -70,6 +70,12 @@ const struct raid6_calls * const raid6_algos[] = {
70 &raid6_intx2, 70 &raid6_intx2,
71 &raid6_intx4, 71 &raid6_intx4,
72 &raid6_intx8, 72 &raid6_intx8,
73#ifdef CONFIG_KERNEL_MODE_NEON
74 &raid6_neonx1,
75 &raid6_neonx2,
76 &raid6_neonx4,
77 &raid6_neonx8,
78#endif
73 NULL 79 NULL
74}; 80};
75 81
diff --git a/lib/raid6/neon.c b/lib/raid6/neon.c
new file mode 100644
index 000000000000..36ad4705df1a
--- /dev/null
+++ b/lib/raid6/neon.c
@@ -0,0 +1,58 @@
1/*
2 * linux/lib/raid6/neon.c - RAID6 syndrome calculation using ARM NEON intrinsics
3 *
4 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/raid/pq.h>
12
13#ifdef __KERNEL__
14#include <asm/neon.h>
15#else
16#define kernel_neon_begin()
17#define kernel_neon_end()
18#define cpu_has_neon() (1)
19#endif
20
21/*
22 * There are 2 reasons these wrappers are kept in a separate compilation unit
23 * from the actual implementations in neonN.c (generated from neon.uc by
24 * unroll.awk):
25 * - the actual implementations use NEON intrinsics, and the GCC support header
26 * (arm_neon.h) is not fully compatible (type wise) with the kernel;
27 * - the neonN.c files are compiled with -mfpu=neon and optimization enabled,
28 * and we have to make sure that we never use *any* NEON/VFP instructions
29 * outside a kernel_neon_begin()/kernel_neon_end() pair.
30 */
31
32#define RAID6_NEON_WRAPPER(_n) \
33 static void raid6_neon ## _n ## _gen_syndrome(int disks, \
34 size_t bytes, void **ptrs) \
35 { \
36 void raid6_neon ## _n ## _gen_syndrome_real(int, \
37 unsigned long, void**); \
38 kernel_neon_begin(); \
39 raid6_neon ## _n ## _gen_syndrome_real(disks, \
40 (unsigned long)bytes, ptrs); \
41 kernel_neon_end(); \
42 } \
43 struct raid6_calls const raid6_neonx ## _n = { \
44 raid6_neon ## _n ## _gen_syndrome, \
45 raid6_have_neon, \
46 "neonx" #_n, \
47 0 \
48 }
49
50static int raid6_have_neon(void)
51{
52 return cpu_has_neon();
53}
54
55RAID6_NEON_WRAPPER(1);
56RAID6_NEON_WRAPPER(2);
57RAID6_NEON_WRAPPER(4);
58RAID6_NEON_WRAPPER(8);
diff --git a/lib/raid6/neon.uc b/lib/raid6/neon.uc
new file mode 100644
index 000000000000..1b9ed793342d
--- /dev/null
+++ b/lib/raid6/neon.uc
@@ -0,0 +1,80 @@
1/* -----------------------------------------------------------------------
2 *
3 * neon.uc - RAID-6 syndrome calculation using ARM NEON instructions
4 *
5 * Copyright (C) 2012 Rob Herring
6 *
7 * Based on altivec.uc:
8 * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
13 * Boston MA 02111-1307, USA; either version 2 of the License, or
14 * (at your option) any later version; incorporated herein by reference.
15 *
16 * ----------------------------------------------------------------------- */
17
18/*
19 * neon$#.c
20 *
21 * $#-way unrolled NEON intrinsics math RAID-6 instruction set
22 *
23 * This file is postprocessed using unroll.awk
24 */
25
26#include <arm_neon.h>
27
28typedef uint8x16_t unative_t;
29
30#define NBYTES(x) ((unative_t){x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x})
31#define NSIZE sizeof(unative_t)
32
33/*
34 * The SHLBYTE() operation shifts each byte left by 1, *not*
35 * rolling over into the next byte
36 */
37static inline unative_t SHLBYTE(unative_t v)
38{
39 return vshlq_n_u8(v, 1);
40}
41
42/*
43 * The MASK() operation returns 0xFF in any byte for which the high
44 * bit is 1, 0x00 for any byte for which the high bit is 0.
45 */
46static inline unative_t MASK(unative_t v)
47{
48 const uint8x16_t temp = NBYTES(0);
49 return (unative_t)vcltq_s8((int8x16_t)v, (int8x16_t)temp);
50}
51
52void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
53{
54 uint8_t **dptr = (uint8_t **)ptrs;
55 uint8_t *p, *q;
56 int d, z, z0;
57
58 register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
59 const unative_t x1d = NBYTES(0x1d);
60
61 z0 = disks - 3; /* Highest data disk */
62 p = dptr[z0+1]; /* XOR parity */
63 q = dptr[z0+2]; /* RS syndrome */
64
65 for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
66 wq$$ = wp$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
67 for ( z = z0-1 ; z >= 0 ; z-- ) {
68 wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
69 wp$$ = veorq_u8(wp$$, wd$$);
70 w2$$ = MASK(wq$$);
71 w1$$ = SHLBYTE(wq$$);
72
73 w2$$ = vandq_u8(w2$$, x1d);
74 w1$$ = veorq_u8(w1$$, w2$$);
75 wq$$ = veorq_u8(w1$$, wd$$);
76 }
77 vst1q_u8(&p[d+NSIZE*$$], wp$$);
78 vst1q_u8(&q[d+NSIZE*$$], wq$$);
79 }
80}
diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile
index 087332dbf8aa..28afa1a06e03 100644
--- a/lib/raid6/test/Makefile
+++ b/lib/raid6/test/Makefile
@@ -22,11 +22,23 @@ ifeq ($(ARCH),x86_64)
22 IS_X86 = yes 22 IS_X86 = yes
23endif 23endif
24 24
25ifeq ($(ARCH),arm)
26 CFLAGS += -I../../../arch/arm/include -mfpu=neon
27 HAS_NEON = yes
28endif
29ifeq ($(ARCH),arm64)
30 CFLAGS += -I../../../arch/arm64/include
31 HAS_NEON = yes
32endif
33
25ifeq ($(IS_X86),yes) 34ifeq ($(IS_X86),yes)
26 OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o 35 OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o
27 CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1" | \ 36 CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1" | \
28 gcc -c -x assembler - >&/dev/null && \ 37 gcc -c -x assembler - >&/dev/null && \
29 rm ./-.o && echo -DCONFIG_AS_AVX2=1) 38 rm ./-.o && echo -DCONFIG_AS_AVX2=1)
39else ifeq ($(HAS_NEON),yes)
40 OBJS += neon.o neon1.o neon2.o neon4.o neon8.o
41 CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1
30else 42else
31 HAS_ALTIVEC := $(shell echo -e '\#include <altivec.h>\nvector int a;' |\ 43 HAS_ALTIVEC := $(shell echo -e '\#include <altivec.h>\nvector int a;' |\
32 gcc -c -x c - >&/dev/null && \ 44 gcc -c -x c - >&/dev/null && \
@@ -55,6 +67,18 @@ raid6.a: $(OBJS)
55raid6test: test.c raid6.a 67raid6test: test.c raid6.a
56 $(CC) $(CFLAGS) -o raid6test $^ 68 $(CC) $(CFLAGS) -o raid6test $^
57 69
70neon1.c: neon.uc ../unroll.awk
71 $(AWK) ../unroll.awk -vN=1 < neon.uc > $@
72
73neon2.c: neon.uc ../unroll.awk
74 $(AWK) ../unroll.awk -vN=2 < neon.uc > $@
75
76neon4.c: neon.uc ../unroll.awk
77 $(AWK) ../unroll.awk -vN=4 < neon.uc > $@
78
79neon8.c: neon.uc ../unroll.awk
80 $(AWK) ../unroll.awk -vN=8 < neon.uc > $@
81
58altivec1.c: altivec.uc ../unroll.awk 82altivec1.c: altivec.uc ../unroll.awk
59 $(AWK) ../unroll.awk -vN=1 < altivec.uc > $@ 83 $(AWK) ../unroll.awk -vN=1 < altivec.uc > $@
60 84
@@ -89,7 +113,7 @@ tables.c: mktables
89 ./mktables > tables.c 113 ./mktables > tables.c
90 114
91clean: 115clean:
92 rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c tables.c raid6test 116 rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c neon*.c tables.c raid6test
93 117
94spotless: clean 118spotless: clean
95 rm -f *~ 119 rm -f *~