aboutsummaryrefslogtreecommitdiffstats
path: root/arch/arm/include
diff options
context:
space:
mode:
authorRussell King <rmk+kernel@arm.linux.org.uk>2013-07-22 12:26:27 -0400
committerRussell King <rmk+kernel@arm.linux.org.uk>2013-07-22 12:46:40 -0400
commitb4f656eea63376da79b0b5a17660c4ce14b71b74 (patch)
tree36731cf326eca19cd2164f886b3eaa31449fe8fa /arch/arm/include
parent3b2f64d00c46e1e4e9bd0bb9bb12619adac27a4b (diff)
parent7d11965ddb9b9b1e0a5d13c58345ada1ccbc663b (diff)
Pull branch 'for-rmk' of git://git.linaro.org/people/ardbiesheuvel/linux-arm into devel-stable
Comments from Ard Biesheuvel: I have included two use cases that I have been using, XOR and RAID-6 checksumming. The former gets a 60% performance boost on the NEON, the latter over 400%. ARM: add support for kernel mode NEON Adds kernel_neon_begin/end (renamed from kernel_vfp_begin/end in the previous version to de-emphasize the VFP part as VFP code that needs software assistance is not supported currently.) Introduces <asm/neon.h> and the Kconfig symbol KERNEL_MODE_NEON. This has been aligned with Catalin for arm64, so any NEON code that does not use assembly but intrinsics or the GCC vectorizer (such as my examples) can potentially be shared between arm and arm64 archs. ARM: move VFP init to an earlier boot stage This is needed so the NEON is enabled when the XOR and RAID-6 algo boot time benchmarks are run. ARM: be strict about FP exceptions in kernel mode This adds a check to vfp_support_entry() to flag unsupported uses of the NEON/VFP in kernel mode. FP exceptions (bounces) are flagged as a bug, this is because of their potentially intermittent nature. Exceptions caused by the fact that kernel_neon_begin has not been called are just routed through the undef handler. ARM: crypto: add NEON accelerated XOR implementation This is the xor_blocks() implementation built with -ftree-vectorize, 60% faster than optimized ARM code. It calls in_interrupt() to check whether the NEON flavor can be used: this should really not be necessary, but due to xor_blocks's quite generic nature, there is no telling how exactly people may be using it in the real world. lib/raid6: add ARM-NEON accelerated syndrome calculation This is a port of the RAID-6 checksumming code in altivec.uc, adapted to use NEON intrinsics. It is about 4x faster than the sequential code.
Diffstat (limited to 'arch/arm/include')
-rw-r--r--arch/arm/include/asm/neon.h36
-rw-r--r--arch/arm/include/asm/xor.h73
2 files changed, 109 insertions, 0 deletions
diff --git a/arch/arm/include/asm/neon.h b/arch/arm/include/asm/neon.h
new file mode 100644
index 000000000000..8f730fe70093
--- /dev/null
+++ b/arch/arm/include/asm/neon.h
@@ -0,0 +1,36 @@
1/*
2 * linux/arch/arm/include/asm/neon.h
3 *
4 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <asm/hwcap.h>
12
/*
 * cpu_has_neon() - test the HWCAP_NEON bit in elf_hwcap.
 *
 * The double negation normalizes the masked value so the macro evaluates
 * to exactly 0 or 1.
 */
13#define cpu_has_neon() (!!(elf_hwcap & HWCAP_NEON))
14
15#ifdef __ARM_NEON__
16
17/*
18 * If you are affected by the BUILD_BUG below, it probably means that you are
19 * using NEON code /and/ calling the kernel_neon_begin() function from the same
20 * compilation unit. To prevent issues that may arise from GCC reordering or
21 * generating(1) NEON instructions outside of these begin/end functions, the
22 * only supported way of using NEON code in the kernel is by isolating it in a
23 * separate compilation unit, and calling it from another unit from inside a
24 * kernel_neon_begin/kernel_neon_end pair.
25 *
26 * (1) Current GCC (4.7) might generate NEON instructions at O3 level if
27 * -mfpu=neon is set.
28 */
29
30#define kernel_neon_begin() \
31 BUILD_BUG_ON_MSG(1, "kernel_neon_begin() called from NEON code")
32
33#else
/* Unit is not compiled as NEON code: kernel_neon_begin() is a real function. */
34void kernel_neon_begin(void);
35#endif
/* Declared regardless of __ARM_NEON__; second half of the begin/end pair. */
36void kernel_neon_end(void);
diff --git a/arch/arm/include/asm/xor.h b/arch/arm/include/asm/xor.h
index 7604673dc427..4ffb26d4cad8 100644
--- a/arch/arm/include/asm/xor.h
+++ b/arch/arm/include/asm/xor.h
@@ -7,7 +7,10 @@
7 * it under the terms of the GNU General Public License version 2 as 7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation. 8 * published by the Free Software Foundation.
9 */ 9 */
10#include <linux/hardirq.h>
10#include <asm-generic/xor.h> 11#include <asm-generic/xor.h>
12#include <asm/hwcap.h>
13#include <asm/neon.h>
11 14
12#define __XOR(a1, a2) a1 ^= a2 15#define __XOR(a1, a2) a1 ^= a2
13 16
@@ -138,4 +141,74 @@ static struct xor_block_template xor_block_arm4regs = {
138 xor_speed(&xor_block_arm4regs); \ 141 xor_speed(&xor_block_arm4regs); \
139 xor_speed(&xor_block_8regs); \ 142 xor_speed(&xor_block_8regs); \
140 xor_speed(&xor_block_32regs); \ 143 xor_speed(&xor_block_32regs); \
144 NEON_TEMPLATES; \
141 } while (0) 145 } while (0)
146
147#ifdef CONFIG_KERNEL_MODE_NEON
148
/*
 * Template whose do_2..do_5 members are invoked by the xor_neon_*() wrappers
 * below. NOTE(review): presumably defined in a separately compiled NEON unit,
 * as <asm/neon.h> requires for NEON code -- confirm against the definition.
 */
149extern struct xor_block_template const xor_block_neon_inner;
150
/*
 * xor_neon_2() - two-buffer XOR entry point for the "neon" template.
 *
 * If in_interrupt() is true the NEON path is skipped and the arguments are
 * passed unchanged to xor_arm4regs_2(). Otherwise xor_block_neon_inner.do_2()
 * is called between kernel_neon_begin() and kernel_neon_end().
 */
151static void
152xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
153{
154 if (in_interrupt()) {
155 xor_arm4regs_2(bytes, p1, p2);
156 } else {
157 kernel_neon_begin();
158 xor_block_neon_inner.do_2(bytes, p1, p2);
159 kernel_neon_end();
160 }
161}
162
/*
 * xor_neon_3() - three-buffer XOR entry point for the "neon" template.
 *
 * If in_interrupt() is true the NEON path is skipped and the arguments are
 * passed unchanged to xor_arm4regs_3(). Otherwise xor_block_neon_inner.do_3()
 * is called between kernel_neon_begin() and kernel_neon_end().
 */
163static void
164xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
165 unsigned long *p3)
166{
167 if (in_interrupt()) {
168 xor_arm4regs_3(bytes, p1, p2, p3);
169 } else {
170 kernel_neon_begin();
171 xor_block_neon_inner.do_3(bytes, p1, p2, p3);
172 kernel_neon_end();
173 }
174}
175
/*
 * xor_neon_4() - four-buffer XOR entry point for the "neon" template.
 *
 * If in_interrupt() is true the NEON path is skipped and the arguments are
 * passed unchanged to xor_arm4regs_4(). Otherwise xor_block_neon_inner.do_4()
 * is called between kernel_neon_begin() and kernel_neon_end().
 */
176static void
177xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
178 unsigned long *p3, unsigned long *p4)
179{
180 if (in_interrupt()) {
181 xor_arm4regs_4(bytes, p1, p2, p3, p4);
182 } else {
183 kernel_neon_begin();
184 xor_block_neon_inner.do_4(bytes, p1, p2, p3, p4);
185 kernel_neon_end();
186 }
187}
188
/*
 * xor_neon_5() - five-buffer XOR entry point for the "neon" template.
 *
 * If in_interrupt() is true the NEON path is skipped and the arguments are
 * passed unchanged to xor_arm4regs_5(). Otherwise xor_block_neon_inner.do_5()
 * is called between kernel_neon_begin() and kernel_neon_end().
 */
189static void
190xor_neon_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
191 unsigned long *p3, unsigned long *p4, unsigned long *p5)
192{
193 if (in_interrupt()) {
194 xor_arm4regs_5(bytes, p1, p2, p3, p4, p5);
195 } else {
196 kernel_neon_begin();
197 xor_block_neon_inner.do_5(bytes, p1, p2, p3, p4, p5);
198 kernel_neon_end();
199 }
200}
201
/*
 * Template named "neon" that routes the 2- to 5-buffer XOR operations through
 * the xor_neon_*() wrappers above, which choose between the NEON and the
 * arm4regs implementations on every call.
 */
202static struct xor_block_template xor_block_neon = {
203 .name = "neon",
204 .do_2 = xor_neon_2,
205 .do_3 = xor_neon_3,
206 .do_4 = xor_neon_4,
207 .do_5 = xor_neon_5
208};
209
/*
 * With CONFIG_KERNEL_MODE_NEON: pass the "neon" template to xor_speed(), but
 * only when cpu_has_neon() reports NEON at run time.
 */
210#define NEON_TEMPLATES \
211 do { if (cpu_has_neon()) xor_speed(&xor_block_neon); } while (0)
212#else
/* Without CONFIG_KERNEL_MODE_NEON the macro expands to nothing. */
213#define NEON_TEMPLATES
214#endif