author		Anton Blanchard <anton@samba.org>	2013-10-14 06:03:58 -0400
committer	Benjamin Herrenschmidt <benh@kernel.crashing.org>	2013-10-30 01:02:28 -0400
commit		ef1313deafb7baa6d3382044e962d5ad5e8c8dd6 (patch)
tree		30584552d8b2907bb8360a7d4e1cab28e3215585 /arch/powerpc/lib
parent		07fb41a7525539d7ad37c25f2a2689fd95a6ab68 (diff)
powerpc: Add VMX optimised xor for RAID5
Add a VMX optimised xor, used primarily for RAID5. On a POWER7 blade
this is a decent win:
32regs : 17932.800 MB/sec
altivec : 19724.800 MB/sec
The bigger gain comes when the same test is run in SMT4 mode, as it
would be if there were a lot of work going on:
8regs : 8377.600 MB/sec
altivec : 15801.600 MB/sec
I tested this against an array created without the patch, and also
verified it worked as expected on a little endian kernel.
[ Fix !CONFIG_ALTIVEC build -- BenH ]
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc/lib')
-rw-r--r--	arch/powerpc/lib/Makefile	|   3
-rw-r--r--	arch/powerpc/lib/xor_vmx.c	| 177
2 files changed, 180 insertions, 0 deletions
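For context: the "8regs", "32regs" and "altivec" names in the benchmark above are the names of xor_block_template entries that the kernel's generic xor_blocks() code benchmarks at boot, picking the fastest one for RAID5 parity work. The actual registration of the new routines lives in the powerpc asm/xor.h header, which falls outside this diffstat's 'arch/powerpc/lib' filter; the following is only a minimal sketch of what that hookup would look like, assuming the prototypes exported by xor_vmx.c below:

#include <linux/raid/xor.h>	/* struct xor_block_template */

/* Prototypes of the routines added by this patch (see xor_vmx.c below). */
void xor_altivec_2(unsigned long bytes, unsigned long *v1_in,
		   unsigned long *v2_in);
void xor_altivec_3(unsigned long bytes, unsigned long *v1_in,
		   unsigned long *v2_in, unsigned long *v3_in);
void xor_altivec_4(unsigned long bytes, unsigned long *v1_in,
		   unsigned long *v2_in, unsigned long *v3_in,
		   unsigned long *v4_in);
void xor_altivec_5(unsigned long bytes, unsigned long *v1_in,
		   unsigned long *v2_in, unsigned long *v3_in,
		   unsigned long *v4_in, unsigned long *v5_in);

/*
 * Sketch only (not part of this diff): registering the VMX routines as an
 * xor template lets the boot-time benchmark compare "altivec" against the
 * generic "8regs"/"32regs" implementations and keep the fastest.
 */
static struct xor_block_template xor_block_altivec = {
	.name	= "altivec",
	.do_2	= xor_altivec_2,
	.do_3	= xor_altivec_3,
	.do_4	= xor_altivec_4,
	.do_5	= xor_altivec_5,
};

When the CPU has Altivec, the arch's XOR_TRY_TEMPLATES would presumably add this entry to the candidates that xor_blocks() later dispatches to.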
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 5310132856c1..95a20e17dbff 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -39,3 +39,6 @@ obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o
 obj-y			+= code-patching.o
 obj-y			+= feature-fixups.o
 obj-$(CONFIG_FTR_FIXUP_SELFTEST) += feature-fixups-test.o
+
+obj-$(CONFIG_ALTIVEC)	+= xor_vmx.o
+CFLAGS_xor_vmx.o += -maltivec -mabi=altivec
diff --git a/arch/powerpc/lib/xor_vmx.c b/arch/powerpc/lib/xor_vmx.c
new file mode 100644
index 000000000000..e905f7c2ea7b
--- /dev/null
+++ b/arch/powerpc/lib/xor_vmx.c
@@ -0,0 +1,177 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2012
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <altivec.h>
+
+#include <linux/preempt.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <asm/switch_to.h>
+
+typedef vector signed char unative_t;
+
+#define DEFINE(V)				\
+	unative_t *V = (unative_t *)V##_in;	\
+	unative_t V##_0, V##_1, V##_2, V##_3
+
+#define LOAD(V)			\
+	do {			\
+		V##_0 = V[0];	\
+		V##_1 = V[1];	\
+		V##_2 = V[2];	\
+		V##_3 = V[3];	\
+	} while (0)
+
+#define STORE(V)		\
+	do {			\
+		V[0] = V##_0;	\
+		V[1] = V##_1;	\
+		V[2] = V##_2;	\
+		V[3] = V##_3;	\
+	} while (0)
+
+#define XOR(V1, V2)					\
+	do {						\
+		V1##_0 = vec_xor(V1##_0, V2##_0);	\
+		V1##_1 = vec_xor(V1##_1, V2##_1);	\
+		V1##_2 = vec_xor(V1##_2, V2##_2);	\
+		V1##_3 = vec_xor(V1##_3, V2##_3);	\
+	} while (0)
+
+void xor_altivec_2(unsigned long bytes, unsigned long *v1_in,
+		   unsigned long *v2_in)
+{
+	DEFINE(v1);
+	DEFINE(v2);
+	unsigned long lines = bytes / (sizeof(unative_t)) / 4;
+
+	preempt_disable();
+	enable_kernel_altivec();
+
+	do {
+		LOAD(v1);
+		LOAD(v2);
+		XOR(v1, v2);
+		STORE(v1);
+
+		v1 += 4;
+		v2 += 4;
+	} while (--lines > 0);
+
+	preempt_enable();
+}
+EXPORT_SYMBOL(xor_altivec_2);
+
+void xor_altivec_3(unsigned long bytes, unsigned long *v1_in,
+		   unsigned long *v2_in, unsigned long *v3_in)
+{
+	DEFINE(v1);
+	DEFINE(v2);
+	DEFINE(v3);
+	unsigned long lines = bytes / (sizeof(unative_t)) / 4;
+
+	preempt_disable();
+	enable_kernel_altivec();
+
+	do {
+		LOAD(v1);
+		LOAD(v2);
+		LOAD(v3);
+		XOR(v1, v2);
+		XOR(v1, v3);
+		STORE(v1);
+
+		v1 += 4;
+		v2 += 4;
+		v3 += 4;
+	} while (--lines > 0);
+
+	preempt_enable();
+}
+EXPORT_SYMBOL(xor_altivec_3);
+
+void xor_altivec_4(unsigned long bytes, unsigned long *v1_in,
+		   unsigned long *v2_in, unsigned long *v3_in,
+		   unsigned long *v4_in)
+{
+	DEFINE(v1);
+	DEFINE(v2);
+	DEFINE(v3);
+	DEFINE(v4);
+	unsigned long lines = bytes / (sizeof(unative_t)) / 4;
+
+	preempt_disable();
+	enable_kernel_altivec();
+
+	do {
+		LOAD(v1);
+		LOAD(v2);
+		LOAD(v3);
+		LOAD(v4);
+		XOR(v1, v2);
+		XOR(v3, v4);
+		XOR(v1, v3);
+		STORE(v1);
+
+		v1 += 4;
+		v2 += 4;
+		v3 += 4;
+		v4 += 4;
+	} while (--lines > 0);
+
+	preempt_enable();
+}
+EXPORT_SYMBOL(xor_altivec_4);
+
+void xor_altivec_5(unsigned long bytes, unsigned long *v1_in,
+		   unsigned long *v2_in, unsigned long *v3_in,
+		   unsigned long *v4_in, unsigned long *v5_in)
+{
+	DEFINE(v1);
+	DEFINE(v2);
+	DEFINE(v3);
+	DEFINE(v4);
+	DEFINE(v5);
+	unsigned long lines = bytes / (sizeof(unative_t)) / 4;
+
+	preempt_disable();
+	enable_kernel_altivec();
+
+	do {
+		LOAD(v1);
+		LOAD(v2);
+		LOAD(v3);
+		LOAD(v4);
+		LOAD(v5);
+		XOR(v1, v2);
+		XOR(v3, v4);
+		XOR(v1, v5);
+		XOR(v1, v3);
+		STORE(v1);
+
+		v1 += 4;
+		v2 += 4;
+		v3 += 4;
+		v4 += 4;
+		v5 += 4;
+	} while (--lines > 0);
+
+	preempt_enable();
+}
+EXPORT_SYMBOL(xor_altivec_5);
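Because the implementation is built almost entirely from token-pasting macros, the generated code can be hard to see at a glance. The following standalone sketch (illustration only, not part of the patch, with the kernel preempt/Altivec-state calls omitted) shows roughly what xor_altivec_2() looks like once DEFINE/LOAD/XOR/STORE are expanded; each loop iteration XORs 4 x 16-byte vectors, i.e. 64 bytes, from each buffer:

#include <altivec.h>

typedef vector signed char unative_t;

/* Expanded form of xor_altivec_2(), for illustration only. */
void xor_2_expanded(unsigned long bytes, unsigned long *v1_in,
		    unsigned long *v2_in)
{
	unative_t *v1 = (unative_t *)v1_in;	/* DEFINE(v1) */
	unative_t *v2 = (unative_t *)v2_in;	/* DEFINE(v2) */
	unsigned long lines = bytes / sizeof(unative_t) / 4;

	do {
		/* LOAD(v1), LOAD(v2): read 4 vectors (64 bytes) from each */
		unative_t v1_0 = v1[0], v1_1 = v1[1], v1_2 = v1[2], v1_3 = v1[3];
		unative_t v2_0 = v2[0], v2_1 = v2[1], v2_2 = v2[2], v2_3 = v2[3];

		/* XOR(v1, v2): four independent vec_xor operations */
		v1_0 = vec_xor(v1_0, v2_0);
		v1_1 = vec_xor(v1_1, v2_1);
		v1_2 = vec_xor(v1_2, v2_2);
		v1_3 = vec_xor(v1_3, v2_3);

		/* STORE(v1): write the result back over the first buffer */
		v1[0] = v1_0;
		v1[1] = v1_1;
		v1[2] = v1_2;
		v1[3] = v1_3;

		v1 += 4;
		v2 += 4;
	} while (--lines > 0);
}

In the real routines, enable_kernel_altivec() makes the VMX unit usable from kernel context, and the surrounding preempt_disable()/preempt_enable() pair keeps the kernel's vector state from being lost to a context switch mid-loop.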