author    Jim Kukunas <james.t.kukunas@linux.intel.com>	2012-11-08 16:47:44 -0500
committer NeilBrown <neilb@suse.de>	2012-12-13 00:42:01 -0500
commit    7056741fd9fc14a65608549a4657cf5178f05f63
tree      b30504208f8261c4a0a2625169eaff9aa9de544e
parent    54f89341e8b8da0cdac8a7b873491739de19f098
lib/raid6: Add AVX2 optimized recovery functions

Optimize the RAID6 recovery functions to take advantage of the 256-bit
YMM integer instructions introduced in AVX2. The patch was tested and
benchmarked before submission; however, the hardware is not yet
released, so benchmark numbers cannot be reported.

Acked-by: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
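For reference, the per-byte math that these AVX2 routines vectorize is the
kernel's generic two-failure recovery. The sketch below follows the
raid6_vgfmul[] nibble-table layout (entries 0..15 give the product for the
low nibble, entries 16..31 for the high nibble); the AVX2 code broadcasts
each 16-byte half with vbroadcasti128 and applies it 32 bytes at a time
with vpshufb. A minimal sketch, not the exact kernel code:

	typedef unsigned char u8;

	/* Multiply x by a constant via its 32-byte nibble table. */
	static inline u8 vgf_mul(const u8 tbl[32], u8 x)
	{
		return tbl[x & 0x0f] ^ tbl[16 + (x >> 4)];
	}

	/* p/q: original P/Q; dp/dq: P'/Q' recomputed with the two failed
	 * blocks zeroed; pbmul/qmul: tables picked exactly as in the patch. */
	static void raid6_2data_recov_ref(unsigned long bytes, u8 *p, u8 *q,
					  u8 *dp, u8 *dq,
					  const u8 *pbmul, const u8 *qmul)
	{
		while (bytes--) {
			u8 px = *p++ ^ *dp;			/* P ^ P' */
			u8 qx = vgf_mul(qmul, *q++ ^ *dq);	/* qmul[Q ^ Q'] */
			u8 db = vgf_mul(pbmul, px) ^ qx;	/* reconstructed B */
			*dq++ = db;
			*dp++ = db ^ px;			/* reconstructed A */
		}
	}

The raid6_datap_recov_avx2() case is the degenerate form of the same loop:
dq alone is rebuilt as qmul[Q ^ Q'], and P is then re-xored with it.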
Diffstat (limited to 'lib')
-rw-r--r--	lib/raid6/Makefile       |   2 +-
-rw-r--r--	lib/raid6/algos.c        |   3 +++
-rw-r--r--	lib/raid6/recov_avx2.c   | 327 ++++++++++++++++++++++++++++++++
-rw-r--r--	lib/raid6/test/Makefile  |   2 +-
-rw-r--r--	lib/raid6/x86.h          |  14 +++++++---
 5 files changed, 341 insertions(+), 7 deletions(-)
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index de06dfe165b8..8c2e22bef661 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -1,6 +1,6 @@
 obj-$(CONFIG_RAID6_PQ)	+= raid6_pq.o
 
-raid6_pq-y	+= algos.o recov.o recov_ssse3.o tables.o int1.o int2.o int4.o \
+raid6_pq-y	+= algos.o recov.o recov_ssse3.o recov_avx2.o tables.o int1.o int2.o int4.o \
 		   int8.o int16.o int32.o altivec1.o altivec2.o altivec4.o \
 		   altivec8.o mmx.o sse1.o sse2.o
 hostprogs-y	+= mktables
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 589f5f50ad2e..8b7f55cadb45 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -72,6 +72,9 @@ EXPORT_SYMBOL_GPL(raid6_datap_recov);
 
 const struct raid6_recov_calls *const raid6_recov_algos[] = {
 #if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
+#ifdef CONFIG_AS_AVX2
+	&raid6_recov_avx2,
+#endif
 	&raid6_recov_ssse3,
 #endif
 	&raid6_recov_intx1,
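Note: the new entry registers with .priority = 2, above recov_ssse3's
priority, and the kernel picks the best valid algorithm from this
NULL-terminated table at init time. Roughly, as a paraphrased sketch
(not the exact kernel code):

	static const struct raid6_recov_calls *choose_recov(void)
	{
		const struct raid6_recov_calls *const *algo;
		const struct raid6_recov_calls *best = NULL;

		/* Keep the highest-priority algorithm whose ->valid()
		 * check passes on this CPU. */
		for (algo = raid6_recov_algos; *algo; algo++)
			if ((!best || (*algo)->priority > best->priority) &&
			    (!(*algo)->valid || (*algo)->valid()))
				best = *algo;
		return best;	/* avx2 (2) is preferred over ssse3 */
	}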
diff --git a/lib/raid6/recov_avx2.c b/lib/raid6/recov_avx2.c
new file mode 100644
index 000000000000..43a9bab91879
--- /dev/null
+++ b/lib/raid6/recov_avx2.c
@@ -0,0 +1,327 @@
+/*
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
+
+#if CONFIG_AS_AVX2
+
+#include <linux/raid/pq.h>
+#include "x86.h"
+
+static int raid6_has_avx2(void)
+{
+	return boot_cpu_has(X86_FEATURE_AVX2) &&
+		boot_cpu_has(X86_FEATURE_AVX);
+}
+
+static void raid6_2data_recov_avx2(int disks, size_t bytes, int faila,
+		int failb, void **ptrs)
+{
+	u8 *p, *q, *dp, *dq;
+	const u8 *pbmul;	/* P multiplier table for B data */
+	const u8 *qmul;		/* Q multiplier table (for both) */
+	const u8 x0f = 0x0f;
+
+	p = (u8 *)ptrs[disks-2];
+	q = (u8 *)ptrs[disks-1];
+
+	/* Compute syndrome with zero for the missing data pages
+	   Use the dead data pages as temporary storage for
+	   delta p and delta q */
+	dp = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks-2] = dp;
+	dq = (u8 *)ptrs[failb];
+	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[disks-1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila] = dp;
+	ptrs[failb] = dq;
+	ptrs[disks-2] = p;
+	ptrs[disks-1] = q;
+
+	/* Now, pick the proper data tables */
+	pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
+	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
+		raid6_gfexp[failb]]];
+
+	kernel_fpu_begin();
+
+	/* ymm7 = 0x0f repeated in all 32 bytes */
+	asm volatile("vpbroadcastb %0, %%ymm7" : : "m" (x0f));
+
+	while (bytes) {
+#ifdef CONFIG_X86_64
+		asm volatile("vmovdqa %0, %%ymm1" : : "m" (q[0]));
+		asm volatile("vmovdqa %0, %%ymm9" : : "m" (q[32]));
+		asm volatile("vmovdqa %0, %%ymm0" : : "m" (p[0]));
+		asm volatile("vmovdqa %0, %%ymm8" : : "m" (p[32]));
+		asm volatile("vpxor %0, %%ymm1, %%ymm1" : : "m" (dq[0]));
+		asm volatile("vpxor %0, %%ymm9, %%ymm9" : : "m" (dq[32]));
+		asm volatile("vpxor %0, %%ymm0, %%ymm0" : : "m" (dp[0]));
+		asm volatile("vpxor %0, %%ymm8, %%ymm8" : : "m" (dp[32]));
+
+		/*
+		 * 1 = dq[0]  ^ q[0]
+		 * 9 = dq[32] ^ q[32]
+		 * 0 = dp[0]  ^ p[0]
+		 * 8 = dp[32] ^ p[32]
+		 */
+
+		asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (qmul[0]));
+		asm volatile("vbroadcasti128 %0, %%ymm5" : : "m" (qmul[16]));
+
+		asm volatile("vpsraw $4, %ymm1, %ymm3");
+		asm volatile("vpsraw $4, %ymm9, %ymm12");
+		asm volatile("vpand %ymm7, %ymm1, %ymm1");
+		asm volatile("vpand %ymm7, %ymm9, %ymm9");
+		asm volatile("vpand %ymm7, %ymm3, %ymm3");
+		asm volatile("vpand %ymm7, %ymm12, %ymm12");
+		asm volatile("vpshufb %ymm9, %ymm4, %ymm14");
+		asm volatile("vpshufb %ymm1, %ymm4, %ymm4");
+		asm volatile("vpshufb %ymm12, %ymm5, %ymm15");
+		asm volatile("vpshufb %ymm3, %ymm5, %ymm5");
+		asm volatile("vpxor %ymm14, %ymm15, %ymm15");
+		asm volatile("vpxor %ymm4, %ymm5, %ymm5");
+
+		/*
+		 * 5 = qx[0]
+		 * 15 = qx[32]
+		 */
+
+		asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (pbmul[0]));
+		asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (pbmul[16]));
+		asm volatile("vpsraw $4, %ymm0, %ymm2");
+		asm volatile("vpsraw $4, %ymm8, %ymm6");
+		asm volatile("vpand %ymm7, %ymm0, %ymm3");
+		asm volatile("vpand %ymm7, %ymm8, %ymm14");
+		asm volatile("vpand %ymm7, %ymm2, %ymm2");
+		asm volatile("vpand %ymm7, %ymm6, %ymm6");
+		asm volatile("vpshufb %ymm14, %ymm4, %ymm12");
+		asm volatile("vpshufb %ymm3, %ymm4, %ymm4");
+		asm volatile("vpshufb %ymm6, %ymm1, %ymm13");
+		asm volatile("vpshufb %ymm2, %ymm1, %ymm1");
+		asm volatile("vpxor %ymm4, %ymm1, %ymm1");
+		asm volatile("vpxor %ymm12, %ymm13, %ymm13");
+
+		/*
+		 * 1 = pbmul[px[0]]
+		 * 13 = pbmul[px[32]]
+		 */
+		asm volatile("vpxor %ymm5, %ymm1, %ymm1");
+		asm volatile("vpxor %ymm15, %ymm13, %ymm13");
+
+		/*
+		 * 1 = db = DQ
+		 * 13 = db[32] = DQ[32]
+		 */
+		asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0]));
+		asm volatile("vmovdqa %%ymm13, %0" : "=m" (dq[32]));
+		asm volatile("vpxor %ymm1, %ymm0, %ymm0");
+		asm volatile("vpxor %ymm13, %ymm8, %ymm8");
+
+		asm volatile("vmovdqa %%ymm0, %0" : "=m" (dp[0]));
+		asm volatile("vmovdqa %%ymm8, %0" : "=m" (dp[32]));
+
+		bytes -= 64;
+		p += 64;
+		q += 64;
+		dp += 64;
+		dq += 64;
+#else
+		asm volatile("vmovdqa %0, %%ymm1" : : "m" (*q));
+		asm volatile("vmovdqa %0, %%ymm0" : : "m" (*p));
+		asm volatile("vpxor %0, %%ymm1, %%ymm1" : : "m" (*dq));
+		asm volatile("vpxor %0, %%ymm0, %%ymm0" : : "m" (*dp));
+
+		/* 1 = dq ^ q;  0 = dp ^ p */
+
+		asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (qmul[0]));
+		asm volatile("vbroadcasti128 %0, %%ymm5" : : "m" (qmul[16]));
+
+		/*
+		 * 1 = dq ^ q
+		 * 3 = (dq ^ q) >> 4
+		 */
+		asm volatile("vpsraw $4, %ymm1, %ymm3");
+		asm volatile("vpand %ymm7, %ymm1, %ymm1");
+		asm volatile("vpand %ymm7, %ymm3, %ymm3");
+		asm volatile("vpshufb %ymm1, %ymm4, %ymm4");
+		asm volatile("vpshufb %ymm3, %ymm5, %ymm5");
+		asm volatile("vpxor %ymm4, %ymm5, %ymm5");
+
+		/* 5 = qx */
+
+		asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (pbmul[0]));
+		asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (pbmul[16]));
+
+		asm volatile("vpsraw $4, %ymm0, %ymm2");
+		asm volatile("vpand %ymm7, %ymm0, %ymm3");
+		asm volatile("vpand %ymm7, %ymm2, %ymm2");
+		asm volatile("vpshufb %ymm3, %ymm4, %ymm4");
+		asm volatile("vpshufb %ymm2, %ymm1, %ymm1");
+		asm volatile("vpxor %ymm4, %ymm1, %ymm1");
+
+		/* 1 = pbmul[px] */
+		asm volatile("vpxor %ymm5, %ymm1, %ymm1");
+		/* 1 = db = DQ */
+		asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0]));
+
+		asm volatile("vpxor %ymm1, %ymm0, %ymm0");
+		asm volatile("vmovdqa %%ymm0, %0" : "=m" (dp[0]));
+
+		bytes -= 32;
+		p += 32;
+		q += 32;
+		dp += 32;
+		dq += 32;
+#endif
+	}
+
+	kernel_fpu_end();
+}
+
+static void raid6_datap_recov_avx2(int disks, size_t bytes, int faila,
+		void **ptrs)
+{
+	u8 *p, *q, *dq;
+	const u8 *qmul;		/* Q multiplier table */
+	const u8 x0f = 0x0f;
+
+	p = (u8 *)ptrs[disks-2];
+	q = (u8 *)ptrs[disks-1];
+
+	/* Compute syndrome with zero for the missing data page
+	   Use the dead data page as temporary storage for delta q */
+	dq = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks-1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila] = dq;
+	ptrs[disks-1] = q;
+
+	/* Now, pick the proper data tables */
+	qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+	kernel_fpu_begin();
+
+	asm volatile("vpbroadcastb %0, %%ymm7" : : "m" (x0f));
+
+	while (bytes) {
+#ifdef CONFIG_X86_64
+		asm volatile("vmovdqa %0, %%ymm3" : : "m" (dq[0]));
+		asm volatile("vmovdqa %0, %%ymm8" : : "m" (dq[32]));
+		asm volatile("vpxor %0, %%ymm3, %%ymm3" : : "m" (q[0]));
+		asm volatile("vpxor %0, %%ymm8, %%ymm8" : : "m" (q[32]));
+
+		/*
+		 * 3 = q[0]  ^ dq[0]
+		 * 8 = q[32] ^ dq[32]
+		 */
+		asm volatile("vbroadcasti128 %0, %%ymm0" : : "m" (qmul[0]));
+		asm volatile("vmovapd %ymm0, %ymm13");
+		asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (qmul[16]));
+		asm volatile("vmovapd %ymm1, %ymm14");
+
+		asm volatile("vpsraw $4, %ymm3, %ymm6");
+		asm volatile("vpsraw $4, %ymm8, %ymm12");
+		asm volatile("vpand %ymm7, %ymm3, %ymm3");
+		asm volatile("vpand %ymm7, %ymm8, %ymm8");
+		asm volatile("vpand %ymm7, %ymm6, %ymm6");
+		asm volatile("vpand %ymm7, %ymm12, %ymm12");
+		asm volatile("vpshufb %ymm3, %ymm0, %ymm0");
+		asm volatile("vpshufb %ymm8, %ymm13, %ymm13");
+		asm volatile("vpshufb %ymm6, %ymm1, %ymm1");
+		asm volatile("vpshufb %ymm12, %ymm14, %ymm14");
+		asm volatile("vpxor %ymm0, %ymm1, %ymm1");
+		asm volatile("vpxor %ymm13, %ymm14, %ymm14");
+
+		/*
+		 * 1 = qmul[q[0]  ^ dq[0]]
+		 * 14 = qmul[q[32] ^ dq[32]]
+		 */
+		asm volatile("vmovdqa %0, %%ymm2" : : "m" (p[0]));
+		asm volatile("vmovdqa %0, %%ymm12" : : "m" (p[32]));
+		asm volatile("vpxor %ymm1, %ymm2, %ymm2");
+		asm volatile("vpxor %ymm14, %ymm12, %ymm12");
+
+		/*
+		 * 2 = p[0]  ^ qmul[q[0]  ^ dq[0]]
+		 * 12 = p[32] ^ qmul[q[32] ^ dq[32]]
+		 */
+
+		asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0]));
+		asm volatile("vmovdqa %%ymm14, %0" : "=m" (dq[32]));
+		asm volatile("vmovdqa %%ymm2, %0" : "=m" (p[0]));
+		asm volatile("vmovdqa %%ymm12, %0" : "=m" (p[32]));
+
+		bytes -= 64;
+		p += 64;
+		q += 64;
+		dq += 64;
+#else
+		asm volatile("vmovdqa %0, %%ymm3" : : "m" (dq[0]));
+		asm volatile("vpxor %0, %%ymm3, %%ymm3" : : "m" (q[0]));
+
+		/* 3 = q ^ dq */
+
+		asm volatile("vbroadcasti128 %0, %%ymm0" : : "m" (qmul[0]));
+		asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (qmul[16]));
+
+		asm volatile("vpsraw $4, %ymm3, %ymm6");
+		asm volatile("vpand %ymm7, %ymm3, %ymm3");
+		asm volatile("vpand %ymm7, %ymm6, %ymm6");
+		asm volatile("vpshufb %ymm3, %ymm0, %ymm0");
+		asm volatile("vpshufb %ymm6, %ymm1, %ymm1");
+		asm volatile("vpxor %ymm0, %ymm1, %ymm1");
+
+		/* 1 = qmul[q ^ dq] */
+
+		asm volatile("vmovdqa %0, %%ymm2" : : "m" (p[0]));
+		asm volatile("vpxor %ymm1, %ymm2, %ymm2");
+
+		/* 2 = p ^ qmul[q ^ dq] */
+
+		asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0]));
+		asm volatile("vmovdqa %%ymm2, %0" : "=m" (p[0]));
+
+		bytes -= 32;
+		p += 32;
+		q += 32;
+		dq += 32;
+#endif
+	}
+
+	kernel_fpu_end();
+}
+
+const struct raid6_recov_calls raid6_recov_avx2 = {
+	.data2 = raid6_2data_recov_avx2,
+	.datap = raid6_datap_recov_avx2,
+	.valid = raid6_has_avx2,
+#ifdef CONFIG_X86_64
+	.name = "avx2x2",
+#else
+	.name = "avx2x1",
+#endif
+	.priority = 2,
+};
+
+#else
+#warning "your version of binutils lacks AVX2 support"
+#endif
+
+#endif
diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile
index c76151d94764..d919c98ce266 100644
--- a/lib/raid6/test/Makefile
+++ b/lib/raid6/test/Makefile
@@ -23,7 +23,7 @@ RANLIB = ranlib
 all:	raid6.a raid6test
 
 raid6.a: int1.o int2.o int4.o int8.o int16.o int32.o mmx.o sse1.o sse2.o \
-	 altivec1.o altivec2.o altivec4.o altivec8.o recov.o recov_ssse3.o algos.o \
+	 altivec1.o altivec2.o altivec4.o altivec8.o recov.o recov_ssse3.o recov_avx2.o algos.o \
 	 tables.o
 	rm -f $@
 	$(AR) cq $@ $^
diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h
index d55d63232c55..b7595484a815 100644
--- a/lib/raid6/x86.h
+++ b/lib/raid6/x86.h
@@ -45,19 +45,23 @@ static inline void kernel_fpu_end(void)
 #define X86_FEATURE_XMM3	(4*32+ 0) /* "pni" SSE-3 */
 #define X86_FEATURE_SSSE3	(4*32+ 9) /* Supplemental SSE-3 */
 #define X86_FEATURE_AVX	(4*32+28) /* Advanced Vector Extensions */
+#define X86_FEATURE_AVX2	(9*32+ 5) /* AVX2 instructions */
 #define X86_FEATURE_MMXEXT	(1*32+22) /* AMD MMX extensions */
 
 /* Should work well enough on modern CPUs for testing */
 static inline int boot_cpu_has(int flag)
 {
-	u32 eax = (flag & 0x20) ? 0x80000001 : 1;
-	u32 ecx, edx;
+	u32 eax, ebx, ecx, edx;
+
+	eax = (flag & 0x100) ? 7 :
+		(flag & 0x20) ? 0x80000001 : 1;
+	ecx = 0;
 
 	asm volatile("cpuid"
-		     : "+a" (eax), "=d" (edx), "=c" (ecx)
-		     : : "ebx");
+		     : "+a" (eax), "=b" (ebx), "=d" (edx), "+c" (ecx));
 
-	return ((flag & 0x80 ? ecx : edx) >> (flag & 31)) & 1;
+	return ((flag & 0x100 ? ebx :
+		(flag & 0x80) ? ecx : edx) >> (flag & 31)) & 1;
 }
 
 #endif /* ndef __KERNEL__ */
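Note: the boot_cpu_has() shim above encodes the CPUID leaf in the feature
flag itself. X86_FEATURE_AVX2 = (9*32+5) = 0x125 has bit 8 (0x100) set,
which selects leaf 7 (with ECX = 0) and the EBX result register; the low
five bits select bit 5. A minimal standalone sketch of the same probe,
assuming a GCC/Clang toolchain whose <cpuid.h> provides
__get_cpuid_count():

	#include <cpuid.h>

	/* AVX2 is reported in CPUID leaf 7, subleaf 0, EBX bit 5. */
	static int cpu_has_avx2(void)
	{
		unsigned int eax, ebx, ecx, edx;

		if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
			return 0;	/* leaf 7 not supported */
		return (ebx >> 5) & 1;
	}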