diff options
Diffstat (limited to 'lib/raid6/avx512.c')
-rw-r--r-- | lib/raid6/avx512.c | 569 |
1 files changed, 569 insertions, 0 deletions
diff --git a/lib/raid6/avx512.c b/lib/raid6/avx512.c new file mode 100644 index 000000000000..f524a7972006 --- /dev/null +++ b/lib/raid6/avx512.c | |||
@@ -0,0 +1,569 @@ | |||
1 | /* -*- linux-c -*- -------------------------------------------------------- | ||
2 | * | ||
3 | * Copyright (C) 2016 Intel Corporation | ||
4 | * | ||
5 | * Author: Gayatri Kammela <gayatri.kammela@intel.com> | ||
6 | * Author: Megha Dey <megha.dey@linux.intel.com> | ||
7 | * | ||
8 | * Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved | ||
9 | * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or modify | ||
12 | * it under the terms of the GNU General Public License as published by | ||
13 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, | ||
14 | * Boston MA 02111-1307, USA; either version 2 of the License, or | ||
15 | * (at your option) any later version; incorporated herein by reference. | ||
16 | * | ||
17 | * ----------------------------------------------------------------------- | ||
18 | */ | ||
19 | |||
20 | /* | ||
21 | * AVX512 implementation of RAID-6 syndrome functions | ||
22 | * | ||
23 | */ | ||
24 | |||
25 | #ifdef CONFIG_AS_AVX512 | ||
26 | |||
27 | #include <linux/raid/pq.h> | ||
28 | #include "x86.h" | ||
29 | |||
30 | static const struct raid6_avx512_constants { | ||
31 | u64 x1d[8]; | ||
32 | } raid6_avx512_constants __aligned(512) = { | ||
33 | { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, | ||
34 | 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, | ||
35 | 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, | ||
36 | 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,}, | ||
37 | }; | ||
38 | |||
39 | static int raid6_have_avx512(void) | ||
40 | { | ||
41 | return boot_cpu_has(X86_FEATURE_AVX2) && | ||
42 | boot_cpu_has(X86_FEATURE_AVX) && | ||
43 | boot_cpu_has(X86_FEATURE_AVX512F) && | ||
44 | boot_cpu_has(X86_FEATURE_AVX512BW) && | ||
45 | boot_cpu_has(X86_FEATURE_AVX512VL) && | ||
46 | boot_cpu_has(X86_FEATURE_AVX512DQ); | ||
47 | } | ||
48 | |||
49 | static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs) | ||
50 | { | ||
51 | u8 **dptr = (u8 **)ptrs; | ||
52 | u8 *p, *q; | ||
53 | int d, z, z0; | ||
54 | |||
55 | z0 = disks - 3; /* Highest data disk */ | ||
56 | p = dptr[z0+1]; /* XOR parity */ | ||
57 | q = dptr[z0+2]; /* RS syndrome */ | ||
58 | |||
59 | kernel_fpu_begin(); | ||
60 | |||
61 | asm volatile("vmovdqa64 %0,%%zmm0\n\t" | ||
62 | "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */ | ||
63 | : | ||
64 | : "m" (raid6_avx512_constants.x1d[0])); | ||
65 | |||
66 | for (d = 0; d < bytes; d += 64) { | ||
67 | asm volatile("prefetchnta %0\n\t" | ||
68 | "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */ | ||
69 | "prefetchnta %1\n\t" | ||
70 | "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */ | ||
71 | "vmovdqa64 %1,%%zmm6" | ||
72 | : | ||
73 | : "m" (dptr[z0][d]), "m" (dptr[z0-1][d])); | ||
74 | for (z = z0-2; z >= 0; z--) { | ||
75 | asm volatile("prefetchnta %0\n\t" | ||
76 | "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" | ||
77 | "vpmovm2b %%k1,%%zmm5\n\t" | ||
78 | "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" | ||
79 | "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" | ||
80 | "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" | ||
81 | "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t" | ||
82 | "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t" | ||
83 | "vmovdqa64 %0,%%zmm6" | ||
84 | : | ||
85 | : "m" (dptr[z][d])); | ||
86 | } | ||
87 | asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" | ||
88 | "vpmovm2b %%k1,%%zmm5\n\t" | ||
89 | "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" | ||
90 | "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" | ||
91 | "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" | ||
92 | "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t" | ||
93 | "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t" | ||
94 | "vmovntdq %%zmm2,%0\n\t" | ||
95 | "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" | ||
96 | "vmovntdq %%zmm4,%1\n\t" | ||
97 | "vpxorq %%zmm4,%%zmm4,%%zmm4" | ||
98 | : | ||
99 | : "m" (p[d]), "m" (q[d])); | ||
100 | } | ||
101 | |||
102 | asm volatile("sfence" : : : "memory"); | ||
103 | kernel_fpu_end(); | ||
104 | } | ||
105 | |||
106 | static void raid6_avx5121_xor_syndrome(int disks, int start, int stop, | ||
107 | size_t bytes, void **ptrs) | ||
108 | { | ||
109 | u8 **dptr = (u8 **)ptrs; | ||
110 | u8 *p, *q; | ||
111 | int d, z, z0; | ||
112 | |||
113 | z0 = stop; /* P/Q right side optimization */ | ||
114 | p = dptr[disks-2]; /* XOR parity */ | ||
115 | q = dptr[disks-1]; /* RS syndrome */ | ||
116 | |||
117 | kernel_fpu_begin(); | ||
118 | |||
119 | asm volatile("vmovdqa64 %0,%%zmm0" | ||
120 | : : "m" (raid6_avx512_constants.x1d[0])); | ||
121 | |||
122 | for (d = 0 ; d < bytes ; d += 64) { | ||
123 | asm volatile("vmovdqa64 %0,%%zmm4\n\t" | ||
124 | "vmovdqa64 %1,%%zmm2\n\t" | ||
125 | "vpxorq %%zmm4,%%zmm2,%%zmm2" | ||
126 | : | ||
127 | : "m" (dptr[z0][d]), "m" (p[d])); | ||
128 | /* P/Q data pages */ | ||
129 | for (z = z0-1 ; z >= start ; z--) { | ||
130 | asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" | ||
131 | "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" | ||
132 | "vpmovm2b %%k1,%%zmm5\n\t" | ||
133 | "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" | ||
134 | "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" | ||
135 | "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" | ||
136 | "vmovdqa64 %0,%%zmm5\n\t" | ||
137 | "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" | ||
138 | "vpxorq %%zmm5,%%zmm4,%%zmm4" | ||
139 | : | ||
140 | : "m" (dptr[z][d])); | ||
141 | } | ||
142 | /* P/Q left side optimization */ | ||
143 | for (z = start-1 ; z >= 0 ; z--) { | ||
144 | asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" | ||
145 | "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" | ||
146 | "vpmovm2b %%k1,%%zmm5\n\t" | ||
147 | "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" | ||
148 | "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" | ||
149 | "vpxorq %%zmm5,%%zmm4,%%zmm4" | ||
150 | : | ||
151 | : ); | ||
152 | } | ||
153 | asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t" | ||
154 | /* Don't use movntdq for r/w memory area < cache line */ | ||
155 | "vmovdqa64 %%zmm4,%0\n\t" | ||
156 | "vmovdqa64 %%zmm2,%1" | ||
157 | : | ||
158 | : "m" (q[d]), "m" (p[d])); | ||
159 | } | ||
160 | |||
161 | asm volatile("sfence" : : : "memory"); | ||
162 | kernel_fpu_end(); | ||
163 | } | ||
164 | |||
/*
 * Descriptor for the one-vector-per-iteration AVX512 routines.
 * struct raid6_calls is declared outside this file and is initialised
 * positionally here, so the order of the entries must match its layout.
 */
const struct raid6_calls raid6_avx512x1 = {
	raid6_avx5121_gen_syndrome,	/* generate P and Q from scratch */
	raid6_avx5121_xor_syndrome,	/* fold a disk range into P and Q */
	raid6_have_avx512,		/* CPU feature check */
	"avx512x1",			/* name string for this variant */
	1 /* Has cache hints */
};
172 | |||
173 | /* | ||
174 | * Unrolled-by-2 AVX512 implementation | ||
175 | */ | ||
176 | static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs) | ||
177 | { | ||
178 | u8 **dptr = (u8 **)ptrs; | ||
179 | u8 *p, *q; | ||
180 | int d, z, z0; | ||
181 | |||
182 | z0 = disks - 3; /* Highest data disk */ | ||
183 | p = dptr[z0+1]; /* XOR parity */ | ||
184 | q = dptr[z0+2]; /* RS syndrome */ | ||
185 | |||
186 | kernel_fpu_begin(); | ||
187 | |||
188 | asm volatile("vmovdqa64 %0,%%zmm0\n\t" | ||
189 | "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */ | ||
190 | : | ||
191 | : "m" (raid6_avx512_constants.x1d[0])); | ||
192 | |||
193 | /* We uniformly assume a single prefetch covers at least 64 bytes */ | ||
194 | for (d = 0; d < bytes; d += 128) { | ||
195 | asm volatile("prefetchnta %0\n\t" | ||
196 | "prefetchnta %1\n\t" | ||
197 | "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */ | ||
198 | "vmovdqa64 %1,%%zmm3\n\t" /* P[1] */ | ||
199 | "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */ | ||
200 | "vmovdqa64 %%zmm3,%%zmm6" /* Q[1] */ | ||
201 | : | ||
202 | : "m" (dptr[z0][d]), "m" (dptr[z0][d+64])); | ||
203 | for (z = z0-1; z >= 0; z--) { | ||
204 | asm volatile("prefetchnta %0\n\t" | ||
205 | "prefetchnta %1\n\t" | ||
206 | "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" | ||
207 | "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t" | ||
208 | "vpmovm2b %%k1,%%zmm5\n\t" | ||
209 | "vpmovm2b %%k2,%%zmm7\n\t" | ||
210 | "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" | ||
211 | "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" | ||
212 | "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" | ||
213 | "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" | ||
214 | "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" | ||
215 | "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" | ||
216 | "vmovdqa64 %0,%%zmm5\n\t" | ||
217 | "vmovdqa64 %1,%%zmm7\n\t" | ||
218 | "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" | ||
219 | "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" | ||
220 | "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" | ||
221 | "vpxorq %%zmm7,%%zmm6,%%zmm6" | ||
222 | : | ||
223 | : "m" (dptr[z][d]), "m" (dptr[z][d+64])); | ||
224 | } | ||
225 | asm volatile("vmovntdq %%zmm2,%0\n\t" | ||
226 | "vmovntdq %%zmm3,%1\n\t" | ||
227 | "vmovntdq %%zmm4,%2\n\t" | ||
228 | "vmovntdq %%zmm6,%3" | ||
229 | : | ||
230 | : "m" (p[d]), "m" (p[d+64]), "m" (q[d]), | ||
231 | "m" (q[d+64])); | ||
232 | } | ||
233 | |||
234 | asm volatile("sfence" : : : "memory"); | ||
235 | kernel_fpu_end(); | ||
236 | } | ||
237 | |||
238 | static void raid6_avx5122_xor_syndrome(int disks, int start, int stop, | ||
239 | size_t bytes, void **ptrs) | ||
240 | { | ||
241 | u8 **dptr = (u8 **)ptrs; | ||
242 | u8 *p, *q; | ||
243 | int d, z, z0; | ||
244 | |||
245 | z0 = stop; /* P/Q right side optimization */ | ||
246 | p = dptr[disks-2]; /* XOR parity */ | ||
247 | q = dptr[disks-1]; /* RS syndrome */ | ||
248 | |||
249 | kernel_fpu_begin(); | ||
250 | |||
251 | asm volatile("vmovdqa64 %0,%%zmm0" | ||
252 | : : "m" (raid6_avx512_constants.x1d[0])); | ||
253 | |||
254 | for (d = 0 ; d < bytes ; d += 128) { | ||
255 | asm volatile("vmovdqa64 %0,%%zmm4\n\t" | ||
256 | "vmovdqa64 %1,%%zmm6\n\t" | ||
257 | "vmovdqa64 %2,%%zmm2\n\t" | ||
258 | "vmovdqa64 %3,%%zmm3\n\t" | ||
259 | "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t" | ||
260 | "vpxorq %%zmm6,%%zmm3,%%zmm3" | ||
261 | : | ||
262 | : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]), | ||
263 | "m" (p[d]), "m" (p[d+64])); | ||
264 | /* P/Q data pages */ | ||
265 | for (z = z0-1 ; z >= start ; z--) { | ||
266 | asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" | ||
267 | "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" | ||
268 | "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" | ||
269 | "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" | ||
270 | "vpmovm2b %%k1,%%zmm5\n\t" | ||
271 | "vpmovm2b %%k2,%%zmm7\n\t" | ||
272 | "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" | ||
273 | "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" | ||
274 | "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" | ||
275 | "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" | ||
276 | "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" | ||
277 | "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" | ||
278 | "vmovdqa64 %0,%%zmm5\n\t" | ||
279 | "vmovdqa64 %1,%%zmm7\n\t" | ||
280 | "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" | ||
281 | "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" | ||
282 | "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" | ||
283 | "vpxorq %%zmm7,%%zmm6,%%zmm6" | ||
284 | : | ||
285 | : "m" (dptr[z][d]), "m" (dptr[z][d+64])); | ||
286 | } | ||
287 | /* P/Q left side optimization */ | ||
288 | for (z = start-1 ; z >= 0 ; z--) { | ||
289 | asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" | ||
290 | "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" | ||
291 | "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" | ||
292 | "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" | ||
293 | "vpmovm2b %%k1,%%zmm5\n\t" | ||
294 | "vpmovm2b %%k2,%%zmm7\n\t" | ||
295 | "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" | ||
296 | "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" | ||
297 | "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" | ||
298 | "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" | ||
299 | "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" | ||
300 | "vpxorq %%zmm7,%%zmm6,%%zmm6" | ||
301 | : | ||
302 | : ); | ||
303 | } | ||
304 | asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t" | ||
305 | "vpxorq %1,%%zmm6,%%zmm6\n\t" | ||
306 | /* Don't use movntdq for r/w | ||
307 | * memory area < cache line | ||
308 | */ | ||
309 | "vmovdqa64 %%zmm4,%0\n\t" | ||
310 | "vmovdqa64 %%zmm6,%1\n\t" | ||
311 | "vmovdqa64 %%zmm2,%2\n\t" | ||
312 | "vmovdqa64 %%zmm3,%3" | ||
313 | : | ||
314 | : "m" (q[d]), "m" (q[d+64]), "m" (p[d]), | ||
315 | "m" (p[d+64])); | ||
316 | } | ||
317 | |||
318 | asm volatile("sfence" : : : "memory"); | ||
319 | kernel_fpu_end(); | ||
320 | } | ||
321 | |||
/*
 * Descriptor for the two-vectors-per-iteration AVX512 routines.
 * struct raid6_calls is declared outside this file and is initialised
 * positionally here, so the order of the entries must match its layout.
 */
const struct raid6_calls raid6_avx512x2 = {
	raid6_avx5122_gen_syndrome,	/* generate P and Q from scratch */
	raid6_avx5122_xor_syndrome,	/* fold a disk range into P and Q */
	raid6_have_avx512,		/* CPU feature check */
	"avx512x2",			/* name string for this variant */
	1 /* Has cache hints */
};
329 | |||
330 | #ifdef CONFIG_X86_64 | ||
331 | |||
/*
 * Unrolled-by-4 AVX512 implementation
 */
335 | static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs) | ||
336 | { | ||
337 | u8 **dptr = (u8 **)ptrs; | ||
338 | u8 *p, *q; | ||
339 | int d, z, z0; | ||
340 | |||
341 | z0 = disks - 3; /* Highest data disk */ | ||
342 | p = dptr[z0+1]; /* XOR parity */ | ||
343 | q = dptr[z0+2]; /* RS syndrome */ | ||
344 | |||
345 | kernel_fpu_begin(); | ||
346 | |||
347 | asm volatile("vmovdqa64 %0,%%zmm0\n\t" | ||
348 | "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t" /* Zero temp */ | ||
349 | "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" /* P[0] */ | ||
350 | "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" /* P[1] */ | ||
351 | "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" /* Q[0] */ | ||
352 | "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" /* Q[1] */ | ||
353 | "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" /* P[2] */ | ||
354 | "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" /* P[3] */ | ||
355 | "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" /* Q[2] */ | ||
356 | "vpxorq %%zmm14,%%zmm14,%%zmm14" /* Q[3] */ | ||
357 | : | ||
358 | : "m" (raid6_avx512_constants.x1d[0])); | ||
359 | |||
360 | for (d = 0; d < bytes; d += 256) { | ||
361 | for (z = z0; z >= 0; z--) { | ||
362 | asm volatile("prefetchnta %0\n\t" | ||
363 | "prefetchnta %1\n\t" | ||
364 | "prefetchnta %2\n\t" | ||
365 | "prefetchnta %3\n\t" | ||
366 | "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" | ||
367 | "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t" | ||
368 | "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t" | ||
369 | "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t" | ||
370 | "vpmovm2b %%k1,%%zmm5\n\t" | ||
371 | "vpmovm2b %%k2,%%zmm7\n\t" | ||
372 | "vpmovm2b %%k3,%%zmm13\n\t" | ||
373 | "vpmovm2b %%k4,%%zmm15\n\t" | ||
374 | "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" | ||
375 | "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" | ||
376 | "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" | ||
377 | "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t" | ||
378 | "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" | ||
379 | "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" | ||
380 | "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" | ||
381 | "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" | ||
382 | "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" | ||
383 | "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" | ||
384 | "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" | ||
385 | "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t" | ||
386 | "vmovdqa64 %0,%%zmm5\n\t" | ||
387 | "vmovdqa64 %1,%%zmm7\n\t" | ||
388 | "vmovdqa64 %2,%%zmm13\n\t" | ||
389 | "vmovdqa64 %3,%%zmm15\n\t" | ||
390 | "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" | ||
391 | "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" | ||
392 | "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t" | ||
393 | "vpxorq %%zmm15,%%zmm11,%%zmm11\n" | ||
394 | "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" | ||
395 | "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" | ||
396 | "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" | ||
397 | "vpxorq %%zmm15,%%zmm14,%%zmm14" | ||
398 | : | ||
399 | : "m" (dptr[z][d]), "m" (dptr[z][d+64]), | ||
400 | "m" (dptr[z][d+128]), "m" (dptr[z][d+192])); | ||
401 | } | ||
402 | asm volatile("vmovntdq %%zmm2,%0\n\t" | ||
403 | "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" | ||
404 | "vmovntdq %%zmm3,%1\n\t" | ||
405 | "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" | ||
406 | "vmovntdq %%zmm10,%2\n\t" | ||
407 | "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" | ||
408 | "vmovntdq %%zmm11,%3\n\t" | ||
409 | "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" | ||
410 | "vmovntdq %%zmm4,%4\n\t" | ||
411 | "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" | ||
412 | "vmovntdq %%zmm6,%5\n\t" | ||
413 | "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" | ||
414 | "vmovntdq %%zmm12,%6\n\t" | ||
415 | "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" | ||
416 | "vmovntdq %%zmm14,%7\n\t" | ||
417 | "vpxorq %%zmm14,%%zmm14,%%zmm14" | ||
418 | : | ||
419 | : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), | ||
420 | "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]), | ||
421 | "m" (q[d+128]), "m" (q[d+192])); | ||
422 | } | ||
423 | |||
424 | asm volatile("sfence" : : : "memory"); | ||
425 | kernel_fpu_end(); | ||
426 | } | ||
427 | |||
428 | static void raid6_avx5124_xor_syndrome(int disks, int start, int stop, | ||
429 | size_t bytes, void **ptrs) | ||
430 | { | ||
431 | u8 **dptr = (u8 **)ptrs; | ||
432 | u8 *p, *q; | ||
433 | int d, z, z0; | ||
434 | |||
435 | z0 = stop; /* P/Q right side optimization */ | ||
436 | p = dptr[disks-2]; /* XOR parity */ | ||
437 | q = dptr[disks-1]; /* RS syndrome */ | ||
438 | |||
439 | kernel_fpu_begin(); | ||
440 | |||
441 | asm volatile("vmovdqa64 %0,%%zmm0" | ||
442 | :: "m" (raid6_avx512_constants.x1d[0])); | ||
443 | |||
444 | for (d = 0 ; d < bytes ; d += 256) { | ||
445 | asm volatile("vmovdqa64 %0,%%zmm4\n\t" | ||
446 | "vmovdqa64 %1,%%zmm6\n\t" | ||
447 | "vmovdqa64 %2,%%zmm12\n\t" | ||
448 | "vmovdqa64 %3,%%zmm14\n\t" | ||
449 | "vmovdqa64 %4,%%zmm2\n\t" | ||
450 | "vmovdqa64 %5,%%zmm3\n\t" | ||
451 | "vmovdqa64 %6,%%zmm10\n\t" | ||
452 | "vmovdqa64 %7,%%zmm11\n\t" | ||
453 | "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t" | ||
454 | "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t" | ||
455 | "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t" | ||
456 | "vpxorq %%zmm14,%%zmm11,%%zmm11" | ||
457 | : | ||
458 | : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]), | ||
459 | "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]), | ||
460 | "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), | ||
461 | "m" (p[d+192])); | ||
462 | /* P/Q data pages */ | ||
463 | for (z = z0-1 ; z >= start ; z--) { | ||
464 | asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" | ||
465 | "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" | ||
466 | "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t" | ||
467 | "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t" | ||
468 | "prefetchnta %0\n\t" | ||
469 | "prefetchnta %2\n\t" | ||
470 | "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" | ||
471 | "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" | ||
472 | "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t" | ||
473 | "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t" | ||
474 | "vpmovm2b %%k1,%%zmm5\n\t" | ||
475 | "vpmovm2b %%k2,%%zmm7\n\t" | ||
476 | "vpmovm2b %%k3,%%zmm13\n\t" | ||
477 | "vpmovm2b %%k4,%%zmm15\n\t" | ||
478 | "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" | ||
479 | "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" | ||
480 | "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" | ||
481 | "vpaddb %%Zmm14,%%zmm14,%%zmm14\n\t" | ||
482 | "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" | ||
483 | "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" | ||
484 | "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" | ||
485 | "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" | ||
486 | "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" | ||
487 | "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" | ||
488 | "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" | ||
489 | "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t" | ||
490 | "vmovdqa64 %0,%%zmm5\n\t" | ||
491 | "vmovdqa64 %1,%%zmm7\n\t" | ||
492 | "vmovdqa64 %2,%%zmm13\n\t" | ||
493 | "vmovdqa64 %3,%%zmm15\n\t" | ||
494 | "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" | ||
495 | "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" | ||
496 | "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t" | ||
497 | "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t" | ||
498 | "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" | ||
499 | "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" | ||
500 | "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" | ||
501 | "vpxorq %%zmm15,%%zmm14,%%zmm14" | ||
502 | : | ||
503 | : "m" (dptr[z][d]), "m" (dptr[z][d+64]), | ||
504 | "m" (dptr[z][d+128]), | ||
505 | "m" (dptr[z][d+192])); | ||
506 | } | ||
507 | asm volatile("prefetchnta %0\n\t" | ||
508 | "prefetchnta %1\n\t" | ||
509 | : | ||
510 | : "m" (q[d]), "m" (q[d+128])); | ||
511 | /* P/Q left side optimization */ | ||
512 | for (z = start-1 ; z >= 0 ; z--) { | ||
513 | asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" | ||
514 | "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" | ||
515 | "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t" | ||
516 | "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t" | ||
517 | "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" | ||
518 | "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" | ||
519 | "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t" | ||
520 | "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t" | ||
521 | "vpmovm2b %%k1,%%zmm5\n\t" | ||
522 | "vpmovm2b %%k2,%%zmm7\n\t" | ||
523 | "vpmovm2b %%k3,%%zmm13\n\t" | ||
524 | "vpmovm2b %%k4,%%zmm15\n\t" | ||
525 | "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" | ||
526 | "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" | ||
527 | "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" | ||
528 | "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t" | ||
529 | "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" | ||
530 | "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" | ||
531 | "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" | ||
532 | "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" | ||
533 | "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" | ||
534 | "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" | ||
535 | "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" | ||
536 | "vpxorq %%zmm15,%%zmm14,%%zmm14" | ||
537 | : | ||
538 | : ); | ||
539 | } | ||
540 | asm volatile("vmovntdq %%zmm2,%0\n\t" | ||
541 | "vmovntdq %%zmm3,%1\n\t" | ||
542 | "vmovntdq %%zmm10,%2\n\t" | ||
543 | "vmovntdq %%zmm11,%3\n\t" | ||
544 | "vpxorq %4,%%zmm4,%%zmm4\n\t" | ||
545 | "vpxorq %5,%%zmm6,%%zmm6\n\t" | ||
546 | "vpxorq %6,%%zmm12,%%zmm12\n\t" | ||
547 | "vpxorq %7,%%zmm14,%%zmm14\n\t" | ||
548 | "vmovntdq %%zmm4,%4\n\t" | ||
549 | "vmovntdq %%zmm6,%5\n\t" | ||
550 | "vmovntdq %%zmm12,%6\n\t" | ||
551 | "vmovntdq %%zmm14,%7" | ||
552 | : | ||
553 | : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), | ||
554 | "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]), | ||
555 | "m" (q[d+128]), "m" (q[d+192])); | ||
556 | } | ||
557 | asm volatile("sfence" : : : "memory"); | ||
558 | kernel_fpu_end(); | ||
559 | } | ||
/*
 * Descriptor for the four-vectors-per-iteration AVX512 routines.
 * struct raid6_calls is declared outside this file and is initialised
 * positionally here, so the order of the entries must match its layout.
 */
const struct raid6_calls raid6_avx512x4 = {
	raid6_avx5124_gen_syndrome,	/* generate P and Q from scratch */
	raid6_avx5124_xor_syndrome,	/* fold a disk range into P and Q */
	raid6_have_avx512,		/* CPU feature check */
	"avx512x4",			/* name string for this variant */
	1 /* Has cache hints */
};
567 | #endif | ||
568 | |||
569 | #endif /* CONFIG_AS_AVX512 */ | ||