diff options
Diffstat (limited to 'arch/x86/include/asm/xor.h')
-rw-r--r-- | arch/x86/include/asm/xor.h | 491 |
1 files changed, 490 insertions, 1 deletions
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h index f8fde90bc45e..d8829751b3f8 100644 --- a/arch/x86/include/asm/xor.h +++ b/arch/x86/include/asm/xor.h | |||
@@ -1,10 +1,499 @@ | |||
1 | #ifdef CONFIG_KMEMCHECK | 1 | #ifdef CONFIG_KMEMCHECK |
2 | /* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */ | 2 | /* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */ |
3 | # include <asm-generic/xor.h> | 3 | # include <asm-generic/xor.h> |
4 | #elif !defined(_ASM_X86_XOR_H) | ||
5 | #define _ASM_X86_XOR_H | ||
6 | |||
7 | /* | ||
8 | * Optimized RAID-5 checksumming functions for SSE. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or modify | ||
11 | * it under the terms of the GNU General Public License as published by | ||
12 | * the Free Software Foundation; either version 2, or (at your option) | ||
13 | * any later version. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * (for example /usr/src/linux/COPYING); if not, write to the Free | ||
17 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
18 | */ | ||
19 | |||
20 | /* | ||
21 | * Cache avoiding checksumming functions utilizing KNI instructions | ||
22 | * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) | ||
23 | */ | ||
24 | |||
25 | /* | ||
26 | * Based on | ||
27 | * High-speed RAID5 checksumming functions utilizing SSE instructions. | ||
28 | * Copyright (C) 1998 Ingo Molnar. | ||
29 | */ | ||
30 | |||
31 | /* | ||
32 | * x86-64 changes / gcc fixes from Andi Kleen. | ||
33 | * Copyright 2002 Andi Kleen, SuSE Labs. | ||
34 | * | ||
35 | * This hasn't been optimized for the hammer yet, but there are likely | ||
36 | * no advantages to be gotten from x86-64 here anyways. | ||
37 | */ | ||
38 | |||
39 | #include <asm/i387.h> | ||
40 | |||
/*
 * Operand constraint used for the constant pointer stride in the asm
 * loops below.
 */
#ifndef CONFIG_X86_32
# define XOR_CONSTANT_CONSTRAINT "re"
#else
/* reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#endif

/*
 * String builders for the inline assembly.  Each macro expands to a string
 * literal (or to nothing, for NOP) so that a sequence of them concatenates
 * into one asm template.
 *
 * OFFS(n)    - byte displacement of the n-th 16-byte slot.
 * PF_OFFS(n) - the same displacement, 256 bytes further ahead (used by the
 *              prefetches).
 */
#define OFFS(n)		"16*("#n")"
#define PF_OFFS(n)	"256+16*("#n")"

/* Load slot n of p1 into %xmm<r> / store %xmm<r> back to slot n of p1. */
#define LD(n, r)	" movaps "OFFS(n)"(%[p1]), %%xmm"#r" ;\n"
#define ST(n, r)	" movaps %%xmm"#r", "OFFS(n)"(%[p1]) ;\n"

/* Non-temporal prefetch of slot n, 256 bytes ahead, in p1 .. p5. */
#define PF0(n)		" prefetchnta "PF_OFFS(n)"(%[p1]) ;\n"
#define PF1(n)		" prefetchnta "PF_OFFS(n)"(%[p2]) ;\n"
#define PF2(n)		" prefetchnta "PF_OFFS(n)"(%[p3]) ;\n"
#define PF3(n)		" prefetchnta "PF_OFFS(n)"(%[p4]) ;\n"
#define PF4(n)		" prefetchnta "PF_OFFS(n)"(%[p5]) ;\n"

/* XOR slot n of p2 .. p5 into %xmm<r>. */
#define XO1(n, r)	" xorps "OFFS(n)"(%[p2]), %%xmm"#r" ;\n"
#define XO2(n, r)	" xorps "OFFS(n)"(%[p3]), %%xmm"#r" ;\n"
#define XO3(n, r)	" xorps "OFFS(n)"(%[p4]), %%xmm"#r" ;\n"
#define XO4(n, r)	" xorps "OFFS(n)"(%[p5]), %%xmm"#r" ;\n"

/* Placeholder for BLK64() when no prefetch is wanted; expands to nothing. */
#define NOP(n)

/*
 * One optional prefetch followed by the same operation applied to four
 * consecutive 16-byte slots (64 bytes) using %xmm0 .. %xmm3.
 */
#define BLK64(pf, op, i)	\
		pf(i)		\
		op(i, 0)	\
		op(i + 1, 1)	\
		op(i + 2, 2)	\
		op(i + 3, 3)
69 | |||
70 | static void | ||
71 | xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) | ||
72 | { | ||
73 | unsigned long lines = bytes >> 8; | ||
74 | |||
75 | kernel_fpu_begin(); | ||
76 | |||
77 | asm volatile( | ||
78 | #undef BLOCK | ||
79 | #define BLOCK(i) \ | ||
80 | LD(i, 0) \ | ||
81 | LD(i + 1, 1) \ | ||
82 | PF1(i) \ | ||
83 | PF1(i + 2) \ | ||
84 | LD(i + 2, 2) \ | ||
85 | LD(i + 3, 3) \ | ||
86 | PF0(i + 4) \ | ||
87 | PF0(i + 6) \ | ||
88 | XO1(i, 0) \ | ||
89 | XO1(i + 1, 1) \ | ||
90 | XO1(i + 2, 2) \ | ||
91 | XO1(i + 3, 3) \ | ||
92 | ST(i, 0) \ | ||
93 | ST(i + 1, 1) \ | ||
94 | ST(i + 2, 2) \ | ||
95 | ST(i + 3, 3) \ | ||
96 | |||
97 | |||
98 | PF0(0) | ||
99 | PF0(2) | ||
100 | |||
101 | " .align 32 ;\n" | ||
102 | " 1: ;\n" | ||
103 | |||
104 | BLOCK(0) | ||
105 | BLOCK(4) | ||
106 | BLOCK(8) | ||
107 | BLOCK(12) | ||
108 | |||
109 | " add %[inc], %[p1] ;\n" | ||
110 | " add %[inc], %[p2] ;\n" | ||
111 | " dec %[cnt] ;\n" | ||
112 | " jnz 1b ;\n" | ||
113 | : [cnt] "+r" (lines), | ||
114 | [p1] "+r" (p1), [p2] "+r" (p2) | ||
115 | : [inc] XOR_CONSTANT_CONSTRAINT (256UL) | ||
116 | : "memory"); | ||
117 | |||
118 | kernel_fpu_end(); | ||
119 | } | ||
120 | |||
121 | static void | ||
122 | xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2) | ||
123 | { | ||
124 | unsigned long lines = bytes >> 8; | ||
125 | |||
126 | kernel_fpu_begin(); | ||
127 | |||
128 | asm volatile( | ||
129 | #undef BLOCK | ||
130 | #define BLOCK(i) \ | ||
131 | BLK64(PF0, LD, i) \ | ||
132 | BLK64(PF1, XO1, i) \ | ||
133 | BLK64(NOP, ST, i) \ | ||
134 | |||
135 | " .align 32 ;\n" | ||
136 | " 1: ;\n" | ||
137 | |||
138 | BLOCK(0) | ||
139 | BLOCK(4) | ||
140 | BLOCK(8) | ||
141 | BLOCK(12) | ||
142 | |||
143 | " add %[inc], %[p1] ;\n" | ||
144 | " add %[inc], %[p2] ;\n" | ||
145 | " dec %[cnt] ;\n" | ||
146 | " jnz 1b ;\n" | ||
147 | : [cnt] "+r" (lines), | ||
148 | [p1] "+r" (p1), [p2] "+r" (p2) | ||
149 | : [inc] XOR_CONSTANT_CONSTRAINT (256UL) | ||
150 | : "memory"); | ||
151 | |||
152 | kernel_fpu_end(); | ||
153 | } | ||
154 | |||
155 | static void | ||
156 | xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
157 | unsigned long *p3) | ||
158 | { | ||
159 | unsigned long lines = bytes >> 8; | ||
160 | |||
161 | kernel_fpu_begin(); | ||
162 | |||
163 | asm volatile( | ||
164 | #undef BLOCK | ||
165 | #define BLOCK(i) \ | ||
166 | PF1(i) \ | ||
167 | PF1(i + 2) \ | ||
168 | LD(i, 0) \ | ||
169 | LD(i + 1, 1) \ | ||
170 | LD(i + 2, 2) \ | ||
171 | LD(i + 3, 3) \ | ||
172 | PF2(i) \ | ||
173 | PF2(i + 2) \ | ||
174 | PF0(i + 4) \ | ||
175 | PF0(i + 6) \ | ||
176 | XO1(i, 0) \ | ||
177 | XO1(i + 1, 1) \ | ||
178 | XO1(i + 2, 2) \ | ||
179 | XO1(i + 3, 3) \ | ||
180 | XO2(i, 0) \ | ||
181 | XO2(i + 1, 1) \ | ||
182 | XO2(i + 2, 2) \ | ||
183 | XO2(i + 3, 3) \ | ||
184 | ST(i, 0) \ | ||
185 | ST(i + 1, 1) \ | ||
186 | ST(i + 2, 2) \ | ||
187 | ST(i + 3, 3) \ | ||
188 | |||
189 | |||
190 | PF0(0) | ||
191 | PF0(2) | ||
192 | |||
193 | " .align 32 ;\n" | ||
194 | " 1: ;\n" | ||
195 | |||
196 | BLOCK(0) | ||
197 | BLOCK(4) | ||
198 | BLOCK(8) | ||
199 | BLOCK(12) | ||
200 | |||
201 | " add %[inc], %[p1] ;\n" | ||
202 | " add %[inc], %[p2] ;\n" | ||
203 | " add %[inc], %[p3] ;\n" | ||
204 | " dec %[cnt] ;\n" | ||
205 | " jnz 1b ;\n" | ||
206 | : [cnt] "+r" (lines), | ||
207 | [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) | ||
208 | : [inc] XOR_CONSTANT_CONSTRAINT (256UL) | ||
209 | : "memory"); | ||
210 | |||
211 | kernel_fpu_end(); | ||
212 | } | ||
213 | |||
214 | static void | ||
215 | xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
216 | unsigned long *p3) | ||
217 | { | ||
218 | unsigned long lines = bytes >> 8; | ||
219 | |||
220 | kernel_fpu_begin(); | ||
221 | |||
222 | asm volatile( | ||
223 | #undef BLOCK | ||
224 | #define BLOCK(i) \ | ||
225 | BLK64(PF0, LD, i) \ | ||
226 | BLK64(PF1, XO1, i) \ | ||
227 | BLK64(PF2, XO2, i) \ | ||
228 | BLK64(NOP, ST, i) \ | ||
229 | |||
230 | " .align 32 ;\n" | ||
231 | " 1: ;\n" | ||
232 | |||
233 | BLOCK(0) | ||
234 | BLOCK(4) | ||
235 | BLOCK(8) | ||
236 | BLOCK(12) | ||
237 | |||
238 | " add %[inc], %[p1] ;\n" | ||
239 | " add %[inc], %[p2] ;\n" | ||
240 | " add %[inc], %[p3] ;\n" | ||
241 | " dec %[cnt] ;\n" | ||
242 | " jnz 1b ;\n" | ||
243 | : [cnt] "+r" (lines), | ||
244 | [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) | ||
245 | : [inc] XOR_CONSTANT_CONSTRAINT (256UL) | ||
246 | : "memory"); | ||
247 | |||
248 | kernel_fpu_end(); | ||
249 | } | ||
250 | |||
251 | static void | ||
252 | xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
253 | unsigned long *p3, unsigned long *p4) | ||
254 | { | ||
255 | unsigned long lines = bytes >> 8; | ||
256 | |||
257 | kernel_fpu_begin(); | ||
258 | |||
259 | asm volatile( | ||
260 | #undef BLOCK | ||
261 | #define BLOCK(i) \ | ||
262 | PF1(i) \ | ||
263 | PF1(i + 2) \ | ||
264 | LD(i, 0) \ | ||
265 | LD(i + 1, 1) \ | ||
266 | LD(i + 2, 2) \ | ||
267 | LD(i + 3, 3) \ | ||
268 | PF2(i) \ | ||
269 | PF2(i + 2) \ | ||
270 | XO1(i, 0) \ | ||
271 | XO1(i + 1, 1) \ | ||
272 | XO1(i + 2, 2) \ | ||
273 | XO1(i + 3, 3) \ | ||
274 | PF3(i) \ | ||
275 | PF3(i + 2) \ | ||
276 | PF0(i + 4) \ | ||
277 | PF0(i + 6) \ | ||
278 | XO2(i, 0) \ | ||
279 | XO2(i + 1, 1) \ | ||
280 | XO2(i + 2, 2) \ | ||
281 | XO2(i + 3, 3) \ | ||
282 | XO3(i, 0) \ | ||
283 | XO3(i + 1, 1) \ | ||
284 | XO3(i + 2, 2) \ | ||
285 | XO3(i + 3, 3) \ | ||
286 | ST(i, 0) \ | ||
287 | ST(i + 1, 1) \ | ||
288 | ST(i + 2, 2) \ | ||
289 | ST(i + 3, 3) \ | ||
290 | |||
291 | |||
292 | PF0(0) | ||
293 | PF0(2) | ||
294 | |||
295 | " .align 32 ;\n" | ||
296 | " 1: ;\n" | ||
297 | |||
298 | BLOCK(0) | ||
299 | BLOCK(4) | ||
300 | BLOCK(8) | ||
301 | BLOCK(12) | ||
302 | |||
303 | " add %[inc], %[p1] ;\n" | ||
304 | " add %[inc], %[p2] ;\n" | ||
305 | " add %[inc], %[p3] ;\n" | ||
306 | " add %[inc], %[p4] ;\n" | ||
307 | " dec %[cnt] ;\n" | ||
308 | " jnz 1b ;\n" | ||
309 | : [cnt] "+r" (lines), [p1] "+r" (p1), | ||
310 | [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) | ||
311 | : [inc] XOR_CONSTANT_CONSTRAINT (256UL) | ||
312 | : "memory"); | ||
313 | |||
314 | kernel_fpu_end(); | ||
315 | } | ||
316 | |||
317 | static void | ||
318 | xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
319 | unsigned long *p3, unsigned long *p4) | ||
320 | { | ||
321 | unsigned long lines = bytes >> 8; | ||
322 | |||
323 | kernel_fpu_begin(); | ||
324 | |||
325 | asm volatile( | ||
326 | #undef BLOCK | ||
327 | #define BLOCK(i) \ | ||
328 | BLK64(PF0, LD, i) \ | ||
329 | BLK64(PF1, XO1, i) \ | ||
330 | BLK64(PF2, XO2, i) \ | ||
331 | BLK64(PF3, XO3, i) \ | ||
332 | BLK64(NOP, ST, i) \ | ||
333 | |||
334 | " .align 32 ;\n" | ||
335 | " 1: ;\n" | ||
336 | |||
337 | BLOCK(0) | ||
338 | BLOCK(4) | ||
339 | BLOCK(8) | ||
340 | BLOCK(12) | ||
341 | |||
342 | " add %[inc], %[p1] ;\n" | ||
343 | " add %[inc], %[p2] ;\n" | ||
344 | " add %[inc], %[p3] ;\n" | ||
345 | " add %[inc], %[p4] ;\n" | ||
346 | " dec %[cnt] ;\n" | ||
347 | " jnz 1b ;\n" | ||
348 | : [cnt] "+r" (lines), [p1] "+r" (p1), | ||
349 | [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) | ||
350 | : [inc] XOR_CONSTANT_CONSTRAINT (256UL) | ||
351 | : "memory"); | ||
352 | |||
353 | kernel_fpu_end(); | ||
354 | } | ||
355 | |||
356 | static void | ||
357 | xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
358 | unsigned long *p3, unsigned long *p4, unsigned long *p5) | ||
359 | { | ||
360 | unsigned long lines = bytes >> 8; | ||
361 | |||
362 | kernel_fpu_begin(); | ||
363 | |||
364 | asm volatile( | ||
365 | #undef BLOCK | ||
366 | #define BLOCK(i) \ | ||
367 | PF1(i) \ | ||
368 | PF1(i + 2) \ | ||
369 | LD(i, 0) \ | ||
370 | LD(i + 1, 1) \ | ||
371 | LD(i + 2, 2) \ | ||
372 | LD(i + 3, 3) \ | ||
373 | PF2(i) \ | ||
374 | PF2(i + 2) \ | ||
375 | XO1(i, 0) \ | ||
376 | XO1(i + 1, 1) \ | ||
377 | XO1(i + 2, 2) \ | ||
378 | XO1(i + 3, 3) \ | ||
379 | PF3(i) \ | ||
380 | PF3(i + 2) \ | ||
381 | XO2(i, 0) \ | ||
382 | XO2(i + 1, 1) \ | ||
383 | XO2(i + 2, 2) \ | ||
384 | XO2(i + 3, 3) \ | ||
385 | PF4(i) \ | ||
386 | PF4(i + 2) \ | ||
387 | PF0(i + 4) \ | ||
388 | PF0(i + 6) \ | ||
389 | XO3(i, 0) \ | ||
390 | XO3(i + 1, 1) \ | ||
391 | XO3(i + 2, 2) \ | ||
392 | XO3(i + 3, 3) \ | ||
393 | XO4(i, 0) \ | ||
394 | XO4(i + 1, 1) \ | ||
395 | XO4(i + 2, 2) \ | ||
396 | XO4(i + 3, 3) \ | ||
397 | ST(i, 0) \ | ||
398 | ST(i + 1, 1) \ | ||
399 | ST(i + 2, 2) \ | ||
400 | ST(i + 3, 3) \ | ||
401 | |||
402 | |||
403 | PF0(0) | ||
404 | PF0(2) | ||
405 | |||
406 | " .align 32 ;\n" | ||
407 | " 1: ;\n" | ||
408 | |||
409 | BLOCK(0) | ||
410 | BLOCK(4) | ||
411 | BLOCK(8) | ||
412 | BLOCK(12) | ||
413 | |||
414 | " add %[inc], %[p1] ;\n" | ||
415 | " add %[inc], %[p2] ;\n" | ||
416 | " add %[inc], %[p3] ;\n" | ||
417 | " add %[inc], %[p4] ;\n" | ||
418 | " add %[inc], %[p5] ;\n" | ||
419 | " dec %[cnt] ;\n" | ||
420 | " jnz 1b ;\n" | ||
421 | : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2), | ||
422 | [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5) | ||
423 | : [inc] XOR_CONSTANT_CONSTRAINT (256UL) | ||
424 | : "memory"); | ||
425 | |||
426 | kernel_fpu_end(); | ||
427 | } | ||
428 | |||
429 | static void | ||
430 | xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
431 | unsigned long *p3, unsigned long *p4, unsigned long *p5) | ||
432 | { | ||
433 | unsigned long lines = bytes >> 8; | ||
434 | |||
435 | kernel_fpu_begin(); | ||
436 | |||
437 | asm volatile( | ||
438 | #undef BLOCK | ||
439 | #define BLOCK(i) \ | ||
440 | BLK64(PF0, LD, i) \ | ||
441 | BLK64(PF1, XO1, i) \ | ||
442 | BLK64(PF2, XO2, i) \ | ||
443 | BLK64(PF3, XO3, i) \ | ||
444 | BLK64(PF4, XO4, i) \ | ||
445 | BLK64(NOP, ST, i) \ | ||
446 | |||
447 | " .align 32 ;\n" | ||
448 | " 1: ;\n" | ||
449 | |||
450 | BLOCK(0) | ||
451 | BLOCK(4) | ||
452 | BLOCK(8) | ||
453 | BLOCK(12) | ||
454 | |||
455 | " add %[inc], %[p1] ;\n" | ||
456 | " add %[inc], %[p2] ;\n" | ||
457 | " add %[inc], %[p3] ;\n" | ||
458 | " add %[inc], %[p4] ;\n" | ||
459 | " add %[inc], %[p5] ;\n" | ||
460 | " dec %[cnt] ;\n" | ||
461 | " jnz 1b ;\n" | ||
462 | : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2), | ||
463 | [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5) | ||
464 | : [inc] XOR_CONSTANT_CONSTRAINT (256UL) | ||
465 | : "memory"); | ||
466 | |||
467 | kernel_fpu_end(); | ||
468 | } | ||
469 | |||
470 | static struct xor_block_template xor_block_sse_pf64 = { | ||
471 | .name = "prefetch64-sse", | ||
472 | .do_2 = xor_sse_2_pf64, | ||
473 | .do_3 = xor_sse_3_pf64, | ||
474 | .do_4 = xor_sse_4_pf64, | ||
475 | .do_5 = xor_sse_5_pf64, | ||
476 | }; | ||
477 | |||
478 | #undef LD | ||
479 | #undef XO1 | ||
480 | #undef XO2 | ||
481 | #undef XO3 | ||
482 | #undef XO4 | ||
483 | #undef ST | ||
484 | #undef NOP | ||
485 | #undef BLK64 | ||
486 | #undef BLOCK | ||
487 | |||
488 | #undef XOR_CONSTANT_CONSTRAINT | ||
489 | |||
5 | #ifdef CONFIG_X86_32 | 490 | #ifdef CONFIG_X86_32 |
6 | # include <asm/xor_32.h> | 491 | # include <asm/xor_32.h> |
7 | #else | 492 | #else |
8 | # include <asm/xor_64.h> | 493 | # include <asm/xor_64.h> |
9 | #endif | 494 | #endif |
10 | #endif | 495 | |
/* Template selection is handed straight to AVX_SELECT(). */
#define XOR_SELECT_TEMPLATE(FASTEST) AVX_SELECT(FASTEST)
498 | |||
499 | #endif /* _ASM_X86_XOR_H */ | ||