diff options
author | Jan Beulich <JBeulich@suse.com> | 2012-11-02 10:19:18 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2013-01-25 03:23:50 -0500 |
commit | e8f6e3f8a14bae98197c6d9f280cd23d22eb1a33 (patch) | |
tree | aefda4da8f27c865d858147864df8f0cd92b0efb /arch | |
parent | e3e81aca8d51a50e19d6c67fafc4c9c4f0404bf1 (diff) |
x86/xor: Unify SSE-base xor-block routines
Besides folding duplicate code, this has the advantage of fixing
x86-64's failure to use proper (para-virtualizable) accessors
for dealing with CR0.TS.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/5093E47602000078000A615B@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch')
-rw-r--r-- | arch/x86/include/asm/xor.h | 319 | ||||
-rw-r--r-- | arch/x86/include/asm/xor_32.h | 286 | ||||
-rw-r--r-- | arch/x86/include/asm/xor_64.h | 295 |
3 files changed, 319 insertions, 581 deletions
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h index f8fde90bc45e..c661571ca0b7 100644 --- a/arch/x86/include/asm/xor.h +++ b/arch/x86/include/asm/xor.h | |||
@@ -1,10 +1,327 @@ | |||
1 | #ifdef CONFIG_KMEMCHECK | 1 | #ifdef CONFIG_KMEMCHECK |
2 | /* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */ | 2 | /* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */ |
3 | # include <asm-generic/xor.h> | 3 | # include <asm-generic/xor.h> |
4 | #elif !defined(_ASM_X86_XOR_H) | ||
5 | #define _ASM_X86_XOR_H | ||
6 | |||
7 | /* | ||
8 | * Optimized RAID-5 checksumming functions for SSE. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or modify | ||
11 | * it under the terms of the GNU General Public License as published by | ||
12 | * the Free Software Foundation; either version 2, or (at your option) | ||
13 | * any later version. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * (for example /usr/src/linux/COPYING); if not, write to the Free | ||
17 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
18 | */ | ||
19 | |||
20 | /* | ||
21 | * Cache avoiding checksumming functions utilizing KNI instructions | ||
22 | * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) | ||
23 | */ | ||
24 | |||
25 | /* | ||
26 | * Based on | ||
27 | * High-speed RAID5 checksumming functions utilizing SSE instructions. | ||
28 | * Copyright (C) 1998 Ingo Molnar. | ||
29 | */ | ||
30 | |||
31 | /* | ||
32 | * x86-64 changes / gcc fixes from Andi Kleen. | ||
33 | * Copyright 2002 Andi Kleen, SuSE Labs. | ||
34 | * | ||
35 | * This hasn't been optimized for the hammer yet, but there are likely | ||
36 | * no advantages to be gotten from x86-64 here anyways. | ||
37 | */ | ||
38 | |||
39 | #include <asm/i387.h> | ||
40 | |||
41 | #ifdef CONFIG_X86_32 | ||
42 | /* reduce register pressure */ | ||
43 | # define XOR_CONSTANT_CONSTRAINT "i" | ||
4 | #else | 44 | #else |
45 | # define XOR_CONSTANT_CONSTRAINT "re" | ||
46 | #endif | ||
47 | |||
48 | #define OFFS(x) "16*("#x")" | ||
49 | #define PF_OFFS(x) "256+16*("#x")" | ||
50 | #define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n" | ||
51 | #define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n" | ||
52 | #define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n" | ||
53 | #define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n" | ||
54 | #define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n" | ||
55 | #define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n" | ||
56 | #define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n" | ||
57 | #define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n" | ||
58 | #define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n" | ||
59 | #define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n" | ||
60 | #define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n" | ||
61 | |||
62 | static void | ||
63 | xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) | ||
64 | { | ||
65 | unsigned long lines = bytes >> 8; | ||
66 | |||
67 | kernel_fpu_begin(); | ||
68 | |||
69 | asm volatile( | ||
70 | #undef BLOCK | ||
71 | #define BLOCK(i) \ | ||
72 | LD(i, 0) \ | ||
73 | LD(i + 1, 1) \ | ||
74 | PF1(i) \ | ||
75 | PF1(i + 2) \ | ||
76 | LD(i + 2, 2) \ | ||
77 | LD(i + 3, 3) \ | ||
78 | PF0(i + 4) \ | ||
79 | PF0(i + 6) \ | ||
80 | XO1(i, 0) \ | ||
81 | XO1(i + 1, 1) \ | ||
82 | XO1(i + 2, 2) \ | ||
83 | XO1(i + 3, 3) \ | ||
84 | ST(i, 0) \ | ||
85 | ST(i + 1, 1) \ | ||
86 | ST(i + 2, 2) \ | ||
87 | ST(i + 3, 3) \ | ||
88 | |||
89 | |||
90 | PF0(0) | ||
91 | PF0(2) | ||
92 | |||
93 | " .align 32 ;\n" | ||
94 | " 1: ;\n" | ||
95 | |||
96 | BLOCK(0) | ||
97 | BLOCK(4) | ||
98 | BLOCK(8) | ||
99 | BLOCK(12) | ||
100 | |||
101 | " add %[inc], %[p1] ;\n" | ||
102 | " add %[inc], %[p2] ;\n" | ||
103 | " dec %[cnt] ;\n" | ||
104 | " jnz 1b ;\n" | ||
105 | : [cnt] "+r" (lines), | ||
106 | [p1] "+r" (p1), [p2] "+r" (p2) | ||
107 | : [inc] XOR_CONSTANT_CONSTRAINT (256UL) | ||
108 | : "memory"); | ||
109 | |||
110 | kernel_fpu_end(); | ||
111 | } | ||
112 | |||
113 | static void | ||
114 | xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
115 | unsigned long *p3) | ||
116 | { | ||
117 | unsigned long lines = bytes >> 8; | ||
118 | |||
119 | kernel_fpu_begin(); | ||
120 | |||
121 | asm volatile( | ||
122 | #undef BLOCK | ||
123 | #define BLOCK(i) \ | ||
124 | PF1(i) \ | ||
125 | PF1(i + 2) \ | ||
126 | LD(i, 0) \ | ||
127 | LD(i + 1, 1) \ | ||
128 | LD(i + 2, 2) \ | ||
129 | LD(i + 3, 3) \ | ||
130 | PF2(i) \ | ||
131 | PF2(i + 2) \ | ||
132 | PF0(i + 4) \ | ||
133 | PF0(i + 6) \ | ||
134 | XO1(i, 0) \ | ||
135 | XO1(i + 1, 1) \ | ||
136 | XO1(i + 2, 2) \ | ||
137 | XO1(i + 3, 3) \ | ||
138 | XO2(i, 0) \ | ||
139 | XO2(i + 1, 1) \ | ||
140 | XO2(i + 2, 2) \ | ||
141 | XO2(i + 3, 3) \ | ||
142 | ST(i, 0) \ | ||
143 | ST(i + 1, 1) \ | ||
144 | ST(i + 2, 2) \ | ||
145 | ST(i + 3, 3) \ | ||
146 | |||
147 | |||
148 | PF0(0) | ||
149 | PF0(2) | ||
150 | |||
151 | " .align 32 ;\n" | ||
152 | " 1: ;\n" | ||
153 | |||
154 | BLOCK(0) | ||
155 | BLOCK(4) | ||
156 | BLOCK(8) | ||
157 | BLOCK(12) | ||
158 | |||
159 | " add %[inc], %[p1] ;\n" | ||
160 | " add %[inc], %[p2] ;\n" | ||
161 | " add %[inc], %[p3] ;\n" | ||
162 | " dec %[cnt] ;\n" | ||
163 | " jnz 1b ;\n" | ||
164 | : [cnt] "+r" (lines), | ||
165 | [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) | ||
166 | : [inc] XOR_CONSTANT_CONSTRAINT (256UL) | ||
167 | : "memory"); | ||
168 | |||
169 | kernel_fpu_end(); | ||
170 | } | ||
171 | |||
172 | static void | ||
173 | xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
174 | unsigned long *p3, unsigned long *p4) | ||
175 | { | ||
176 | unsigned long lines = bytes >> 8; | ||
177 | |||
178 | kernel_fpu_begin(); | ||
179 | |||
180 | asm volatile( | ||
181 | #undef BLOCK | ||
182 | #define BLOCK(i) \ | ||
183 | PF1(i) \ | ||
184 | PF1(i + 2) \ | ||
185 | LD(i, 0) \ | ||
186 | LD(i + 1, 1) \ | ||
187 | LD(i + 2, 2) \ | ||
188 | LD(i + 3, 3) \ | ||
189 | PF2(i) \ | ||
190 | PF2(i + 2) \ | ||
191 | XO1(i, 0) \ | ||
192 | XO1(i + 1, 1) \ | ||
193 | XO1(i + 2, 2) \ | ||
194 | XO1(i + 3, 3) \ | ||
195 | PF3(i) \ | ||
196 | PF3(i + 2) \ | ||
197 | PF0(i + 4) \ | ||
198 | PF0(i + 6) \ | ||
199 | XO2(i, 0) \ | ||
200 | XO2(i + 1, 1) \ | ||
201 | XO2(i + 2, 2) \ | ||
202 | XO2(i + 3, 3) \ | ||
203 | XO3(i, 0) \ | ||
204 | XO3(i + 1, 1) \ | ||
205 | XO3(i + 2, 2) \ | ||
206 | XO3(i + 3, 3) \ | ||
207 | ST(i, 0) \ | ||
208 | ST(i + 1, 1) \ | ||
209 | ST(i + 2, 2) \ | ||
210 | ST(i + 3, 3) \ | ||
211 | |||
212 | |||
213 | PF0(0) | ||
214 | PF0(2) | ||
215 | |||
216 | " .align 32 ;\n" | ||
217 | " 1: ;\n" | ||
218 | |||
219 | BLOCK(0) | ||
220 | BLOCK(4) | ||
221 | BLOCK(8) | ||
222 | BLOCK(12) | ||
223 | |||
224 | " add %[inc], %[p1] ;\n" | ||
225 | " add %[inc], %[p2] ;\n" | ||
226 | " add %[inc], %[p3] ;\n" | ||
227 | " add %[inc], %[p4] ;\n" | ||
228 | " dec %[cnt] ;\n" | ||
229 | " jnz 1b ;\n" | ||
230 | : [cnt] "+r" (lines), [p1] "+r" (p1), | ||
231 | [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) | ||
232 | : [inc] XOR_CONSTANT_CONSTRAINT (256UL) | ||
233 | : "memory"); | ||
234 | |||
235 | kernel_fpu_end(); | ||
236 | } | ||
237 | |||
238 | static void | ||
239 | xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
240 | unsigned long *p3, unsigned long *p4, unsigned long *p5) | ||
241 | { | ||
242 | unsigned long lines = bytes >> 8; | ||
243 | |||
244 | kernel_fpu_begin(); | ||
245 | |||
246 | asm volatile( | ||
247 | #undef BLOCK | ||
248 | #define BLOCK(i) \ | ||
249 | PF1(i) \ | ||
250 | PF1(i + 2) \ | ||
251 | LD(i, 0) \ | ||
252 | LD(i + 1, 1) \ | ||
253 | LD(i + 2, 2) \ | ||
254 | LD(i + 3, 3) \ | ||
255 | PF2(i) \ | ||
256 | PF2(i + 2) \ | ||
257 | XO1(i, 0) \ | ||
258 | XO1(i + 1, 1) \ | ||
259 | XO1(i + 2, 2) \ | ||
260 | XO1(i + 3, 3) \ | ||
261 | PF3(i) \ | ||
262 | PF3(i + 2) \ | ||
263 | XO2(i, 0) \ | ||
264 | XO2(i + 1, 1) \ | ||
265 | XO2(i + 2, 2) \ | ||
266 | XO2(i + 3, 3) \ | ||
267 | PF4(i) \ | ||
268 | PF4(i + 2) \ | ||
269 | PF0(i + 4) \ | ||
270 | PF0(i + 6) \ | ||
271 | XO3(i, 0) \ | ||
272 | XO3(i + 1, 1) \ | ||
273 | XO3(i + 2, 2) \ | ||
274 | XO3(i + 3, 3) \ | ||
275 | XO4(i, 0) \ | ||
276 | XO4(i + 1, 1) \ | ||
277 | XO4(i + 2, 2) \ | ||
278 | XO4(i + 3, 3) \ | ||
279 | ST(i, 0) \ | ||
280 | ST(i + 1, 1) \ | ||
281 | ST(i + 2, 2) \ | ||
282 | ST(i + 3, 3) \ | ||
283 | |||
284 | |||
285 | PF0(0) | ||
286 | PF0(2) | ||
287 | |||
288 | " .align 32 ;\n" | ||
289 | " 1: ;\n" | ||
290 | |||
291 | BLOCK(0) | ||
292 | BLOCK(4) | ||
293 | BLOCK(8) | ||
294 | BLOCK(12) | ||
295 | |||
296 | " add %[inc], %[p1] ;\n" | ||
297 | " add %[inc], %[p2] ;\n" | ||
298 | " add %[inc], %[p3] ;\n" | ||
299 | " add %[inc], %[p4] ;\n" | ||
300 | " add %[inc], %[p5] ;\n" | ||
301 | " dec %[cnt] ;\n" | ||
302 | " jnz 1b ;\n" | ||
303 | : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2), | ||
304 | [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5) | ||
305 | : [inc] XOR_CONSTANT_CONSTRAINT (256UL) | ||
306 | : "memory"); | ||
307 | |||
308 | kernel_fpu_end(); | ||
309 | } | ||
310 | |||
311 | #undef LD | ||
312 | #undef XO1 | ||
313 | #undef XO2 | ||
314 | #undef XO3 | ||
315 | #undef XO4 | ||
316 | #undef ST | ||
317 | #undef BLOCK | ||
318 | |||
319 | #undef XOR_CONSTANT_CONSTRAINT | ||
320 | |||
5 | #ifdef CONFIG_X86_32 | 321 | #ifdef CONFIG_X86_32 |
6 | # include <asm/xor_32.h> | 322 | # include <asm/xor_32.h> |
7 | #else | 323 | #else |
8 | # include <asm/xor_64.h> | 324 | # include <asm/xor_64.h> |
9 | #endif | 325 | #endif |
10 | #endif | 326 | |
327 | #endif /* _ASM_X86_XOR_H */ | ||
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h index f79cb7ec0e06..b85dc87f3cc7 100644 --- a/arch/x86/include/asm/xor_32.h +++ b/arch/x86/include/asm/xor_32.h | |||
@@ -2,7 +2,7 @@ | |||
2 | #define _ASM_X86_XOR_32_H | 2 | #define _ASM_X86_XOR_32_H |
3 | 3 | ||
4 | /* | 4 | /* |
5 | * Optimized RAID-5 checksumming functions for MMX and SSE. | 5 | * Optimized RAID-5 checksumming functions for MMX. |
6 | * | 6 | * |
7 | * This program is free software; you can redistribute it and/or modify | 7 | * This program is free software; you can redistribute it and/or modify |
8 | * it under the terms of the GNU General Public License as published by | 8 | * it under the terms of the GNU General Public License as published by |
@@ -529,290 +529,6 @@ static struct xor_block_template xor_block_p5_mmx = { | |||
529 | .do_5 = xor_p5_mmx_5, | 529 | .do_5 = xor_p5_mmx_5, |
530 | }; | 530 | }; |
531 | 531 | ||
532 | /* | ||
533 | * Cache avoiding checksumming functions utilizing KNI instructions | ||
534 | * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) | ||
535 | */ | ||
536 | |||
537 | #define OFFS(x) "16*("#x")" | ||
538 | #define PF_OFFS(x) "256+16*("#x")" | ||
539 | #define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n" | ||
540 | #define LD(x, y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n" | ||
541 | #define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n" | ||
542 | #define PF1(x) " prefetchnta "PF_OFFS(x)"(%2) ;\n" | ||
543 | #define PF2(x) " prefetchnta "PF_OFFS(x)"(%3) ;\n" | ||
544 | #define PF3(x) " prefetchnta "PF_OFFS(x)"(%4) ;\n" | ||
545 | #define PF4(x) " prefetchnta "PF_OFFS(x)"(%5) ;\n" | ||
546 | #define PF5(x) " prefetchnta "PF_OFFS(x)"(%6) ;\n" | ||
547 | #define XO1(x, y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n" | ||
548 | #define XO2(x, y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n" | ||
549 | #define XO3(x, y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n" | ||
550 | #define XO4(x, y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n" | ||
551 | #define XO5(x, y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n" | ||
552 | |||
553 | |||
554 | static void | ||
555 | xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) | ||
556 | { | ||
557 | unsigned long lines = bytes >> 8; | ||
558 | |||
559 | kernel_fpu_begin(); | ||
560 | |||
561 | asm volatile( | ||
562 | #undef BLOCK | ||
563 | #define BLOCK(i) \ | ||
564 | LD(i, 0) \ | ||
565 | LD(i + 1, 1) \ | ||
566 | PF1(i) \ | ||
567 | PF1(i + 2) \ | ||
568 | LD(i + 2, 2) \ | ||
569 | LD(i + 3, 3) \ | ||
570 | PF0(i + 4) \ | ||
571 | PF0(i + 6) \ | ||
572 | XO1(i, 0) \ | ||
573 | XO1(i + 1, 1) \ | ||
574 | XO1(i + 2, 2) \ | ||
575 | XO1(i + 3, 3) \ | ||
576 | ST(i, 0) \ | ||
577 | ST(i + 1, 1) \ | ||
578 | ST(i + 2, 2) \ | ||
579 | ST(i + 3, 3) \ | ||
580 | |||
581 | |||
582 | PF0(0) | ||
583 | PF0(2) | ||
584 | |||
585 | " .align 32 ;\n" | ||
586 | " 1: ;\n" | ||
587 | |||
588 | BLOCK(0) | ||
589 | BLOCK(4) | ||
590 | BLOCK(8) | ||
591 | BLOCK(12) | ||
592 | |||
593 | " addl $256, %1 ;\n" | ||
594 | " addl $256, %2 ;\n" | ||
595 | " decl %0 ;\n" | ||
596 | " jnz 1b ;\n" | ||
597 | : "+r" (lines), | ||
598 | "+r" (p1), "+r" (p2) | ||
599 | : | ||
600 | : "memory"); | ||
601 | |||
602 | kernel_fpu_end(); | ||
603 | } | ||
604 | |||
605 | static void | ||
606 | xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
607 | unsigned long *p3) | ||
608 | { | ||
609 | unsigned long lines = bytes >> 8; | ||
610 | |||
611 | kernel_fpu_begin(); | ||
612 | |||
613 | asm volatile( | ||
614 | #undef BLOCK | ||
615 | #define BLOCK(i) \ | ||
616 | PF1(i) \ | ||
617 | PF1(i + 2) \ | ||
618 | LD(i,0) \ | ||
619 | LD(i + 1, 1) \ | ||
620 | LD(i + 2, 2) \ | ||
621 | LD(i + 3, 3) \ | ||
622 | PF2(i) \ | ||
623 | PF2(i + 2) \ | ||
624 | PF0(i + 4) \ | ||
625 | PF0(i + 6) \ | ||
626 | XO1(i,0) \ | ||
627 | XO1(i + 1, 1) \ | ||
628 | XO1(i + 2, 2) \ | ||
629 | XO1(i + 3, 3) \ | ||
630 | XO2(i,0) \ | ||
631 | XO2(i + 1, 1) \ | ||
632 | XO2(i + 2, 2) \ | ||
633 | XO2(i + 3, 3) \ | ||
634 | ST(i,0) \ | ||
635 | ST(i + 1, 1) \ | ||
636 | ST(i + 2, 2) \ | ||
637 | ST(i + 3, 3) \ | ||
638 | |||
639 | |||
640 | PF0(0) | ||
641 | PF0(2) | ||
642 | |||
643 | " .align 32 ;\n" | ||
644 | " 1: ;\n" | ||
645 | |||
646 | BLOCK(0) | ||
647 | BLOCK(4) | ||
648 | BLOCK(8) | ||
649 | BLOCK(12) | ||
650 | |||
651 | " addl $256, %1 ;\n" | ||
652 | " addl $256, %2 ;\n" | ||
653 | " addl $256, %3 ;\n" | ||
654 | " decl %0 ;\n" | ||
655 | " jnz 1b ;\n" | ||
656 | : "+r" (lines), | ||
657 | "+r" (p1), "+r"(p2), "+r"(p3) | ||
658 | : | ||
659 | : "memory" ); | ||
660 | |||
661 | kernel_fpu_end(); | ||
662 | } | ||
663 | |||
664 | static void | ||
665 | xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
666 | unsigned long *p3, unsigned long *p4) | ||
667 | { | ||
668 | unsigned long lines = bytes >> 8; | ||
669 | |||
670 | kernel_fpu_begin(); | ||
671 | |||
672 | asm volatile( | ||
673 | #undef BLOCK | ||
674 | #define BLOCK(i) \ | ||
675 | PF1(i) \ | ||
676 | PF1(i + 2) \ | ||
677 | LD(i,0) \ | ||
678 | LD(i + 1, 1) \ | ||
679 | LD(i + 2, 2) \ | ||
680 | LD(i + 3, 3) \ | ||
681 | PF2(i) \ | ||
682 | PF2(i + 2) \ | ||
683 | XO1(i,0) \ | ||
684 | XO1(i + 1, 1) \ | ||
685 | XO1(i + 2, 2) \ | ||
686 | XO1(i + 3, 3) \ | ||
687 | PF3(i) \ | ||
688 | PF3(i + 2) \ | ||
689 | PF0(i + 4) \ | ||
690 | PF0(i + 6) \ | ||
691 | XO2(i,0) \ | ||
692 | XO2(i + 1, 1) \ | ||
693 | XO2(i + 2, 2) \ | ||
694 | XO2(i + 3, 3) \ | ||
695 | XO3(i,0) \ | ||
696 | XO3(i + 1, 1) \ | ||
697 | XO3(i + 2, 2) \ | ||
698 | XO3(i + 3, 3) \ | ||
699 | ST(i,0) \ | ||
700 | ST(i + 1, 1) \ | ||
701 | ST(i + 2, 2) \ | ||
702 | ST(i + 3, 3) \ | ||
703 | |||
704 | |||
705 | PF0(0) | ||
706 | PF0(2) | ||
707 | |||
708 | " .align 32 ;\n" | ||
709 | " 1: ;\n" | ||
710 | |||
711 | BLOCK(0) | ||
712 | BLOCK(4) | ||
713 | BLOCK(8) | ||
714 | BLOCK(12) | ||
715 | |||
716 | " addl $256, %1 ;\n" | ||
717 | " addl $256, %2 ;\n" | ||
718 | " addl $256, %3 ;\n" | ||
719 | " addl $256, %4 ;\n" | ||
720 | " decl %0 ;\n" | ||
721 | " jnz 1b ;\n" | ||
722 | : "+r" (lines), | ||
723 | "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) | ||
724 | : | ||
725 | : "memory" ); | ||
726 | |||
727 | kernel_fpu_end(); | ||
728 | } | ||
729 | |||
730 | static void | ||
731 | xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
732 | unsigned long *p3, unsigned long *p4, unsigned long *p5) | ||
733 | { | ||
734 | unsigned long lines = bytes >> 8; | ||
735 | |||
736 | kernel_fpu_begin(); | ||
737 | |||
738 | /* Make sure GCC forgets anything it knows about p4 or p5, | ||
739 | such that it won't pass to the asm volatile below a | ||
740 | register that is shared with any other variable. That's | ||
741 | because we modify p4 and p5 there, but we can't mark them | ||
742 | as read/write, otherwise we'd overflow the 10-asm-operands | ||
743 | limit of GCC < 3.1. */ | ||
744 | asm("" : "+r" (p4), "+r" (p5)); | ||
745 | |||
746 | asm volatile( | ||
747 | #undef BLOCK | ||
748 | #define BLOCK(i) \ | ||
749 | PF1(i) \ | ||
750 | PF1(i + 2) \ | ||
751 | LD(i,0) \ | ||
752 | LD(i + 1, 1) \ | ||
753 | LD(i + 2, 2) \ | ||
754 | LD(i + 3, 3) \ | ||
755 | PF2(i) \ | ||
756 | PF2(i + 2) \ | ||
757 | XO1(i,0) \ | ||
758 | XO1(i + 1, 1) \ | ||
759 | XO1(i + 2, 2) \ | ||
760 | XO1(i + 3, 3) \ | ||
761 | PF3(i) \ | ||
762 | PF3(i + 2) \ | ||
763 | XO2(i,0) \ | ||
764 | XO2(i + 1, 1) \ | ||
765 | XO2(i + 2, 2) \ | ||
766 | XO2(i + 3, 3) \ | ||
767 | PF4(i) \ | ||
768 | PF4(i + 2) \ | ||
769 | PF0(i + 4) \ | ||
770 | PF0(i + 6) \ | ||
771 | XO3(i,0) \ | ||
772 | XO3(i + 1, 1) \ | ||
773 | XO3(i + 2, 2) \ | ||
774 | XO3(i + 3, 3) \ | ||
775 | XO4(i,0) \ | ||
776 | XO4(i + 1, 1) \ | ||
777 | XO4(i + 2, 2) \ | ||
778 | XO4(i + 3, 3) \ | ||
779 | ST(i,0) \ | ||
780 | ST(i + 1, 1) \ | ||
781 | ST(i + 2, 2) \ | ||
782 | ST(i + 3, 3) \ | ||
783 | |||
784 | |||
785 | PF0(0) | ||
786 | PF0(2) | ||
787 | |||
788 | " .align 32 ;\n" | ||
789 | " 1: ;\n" | ||
790 | |||
791 | BLOCK(0) | ||
792 | BLOCK(4) | ||
793 | BLOCK(8) | ||
794 | BLOCK(12) | ||
795 | |||
796 | " addl $256, %1 ;\n" | ||
797 | " addl $256, %2 ;\n" | ||
798 | " addl $256, %3 ;\n" | ||
799 | " addl $256, %4 ;\n" | ||
800 | " addl $256, %5 ;\n" | ||
801 | " decl %0 ;\n" | ||
802 | " jnz 1b ;\n" | ||
803 | : "+r" (lines), | ||
804 | "+r" (p1), "+r" (p2), "+r" (p3) | ||
805 | : "r" (p4), "r" (p5) | ||
806 | : "memory"); | ||
807 | |||
808 | /* p4 and p5 were modified, and now the variables are dead. | ||
809 | Clobber them just to be sure nobody does something stupid | ||
810 | like assuming they have some legal value. */ | ||
811 | asm("" : "=r" (p4), "=r" (p5)); | ||
812 | |||
813 | kernel_fpu_end(); | ||
814 | } | ||
815 | |||
816 | static struct xor_block_template xor_block_pIII_sse = { | 532 | static struct xor_block_template xor_block_pIII_sse = { |
817 | .name = "pIII_sse", | 533 | .name = "pIII_sse", |
818 | .do_2 = xor_sse_2, | 534 | .do_2 = xor_sse_2, |
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h index 87ac522c4af5..1baf89dcc423 100644 --- a/arch/x86/include/asm/xor_64.h +++ b/arch/x86/include/asm/xor_64.h | |||
@@ -1,301 +1,6 @@ | |||
1 | #ifndef _ASM_X86_XOR_64_H | 1 | #ifndef _ASM_X86_XOR_64_H |
2 | #define _ASM_X86_XOR_64_H | 2 | #define _ASM_X86_XOR_64_H |
3 | 3 | ||
4 | /* | ||
5 | * Optimized RAID-5 checksumming functions for MMX and SSE. | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License as published by | ||
9 | * the Free Software Foundation; either version 2, or (at your option) | ||
10 | * any later version. | ||
11 | * | ||
12 | * You should have received a copy of the GNU General Public License | ||
13 | * (for example /usr/src/linux/COPYING); if not, write to the Free | ||
14 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
15 | */ | ||
16 | |||
17 | |||
18 | /* | ||
19 | * Cache avoiding checksumming functions utilizing KNI instructions | ||
20 | * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) | ||
21 | */ | ||
22 | |||
23 | /* | ||
24 | * Based on | ||
25 | * High-speed RAID5 checksumming functions utilizing SSE instructions. | ||
26 | * Copyright (C) 1998 Ingo Molnar. | ||
27 | */ | ||
28 | |||
29 | /* | ||
30 | * x86-64 changes / gcc fixes from Andi Kleen. | ||
31 | * Copyright 2002 Andi Kleen, SuSE Labs. | ||
32 | * | ||
33 | * This hasn't been optimized for the hammer yet, but there are likely | ||
34 | * no advantages to be gotten from x86-64 here anyways. | ||
35 | */ | ||
36 | |||
37 | #include <asm/i387.h> | ||
38 | |||
39 | #define OFFS(x) "16*("#x")" | ||
40 | #define PF_OFFS(x) "256+16*("#x")" | ||
41 | #define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n" | ||
42 | #define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n" | ||
43 | #define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n" | ||
44 | #define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n" | ||
45 | #define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n" | ||
46 | #define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n" | ||
47 | #define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n" | ||
48 | #define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n" | ||
49 | #define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n" | ||
50 | #define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n" | ||
51 | #define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n" | ||
52 | #define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n" | ||
53 | #define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n" | ||
54 | |||
55 | |||
56 | static void | ||
57 | xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) | ||
58 | { | ||
59 | unsigned int lines = bytes >> 8; | ||
60 | |||
61 | kernel_fpu_begin(); | ||
62 | |||
63 | asm volatile( | ||
64 | #undef BLOCK | ||
65 | #define BLOCK(i) \ | ||
66 | LD(i, 0) \ | ||
67 | LD(i + 1, 1) \ | ||
68 | PF1(i) \ | ||
69 | PF1(i + 2) \ | ||
70 | LD(i + 2, 2) \ | ||
71 | LD(i + 3, 3) \ | ||
72 | PF0(i + 4) \ | ||
73 | PF0(i + 6) \ | ||
74 | XO1(i, 0) \ | ||
75 | XO1(i + 1, 1) \ | ||
76 | XO1(i + 2, 2) \ | ||
77 | XO1(i + 3, 3) \ | ||
78 | ST(i, 0) \ | ||
79 | ST(i + 1, 1) \ | ||
80 | ST(i + 2, 2) \ | ||
81 | ST(i + 3, 3) \ | ||
82 | |||
83 | |||
84 | PF0(0) | ||
85 | PF0(2) | ||
86 | |||
87 | " .align 32 ;\n" | ||
88 | " 1: ;\n" | ||
89 | |||
90 | BLOCK(0) | ||
91 | BLOCK(4) | ||
92 | BLOCK(8) | ||
93 | BLOCK(12) | ||
94 | |||
95 | " addq %[inc], %[p1] ;\n" | ||
96 | " addq %[inc], %[p2] ;\n" | ||
97 | " decl %[cnt] ; jnz 1b" | ||
98 | : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines) | ||
99 | : [inc] "r" (256UL) | ||
100 | : "memory"); | ||
101 | |||
102 | kernel_fpu_end(); | ||
103 | } | ||
104 | |||
105 | static void | ||
106 | xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
107 | unsigned long *p3) | ||
108 | { | ||
109 | unsigned int lines = bytes >> 8; | ||
110 | |||
111 | kernel_fpu_begin(); | ||
112 | asm volatile( | ||
113 | #undef BLOCK | ||
114 | #define BLOCK(i) \ | ||
115 | PF1(i) \ | ||
116 | PF1(i + 2) \ | ||
117 | LD(i, 0) \ | ||
118 | LD(i + 1, 1) \ | ||
119 | LD(i + 2, 2) \ | ||
120 | LD(i + 3, 3) \ | ||
121 | PF2(i) \ | ||
122 | PF2(i + 2) \ | ||
123 | PF0(i + 4) \ | ||
124 | PF0(i + 6) \ | ||
125 | XO1(i, 0) \ | ||
126 | XO1(i + 1, 1) \ | ||
127 | XO1(i + 2, 2) \ | ||
128 | XO1(i + 3, 3) \ | ||
129 | XO2(i, 0) \ | ||
130 | XO2(i + 1, 1) \ | ||
131 | XO2(i + 2, 2) \ | ||
132 | XO2(i + 3, 3) \ | ||
133 | ST(i, 0) \ | ||
134 | ST(i + 1, 1) \ | ||
135 | ST(i + 2, 2) \ | ||
136 | ST(i + 3, 3) \ | ||
137 | |||
138 | |||
139 | PF0(0) | ||
140 | PF0(2) | ||
141 | |||
142 | " .align 32 ;\n" | ||
143 | " 1: ;\n" | ||
144 | |||
145 | BLOCK(0) | ||
146 | BLOCK(4) | ||
147 | BLOCK(8) | ||
148 | BLOCK(12) | ||
149 | |||
150 | " addq %[inc], %[p1] ;\n" | ||
151 | " addq %[inc], %[p2] ;\n" | ||
152 | " addq %[inc], %[p3] ;\n" | ||
153 | " decl %[cnt] ; jnz 1b" | ||
154 | : [cnt] "+r" (lines), | ||
155 | [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) | ||
156 | : [inc] "r" (256UL) | ||
157 | : "memory"); | ||
158 | kernel_fpu_end(); | ||
159 | } | ||
160 | |||
161 | static void | ||
162 | xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
163 | unsigned long *p3, unsigned long *p4) | ||
164 | { | ||
165 | unsigned int lines = bytes >> 8; | ||
166 | |||
167 | kernel_fpu_begin(); | ||
168 | |||
169 | asm volatile( | ||
170 | #undef BLOCK | ||
171 | #define BLOCK(i) \ | ||
172 | PF1(i) \ | ||
173 | PF1(i + 2) \ | ||
174 | LD(i, 0) \ | ||
175 | LD(i + 1, 1) \ | ||
176 | LD(i + 2, 2) \ | ||
177 | LD(i + 3, 3) \ | ||
178 | PF2(i) \ | ||
179 | PF2(i + 2) \ | ||
180 | XO1(i, 0) \ | ||
181 | XO1(i + 1, 1) \ | ||
182 | XO1(i + 2, 2) \ | ||
183 | XO1(i + 3, 3) \ | ||
184 | PF3(i) \ | ||
185 | PF3(i + 2) \ | ||
186 | PF0(i + 4) \ | ||
187 | PF0(i + 6) \ | ||
188 | XO2(i, 0) \ | ||
189 | XO2(i + 1, 1) \ | ||
190 | XO2(i + 2, 2) \ | ||
191 | XO2(i + 3, 3) \ | ||
192 | XO3(i, 0) \ | ||
193 | XO3(i + 1, 1) \ | ||
194 | XO3(i + 2, 2) \ | ||
195 | XO3(i + 3, 3) \ | ||
196 | ST(i, 0) \ | ||
197 | ST(i + 1, 1) \ | ||
198 | ST(i + 2, 2) \ | ||
199 | ST(i + 3, 3) \ | ||
200 | |||
201 | |||
202 | PF0(0) | ||
203 | PF0(2) | ||
204 | |||
205 | " .align 32 ;\n" | ||
206 | " 1: ;\n" | ||
207 | |||
208 | BLOCK(0) | ||
209 | BLOCK(4) | ||
210 | BLOCK(8) | ||
211 | BLOCK(12) | ||
212 | |||
213 | " addq %[inc], %[p1] ;\n" | ||
214 | " addq %[inc], %[p2] ;\n" | ||
215 | " addq %[inc], %[p3] ;\n" | ||
216 | " addq %[inc], %[p4] ;\n" | ||
217 | " decl %[cnt] ; jnz 1b" | ||
218 | : [cnt] "+c" (lines), | ||
219 | [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) | ||
220 | : [inc] "r" (256UL) | ||
221 | : "memory" ); | ||
222 | |||
223 | kernel_fpu_end(); | ||
224 | } | ||
225 | |||
226 | static void | ||
227 | xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
228 | unsigned long *p3, unsigned long *p4, unsigned long *p5) | ||
229 | { | ||
230 | unsigned int lines = bytes >> 8; | ||
231 | |||
232 | kernel_fpu_begin(); | ||
233 | |||
234 | asm volatile( | ||
235 | #undef BLOCK | ||
236 | #define BLOCK(i) \ | ||
237 | PF1(i) \ | ||
238 | PF1(i + 2) \ | ||
239 | LD(i, 0) \ | ||
240 | LD(i + 1, 1) \ | ||
241 | LD(i + 2, 2) \ | ||
242 | LD(i + 3, 3) \ | ||
243 | PF2(i) \ | ||
244 | PF2(i + 2) \ | ||
245 | XO1(i, 0) \ | ||
246 | XO1(i + 1, 1) \ | ||
247 | XO1(i + 2, 2) \ | ||
248 | XO1(i + 3, 3) \ | ||
249 | PF3(i) \ | ||
250 | PF3(i + 2) \ | ||
251 | XO2(i, 0) \ | ||
252 | XO2(i + 1, 1) \ | ||
253 | XO2(i + 2, 2) \ | ||
254 | XO2(i + 3, 3) \ | ||
255 | PF4(i) \ | ||
256 | PF4(i + 2) \ | ||
257 | PF0(i + 4) \ | ||
258 | PF0(i + 6) \ | ||
259 | XO3(i, 0) \ | ||
260 | XO3(i + 1, 1) \ | ||
261 | XO3(i + 2, 2) \ | ||
262 | XO3(i + 3, 3) \ | ||
263 | XO4(i, 0) \ | ||
264 | XO4(i + 1, 1) \ | ||
265 | XO4(i + 2, 2) \ | ||
266 | XO4(i + 3, 3) \ | ||
267 | ST(i, 0) \ | ||
268 | ST(i + 1, 1) \ | ||
269 | ST(i + 2, 2) \ | ||
270 | ST(i + 3, 3) \ | ||
271 | |||
272 | |||
273 | PF0(0) | ||
274 | PF0(2) | ||
275 | |||
276 | " .align 32 ;\n" | ||
277 | " 1: ;\n" | ||
278 | |||
279 | BLOCK(0) | ||
280 | BLOCK(4) | ||
281 | BLOCK(8) | ||
282 | BLOCK(12) | ||
283 | |||
284 | " addq %[inc], %[p1] ;\n" | ||
285 | " addq %[inc], %[p2] ;\n" | ||
286 | " addq %[inc], %[p3] ;\n" | ||
287 | " addq %[inc], %[p4] ;\n" | ||
288 | " addq %[inc], %[p5] ;\n" | ||
289 | " decl %[cnt] ; jnz 1b" | ||
290 | : [cnt] "+c" (lines), | ||
291 | [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4), | ||
292 | [p5] "+r" (p5) | ||
293 | : [inc] "r" (256UL) | ||
294 | : "memory"); | ||
295 | |||
296 | kernel_fpu_end(); | ||
297 | } | ||
298 | |||
299 | static struct xor_block_template xor_block_sse = { | 4 | static struct xor_block_template xor_block_sse = { |
300 | .name = "generic_sse", | 5 | .name = "generic_sse", |
301 | .do_2 = xor_sse_2, | 6 | .do_2 = xor_sse_2, |