diff options
Diffstat (limited to 'include/asm-x86/xor_32.h')
-rw-r--r-- | include/asm-x86/xor_32.h | 888 |
1 files changed, 0 insertions, 888 deletions
diff --git a/include/asm-x86/xor_32.h b/include/asm-x86/xor_32.h deleted file mode 100644 index 921b45840449..000000000000 --- a/include/asm-x86/xor_32.h +++ /dev/null | |||
@@ -1,888 +0,0 @@ | |||
1 | #ifndef ASM_X86__XOR_32_H | ||
2 | #define ASM_X86__XOR_32_H | ||
3 | |||
4 | /* | ||
5 | * Optimized RAID-5 checksumming functions for MMX and SSE. | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License as published by | ||
9 | * the Free Software Foundation; either version 2, or (at your option) | ||
10 | * any later version. | ||
11 | * | ||
12 | * You should have received a copy of the GNU General Public License | ||
13 | * (for example /usr/src/linux/COPYING); if not, write to the Free | ||
14 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
15 | */ | ||
16 | |||
17 | /* | ||
18 | * High-speed RAID5 checksumming functions utilizing MMX instructions. | ||
19 | * Copyright (C) 1998 Ingo Molnar. | ||
20 | */ | ||
21 | |||
/*
 * Assembly-text builders used by the xor_pII_mmx_* routines.
 * x is a quadword index (scaled by 8 bytes), y is an MMX register number.
 * LD loads from and ST stores to the buffer addressed by asm operand %1;
 * XO1..XO4 pxor in the quadword at the same offset from operands %2..%5.
 */
22 | #define LD(x, y) " movq 8*("#x")(%1), %%mm"#y" ;\n" | ||
23 | #define ST(x, y) " movq %%mm"#y", 8*("#x")(%1) ;\n" | ||
24 | #define XO1(x, y) " pxor 8*("#x")(%2), %%mm"#y" ;\n" | ||
25 | #define XO2(x, y) " pxor 8*("#x")(%3), %%mm"#y" ;\n" | ||
26 | #define XO3(x, y) " pxor 8*("#x")(%4), %%mm"#y" ;\n" | ||
27 | #define XO4(x, y) " pxor 8*("#x")(%5), %%mm"#y" ;\n" | ||
28 | |||
29 | #include <asm/i387.h> | ||
30 | |||
/*
 * xor_pII_mmx_2 - XOR the buffer at p2 into the buffer at p1 (p1 ^= p2).
 *
 * bytes: length in bytes; bytes >> 7 loop passes of 128 bytes each are
 *        run, so any remainder below 128 bytes is not processed.
 * p1:    destination and first source, updated in place.
 * p2:    second source.
 *
 * Each pass expands BLOCK four times; one BLOCK moves four quadwords
 * through mm0-mm3.  The counter is decremented and tested only after the
 * loop body, so the body always runs at least once; presumably callers
 * guarantee bytes >= 128 (TODO confirm).  All operands are "+r" because
 * the asm advances both pointers and counts lines down to zero.
 */
31 | static void | ||
32 | xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) | ||
33 | { | ||
34 | unsigned long lines = bytes >> 7; | ||
35 | |||
/* MMX registers are used below; presumably kernel_fpu_begin()/end() make that safe - see <asm/i387.h>. */
36 | kernel_fpu_begin(); | ||
37 | |||
38 | asm volatile( | ||
39 | #undef BLOCK | ||
40 | #define BLOCK(i) \ | ||
41 | LD(i, 0) \ | ||
42 | LD(i + 1, 1) \ | ||
43 | LD(i + 2, 2) \ | ||
44 | LD(i + 3, 3) \ | ||
45 | XO1(i, 0) \ | ||
46 | ST(i, 0) \ | ||
47 | XO1(i+1, 1) \ | ||
48 | ST(i+1, 1) \ | ||
49 | XO1(i + 2, 2) \ | ||
50 | ST(i + 2, 2) \ | ||
51 | XO1(i + 3, 3) \ | ||
52 | ST(i + 3, 3) | ||
53 | |||
54 | " .align 32 ;\n" | ||
55 | " 1: ;\n" | ||
56 | |||
57 | BLOCK(0) | ||
58 | BLOCK(4) | ||
59 | BLOCK(8) | ||
60 | BLOCK(12) | ||
61 | |||
62 | " addl $128, %1 ;\n" | ||
63 | " addl $128, %2 ;\n" | ||
64 | " decl %0 ;\n" | ||
65 | " jnz 1b ;\n" | ||
66 | : "+r" (lines), | ||
67 | "+r" (p1), "+r" (p2) | ||
68 | : | ||
69 | : "memory"); | ||
70 | |||
71 | kernel_fpu_end(); | ||
72 | } | ||
73 | |||
/*
 * xor_pII_mmx_3 - XOR the buffers at p2 and p3 into p1 (p1 ^= p2 ^ p3).
 *
 * bytes: length in bytes; bytes >> 7 loop passes of 128 bytes each are
 *        run, so any remainder below 128 bytes is not processed.
 * p1:    destination and first source, updated in place.
 * p2,p3: additional sources.
 *
 * Each pass expands BLOCK four times; one BLOCK moves four quadwords
 * through mm0-mm3.  The counter is tested only after the loop body, so
 * the body always runs at least once; presumably callers guarantee
 * bytes >= 128 (TODO confirm).
 */
74 | static void | ||
75 | xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
76 | unsigned long *p3) | ||
77 | { | ||
78 | unsigned long lines = bytes >> 7; | ||
79 | |||
/* MMX registers are used below; presumably kernel_fpu_begin()/end() make that safe - see <asm/i387.h>. */
80 | kernel_fpu_begin(); | ||
81 | |||
82 | asm volatile( | ||
83 | #undef BLOCK | ||
84 | #define BLOCK(i) \ | ||
85 | LD(i, 0) \ | ||
86 | LD(i + 1, 1) \ | ||
87 | LD(i + 2, 2) \ | ||
88 | LD(i + 3, 3) \ | ||
89 | XO1(i, 0) \ | ||
90 | XO1(i + 1, 1) \ | ||
91 | XO1(i + 2, 2) \ | ||
92 | XO1(i + 3, 3) \ | ||
93 | XO2(i, 0) \ | ||
94 | ST(i, 0) \ | ||
95 | XO2(i + 1, 1) \ | ||
96 | ST(i + 1, 1) \ | ||
97 | XO2(i + 2, 2) \ | ||
98 | ST(i + 2, 2) \ | ||
99 | XO2(i + 3, 3) \ | ||
100 | ST(i + 3, 3) | ||
101 | |||
102 | " .align 32 ;\n" | ||
103 | " 1: ;\n" | ||
104 | |||
105 | BLOCK(0) | ||
106 | BLOCK(4) | ||
107 | BLOCK(8) | ||
108 | BLOCK(12) | ||
109 | |||
110 | " addl $128, %1 ;\n" | ||
111 | " addl $128, %2 ;\n" | ||
112 | " addl $128, %3 ;\n" | ||
113 | " decl %0 ;\n" | ||
114 | " jnz 1b ;\n" | ||
115 | : "+r" (lines), | ||
116 | "+r" (p1), "+r" (p2), "+r" (p3) | ||
117 | : | ||
118 | : "memory"); | ||
119 | |||
120 | kernel_fpu_end(); | ||
121 | } | ||
122 | |||
/*
 * xor_pII_mmx_4 - XOR the buffers at p2, p3 and p4 into p1
 * (p1 ^= p2 ^ p3 ^ p4).
 *
 * bytes:    length in bytes; bytes >> 7 loop passes of 128 bytes each
 *           are run, so any remainder below 128 bytes is not processed.
 * p1:       destination and first source, updated in place.
 * p2,p3,p4: additional sources.
 *
 * Each pass expands BLOCK four times; one BLOCK moves four quadwords
 * through mm0-mm3.  The counter is tested only after the loop body, so
 * the body always runs at least once; presumably callers guarantee
 * bytes >= 128 (TODO confirm).
 */
123 | static void | ||
124 | xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
125 | unsigned long *p3, unsigned long *p4) | ||
126 | { | ||
127 | unsigned long lines = bytes >> 7; | ||
128 | |||
/* MMX registers are used below; presumably kernel_fpu_begin()/end() make that safe - see <asm/i387.h>. */
129 | kernel_fpu_begin(); | ||
130 | |||
131 | asm volatile( | ||
132 | #undef BLOCK | ||
133 | #define BLOCK(i) \ | ||
134 | LD(i, 0) \ | ||
135 | LD(i + 1, 1) \ | ||
136 | LD(i + 2, 2) \ | ||
137 | LD(i + 3, 3) \ | ||
138 | XO1(i, 0) \ | ||
139 | XO1(i + 1, 1) \ | ||
140 | XO1(i + 2, 2) \ | ||
141 | XO1(i + 3, 3) \ | ||
142 | XO2(i, 0) \ | ||
143 | XO2(i + 1, 1) \ | ||
144 | XO2(i + 2, 2) \ | ||
145 | XO2(i + 3, 3) \ | ||
146 | XO3(i, 0) \ | ||
147 | ST(i, 0) \ | ||
148 | XO3(i + 1, 1) \ | ||
149 | ST(i + 1, 1) \ | ||
150 | XO3(i + 2, 2) \ | ||
151 | ST(i + 2, 2) \ | ||
152 | XO3(i + 3, 3) \ | ||
153 | ST(i + 3, 3) | ||
154 | |||
155 | " .align 32 ;\n" | ||
156 | " 1: ;\n" | ||
157 | |||
158 | BLOCK(0) | ||
159 | BLOCK(4) | ||
160 | BLOCK(8) | ||
161 | BLOCK(12) | ||
162 | |||
163 | " addl $128, %1 ;\n" | ||
164 | " addl $128, %2 ;\n" | ||
165 | " addl $128, %3 ;\n" | ||
166 | " addl $128, %4 ;\n" | ||
167 | " decl %0 ;\n" | ||
168 | " jnz 1b ;\n" | ||
169 | : "+r" (lines), | ||
170 | "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) | ||
171 | : | ||
172 | : "memory"); | ||
173 | |||
174 | kernel_fpu_end(); | ||
175 | } | ||
176 | |||
177 | |||
/*
 * xor_pII_mmx_5 - XOR the buffers at p2..p5 into p1
 * (p1 ^= p2 ^ p3 ^ p4 ^ p5).
 *
 * bytes:  length in bytes; bytes >> 7 loop passes of 128 bytes each are
 *         run, so any remainder below 128 bytes is not processed.
 * p1:     destination and first source, updated in place.
 * p2..p5: additional sources.
 *
 * Same loop shape as the 2/3/4-source variants, with one difference in
 * the operand list: p4 and p5 are passed as plain "r" inputs although the
 * asm adds 128 to them on every pass.  The empty asm statements before
 * and after the main one exist solely to keep that safe (see the comments
 * next to them).  The counter is tested only after the loop body, so the
 * body always runs at least once; presumably callers guarantee
 * bytes >= 128 (TODO confirm).
 */
178 | static void | ||
179 | xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
180 | unsigned long *p3, unsigned long *p4, unsigned long *p5) | ||
181 | { | ||
182 | unsigned long lines = bytes >> 7; | ||
183 | |||
/* MMX registers are used below; presumably kernel_fpu_begin()/end() make that safe - see <asm/i387.h>. */
184 | kernel_fpu_begin(); | ||
185 | |||
186 | /* Make sure GCC forgets anything it knows about p4 or p5, | ||
187 | such that it won't pass to the asm volatile below a | ||
188 | register that is shared with any other variable. That's | ||
189 | because we modify p4 and p5 there, but we can't mark them | ||
190 | as read/write, otherwise we'd overflow the 10-asm-operands | ||
191 | limit of GCC < 3.1. */ | ||
192 | asm("" : "+r" (p4), "+r" (p5)); | ||
193 | |||
194 | asm volatile( | ||
195 | #undef BLOCK | ||
196 | #define BLOCK(i) \ | ||
197 | LD(i, 0) \ | ||
198 | LD(i + 1, 1) \ | ||
199 | LD(i + 2, 2) \ | ||
200 | LD(i + 3, 3) \ | ||
201 | XO1(i, 0) \ | ||
202 | XO1(i + 1, 1) \ | ||
203 | XO1(i + 2, 2) \ | ||
204 | XO1(i + 3, 3) \ | ||
205 | XO2(i, 0) \ | ||
206 | XO2(i + 1, 1) \ | ||
207 | XO2(i + 2, 2) \ | ||
208 | XO2(i + 3, 3) \ | ||
209 | XO3(i, 0) \ | ||
210 | XO3(i + 1, 1) \ | ||
211 | XO3(i + 2, 2) \ | ||
212 | XO3(i + 3, 3) \ | ||
213 | XO4(i, 0) \ | ||
214 | ST(i, 0) \ | ||
215 | XO4(i + 1, 1) \ | ||
216 | ST(i + 1, 1) \ | ||
217 | XO4(i + 2, 2) \ | ||
218 | ST(i + 2, 2) \ | ||
219 | XO4(i + 3, 3) \ | ||
220 | ST(i + 3, 3) | ||
221 | |||
222 | " .align 32 ;\n" | ||
223 | " 1: ;\n" | ||
224 | |||
225 | BLOCK(0) | ||
226 | BLOCK(4) | ||
227 | BLOCK(8) | ||
228 | BLOCK(12) | ||
229 | |||
230 | " addl $128, %1 ;\n" | ||
231 | " addl $128, %2 ;\n" | ||
232 | " addl $128, %3 ;\n" | ||
233 | " addl $128, %4 ;\n" | ||
234 | " addl $128, %5 ;\n" | ||
235 | " decl %0 ;\n" | ||
236 | " jnz 1b ;\n" | ||
237 | : "+r" (lines), | ||
238 | "+r" (p1), "+r" (p2), "+r" (p3) | ||
239 | : "r" (p4), "r" (p5) | ||
240 | : "memory"); | ||
241 | |||
242 | /* p4 and p5 were modified, and now the variables are dead. | ||
243 | Clobber them just to be sure nobody does something stupid | ||
244 | like assuming they have some legal value. */ | ||
245 | asm("" : "=r" (p4), "=r" (p5)); | ||
246 | |||
247 | kernel_fpu_end(); | ||
248 | } | ||
249 | |||
/*
 * Retire the MMX assembly-text helpers.  LD, ST and XO1..XO4 are defined
 * again further down in this file with SSE instructions, and BLOCK is
 * redefined inside each SSE routine.
 */
250 | #undef LD | ||
251 | #undef XO1 | ||
252 | #undef XO2 | ||
253 | #undef XO3 | ||
254 | #undef XO4 | ||
255 | #undef ST | ||
256 | #undef BLOCK | ||
257 | |||
/*
 * xor_p5_mmx_2 - XOR the buffer at p2 into the buffer at p1 (p1 ^= p2).
 *
 * bytes: length in bytes; bytes >> 6 loop passes of 64 bytes each are
 *        run, so any remainder below 64 bytes is not processed.
 * p1:    destination and first source, updated in place.
 * p2:    second source.
 *
 * One pass handles eight quadwords, one per register mm0-mm7, with the
 * loads, pxors and stores of neighbouring quadwords interleaved
 * (presumably hand-scheduled for the original Pentium, as the p5 name
 * suggests - not verifiable from this file).  The counter is tested only
 * after the loop body, so the body always runs at least once; presumably
 * callers guarantee bytes >= 64 (TODO confirm).
 */
258 | static void | ||
259 | xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) | ||
260 | { | ||
261 | unsigned long lines = bytes >> 6; | ||
262 | |||
/* MMX registers are used below; presumably kernel_fpu_begin()/end() make that safe - see <asm/i387.h>. */
263 | kernel_fpu_begin(); | ||
264 | |||
265 | asm volatile( | ||
266 | " .align 32 ;\n" | ||
267 | " 1: ;\n" | ||
268 | " movq (%1), %%mm0 ;\n" | ||
269 | " movq 8(%1), %%mm1 ;\n" | ||
270 | " pxor (%2), %%mm0 ;\n" | ||
271 | " movq 16(%1), %%mm2 ;\n" | ||
272 | " movq %%mm0, (%1) ;\n" | ||
273 | " pxor 8(%2), %%mm1 ;\n" | ||
274 | " movq 24(%1), %%mm3 ;\n" | ||
275 | " movq %%mm1, 8(%1) ;\n" | ||
276 | " pxor 16(%2), %%mm2 ;\n" | ||
277 | " movq 32(%1), %%mm4 ;\n" | ||
278 | " movq %%mm2, 16(%1) ;\n" | ||
279 | " pxor 24(%2), %%mm3 ;\n" | ||
280 | " movq 40(%1), %%mm5 ;\n" | ||
281 | " movq %%mm3, 24(%1) ;\n" | ||
282 | " pxor 32(%2), %%mm4 ;\n" | ||
283 | " movq 48(%1), %%mm6 ;\n" | ||
284 | " movq %%mm4, 32(%1) ;\n" | ||
285 | " pxor 40(%2), %%mm5 ;\n" | ||
286 | " movq 56(%1), %%mm7 ;\n" | ||
287 | " movq %%mm5, 40(%1) ;\n" | ||
288 | " pxor 48(%2), %%mm6 ;\n" | ||
289 | " pxor 56(%2), %%mm7 ;\n" | ||
290 | " movq %%mm6, 48(%1) ;\n" | ||
291 | " movq %%mm7, 56(%1) ;\n" | ||
292 | |||
293 | " addl $64, %1 ;\n" | ||
294 | " addl $64, %2 ;\n" | ||
295 | " decl %0 ;\n" | ||
296 | " jnz 1b ;\n" | ||
297 | : "+r" (lines), | ||
298 | "+r" (p1), "+r" (p2) | ||
299 | : | ||
300 | : "memory"); | ||
301 | |||
302 | kernel_fpu_end(); | ||
303 | } | ||
304 | |||
/*
 * xor_p5_mmx_3 - XOR the buffers at p2 and p3 into p1 (p1 ^= p2 ^ p3).
 *
 * bytes: length in bytes; bytes >> 6 loop passes of 64 bytes each are
 *        run, so any remainder below 64 bytes is not processed.
 * p1:    destination and first source, updated in place.
 * p2,p3: additional sources.
 *
 * One pass handles eight quadwords, one per register mm0-mm7, with the
 * loads, pxors and stores of neighbouring quadwords interleaved.  The
 * counter is tested only after the loop body, so the body always runs at
 * least once; presumably callers guarantee bytes >= 64 (TODO confirm).
 */
305 | static void | ||
306 | xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
307 | unsigned long *p3) | ||
308 | { | ||
309 | unsigned long lines = bytes >> 6; | ||
310 | |||
/* MMX registers are used below; presumably kernel_fpu_begin()/end() make that safe - see <asm/i387.h>. */
311 | kernel_fpu_begin(); | ||
312 | |||
313 | asm volatile( | ||
314 | " .align 32,0x90 ;\n" | ||
315 | " 1: ;\n" | ||
316 | " movq (%1), %%mm0 ;\n" | ||
317 | " movq 8(%1), %%mm1 ;\n" | ||
318 | " pxor (%2), %%mm0 ;\n" | ||
319 | " movq 16(%1), %%mm2 ;\n" | ||
320 | " pxor 8(%2), %%mm1 ;\n" | ||
321 | " pxor (%3), %%mm0 ;\n" | ||
322 | " pxor 16(%2), %%mm2 ;\n" | ||
323 | " movq %%mm0, (%1) ;\n" | ||
324 | " pxor 8(%3), %%mm1 ;\n" | ||
325 | " pxor 16(%3), %%mm2 ;\n" | ||
326 | " movq 24(%1), %%mm3 ;\n" | ||
327 | " movq %%mm1, 8(%1) ;\n" | ||
328 | " movq 32(%1), %%mm4 ;\n" | ||
329 | " movq 40(%1), %%mm5 ;\n" | ||
330 | " pxor 24(%2), %%mm3 ;\n" | ||
331 | " movq %%mm2, 16(%1) ;\n" | ||
332 | " pxor 32(%2), %%mm4 ;\n" | ||
333 | " pxor 24(%3), %%mm3 ;\n" | ||
334 | " pxor 40(%2), %%mm5 ;\n" | ||
335 | " movq %%mm3, 24(%1) ;\n" | ||
336 | " pxor 32(%3), %%mm4 ;\n" | ||
337 | " pxor 40(%3), %%mm5 ;\n" | ||
338 | " movq 48(%1), %%mm6 ;\n" | ||
339 | " movq %%mm4, 32(%1) ;\n" | ||
340 | " movq 56(%1), %%mm7 ;\n" | ||
341 | " pxor 48(%2), %%mm6 ;\n" | ||
342 | " movq %%mm5, 40(%1) ;\n" | ||
343 | " pxor 56(%2), %%mm7 ;\n" | ||
344 | " pxor 48(%3), %%mm6 ;\n" | ||
345 | " pxor 56(%3), %%mm7 ;\n" | ||
346 | " movq %%mm6, 48(%1) ;\n" | ||
347 | " movq %%mm7, 56(%1) ;\n" | ||
348 | |||
349 | " addl $64, %1 ;\n" | ||
350 | " addl $64, %2 ;\n" | ||
351 | " addl $64, %3 ;\n" | ||
352 | " decl %0 ;\n" | ||
353 | " jnz 1b ;\n" | ||
354 | : "+r" (lines), | ||
355 | "+r" (p1), "+r" (p2), "+r" (p3) | ||
356 | : | ||
357 | : "memory" ); | ||
358 | |||
359 | kernel_fpu_end(); | ||
360 | } | ||
361 | |||
/*
 * xor_p5_mmx_4 - XOR the buffers at p2, p3 and p4 into p1
 * (p1 ^= p2 ^ p3 ^ p4).
 *
 * bytes:    length in bytes; bytes >> 6 loop passes of 64 bytes each are
 *           run, so any remainder below 64 bytes is not processed.
 * p1:       destination and first source, updated in place.
 * p2,p3,p4: additional sources.
 *
 * One pass handles eight quadwords, one per register mm0-mm7, with the
 * loads, pxors and stores of neighbouring quadwords interleaved.  The
 * counter is tested only after the loop body, so the body always runs at
 * least once; presumably callers guarantee bytes >= 64 (TODO confirm).
 */
362 | static void | ||
363 | xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
364 | unsigned long *p3, unsigned long *p4) | ||
365 | { | ||
366 | unsigned long lines = bytes >> 6; | ||
367 | |||
/* MMX registers are used below; presumably kernel_fpu_begin()/end() make that safe - see <asm/i387.h>. */
368 | kernel_fpu_begin(); | ||
369 | |||
370 | asm volatile( | ||
371 | " .align 32,0x90 ;\n" | ||
372 | " 1: ;\n" | ||
373 | " movq (%1), %%mm0 ;\n" | ||
374 | " movq 8(%1), %%mm1 ;\n" | ||
375 | " pxor (%2), %%mm0 ;\n" | ||
376 | " movq 16(%1), %%mm2 ;\n" | ||
377 | " pxor 8(%2), %%mm1 ;\n" | ||
378 | " pxor (%3), %%mm0 ;\n" | ||
379 | " pxor 16(%2), %%mm2 ;\n" | ||
380 | " pxor 8(%3), %%mm1 ;\n" | ||
381 | " pxor (%4), %%mm0 ;\n" | ||
382 | " movq 24(%1), %%mm3 ;\n" | ||
383 | " pxor 16(%3), %%mm2 ;\n" | ||
384 | " pxor 8(%4), %%mm1 ;\n" | ||
385 | " movq %%mm0, (%1) ;\n" | ||
386 | " movq 32(%1), %%mm4 ;\n" | ||
387 | " pxor 24(%2), %%mm3 ;\n" | ||
388 | " pxor 16(%4), %%mm2 ;\n" | ||
389 | " movq %%mm1, 8(%1) ;\n" | ||
390 | " movq 40(%1), %%mm5 ;\n" | ||
391 | " pxor 32(%2), %%mm4 ;\n" | ||
392 | " pxor 24(%3), %%mm3 ;\n" | ||
393 | " movq %%mm2, 16(%1) ;\n" | ||
394 | " pxor 40(%2), %%mm5 ;\n" | ||
395 | " pxor 32(%3), %%mm4 ;\n" | ||
396 | " pxor 24(%4), %%mm3 ;\n" | ||
397 | " movq %%mm3, 24(%1) ;\n" | ||
398 | " movq 56(%1), %%mm7 ;\n" | ||
399 | " movq 48(%1), %%mm6 ;\n" | ||
400 | " pxor 40(%3), %%mm5 ;\n" | ||
401 | " pxor 32(%4), %%mm4 ;\n" | ||
402 | " pxor 48(%2), %%mm6 ;\n" | ||
403 | " movq %%mm4, 32(%1) ;\n" | ||
404 | " pxor 56(%2), %%mm7 ;\n" | ||
405 | " pxor 40(%4), %%mm5 ;\n" | ||
406 | " pxor 48(%3), %%mm6 ;\n" | ||
407 | " pxor 56(%3), %%mm7 ;\n" | ||
408 | " movq %%mm5, 40(%1) ;\n" | ||
409 | " pxor 48(%4), %%mm6 ;\n" | ||
410 | " pxor 56(%4), %%mm7 ;\n" | ||
411 | " movq %%mm6, 48(%1) ;\n" | ||
412 | " movq %%mm7, 56(%1) ;\n" | ||
413 | |||
414 | " addl $64, %1 ;\n" | ||
415 | " addl $64, %2 ;\n" | ||
416 | " addl $64, %3 ;\n" | ||
417 | " addl $64, %4 ;\n" | ||
418 | " decl %0 ;\n" | ||
419 | " jnz 1b ;\n" | ||
420 | : "+r" (lines), | ||
421 | "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) | ||
422 | : | ||
423 | : "memory"); | ||
424 | |||
425 | kernel_fpu_end(); | ||
426 | } | ||
427 | |||
/*
 * xor_p5_mmx_5 - XOR the buffers at p2..p5 into p1
 * (p1 ^= p2 ^ p3 ^ p4 ^ p5).
 *
 * bytes:  length in bytes; bytes >> 6 loop passes of 64 bytes each are
 *         run, so any remainder below 64 bytes is not processed.
 * p1:     destination and first source, updated in place.
 * p2..p5: additional sources.
 *
 * One pass handles eight quadwords, one per register mm0-mm7, with the
 * loads, pxors and stores of neighbouring quadwords interleaved.  p4 and
 * p5 are passed as plain "r" inputs although the asm adds 64 to them on
 * every pass; the empty asm statements before and after the main one
 * exist solely to keep that safe (see the comments next to them).  The
 * counter is tested only after the loop body, so the body always runs at
 * least once; presumably callers guarantee bytes >= 64 (TODO confirm).
 */
428 | static void | ||
429 | xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
430 | unsigned long *p3, unsigned long *p4, unsigned long *p5) | ||
431 | { | ||
432 | unsigned long lines = bytes >> 6; | ||
433 | |||
/* MMX registers are used below; presumably kernel_fpu_begin()/end() make that safe - see <asm/i387.h>. */
434 | kernel_fpu_begin(); | ||
435 | |||
436 | /* Make sure GCC forgets anything it knows about p4 or p5, | ||
437 | such that it won't pass to the asm volatile below a | ||
438 | register that is shared with any other variable. That's | ||
439 | because we modify p4 and p5 there, but we can't mark them | ||
440 | as read/write, otherwise we'd overflow the 10-asm-operands | ||
441 | limit of GCC < 3.1. */ | ||
442 | asm("" : "+r" (p4), "+r" (p5)); | ||
443 | |||
444 | asm volatile( | ||
445 | " .align 32,0x90 ;\n" | ||
446 | " 1: ;\n" | ||
447 | " movq (%1), %%mm0 ;\n" | ||
448 | " movq 8(%1), %%mm1 ;\n" | ||
449 | " pxor (%2), %%mm0 ;\n" | ||
450 | " pxor 8(%2), %%mm1 ;\n" | ||
451 | " movq 16(%1), %%mm2 ;\n" | ||
452 | " pxor (%3), %%mm0 ;\n" | ||
453 | " pxor 8(%3), %%mm1 ;\n" | ||
454 | " pxor 16(%2), %%mm2 ;\n" | ||
455 | " pxor (%4), %%mm0 ;\n" | ||
456 | " pxor 8(%4), %%mm1 ;\n" | ||
457 | " pxor 16(%3), %%mm2 ;\n" | ||
458 | " movq 24(%1), %%mm3 ;\n" | ||
459 | " pxor (%5), %%mm0 ;\n" | ||
460 | " pxor 8(%5), %%mm1 ;\n" | ||
461 | " movq %%mm0, (%1) ;\n" | ||
462 | " pxor 16(%4), %%mm2 ;\n" | ||
463 | " pxor 24(%2), %%mm3 ;\n" | ||
464 | " movq %%mm1, 8(%1) ;\n" | ||
465 | " pxor 16(%5), %%mm2 ;\n" | ||
466 | " pxor 24(%3), %%mm3 ;\n" | ||
467 | " movq 32(%1), %%mm4 ;\n" | ||
468 | " movq %%mm2, 16(%1) ;\n" | ||
469 | " pxor 24(%4), %%mm3 ;\n" | ||
470 | " pxor 32(%2), %%mm4 ;\n" | ||
471 | " movq 40(%1), %%mm5 ;\n" | ||
472 | " pxor 24(%5), %%mm3 ;\n" | ||
473 | " pxor 32(%3), %%mm4 ;\n" | ||
474 | " pxor 40(%2), %%mm5 ;\n" | ||
475 | " movq %%mm3, 24(%1) ;\n" | ||
476 | " pxor 32(%4), %%mm4 ;\n" | ||
477 | " pxor 40(%3), %%mm5 ;\n" | ||
478 | " movq 48(%1), %%mm6 ;\n" | ||
479 | " movq 56(%1), %%mm7 ;\n" | ||
480 | " pxor 32(%5), %%mm4 ;\n" | ||
481 | " pxor 40(%4), %%mm5 ;\n" | ||
482 | " pxor 48(%2), %%mm6 ;\n" | ||
483 | " pxor 56(%2), %%mm7 ;\n" | ||
484 | " movq %%mm4, 32(%1) ;\n" | ||
485 | " pxor 48(%3), %%mm6 ;\n" | ||
486 | " pxor 56(%3), %%mm7 ;\n" | ||
487 | " pxor 40(%5), %%mm5 ;\n" | ||
488 | " pxor 48(%4), %%mm6 ;\n" | ||
489 | " pxor 56(%4), %%mm7 ;\n" | ||
490 | " movq %%mm5, 40(%1) ;\n" | ||
491 | " pxor 48(%5), %%mm6 ;\n" | ||
492 | " pxor 56(%5), %%mm7 ;\n" | ||
493 | " movq %%mm6, 48(%1) ;\n" | ||
494 | " movq %%mm7, 56(%1) ;\n" | ||
495 | |||
496 | " addl $64, %1 ;\n" | ||
497 | " addl $64, %2 ;\n" | ||
498 | " addl $64, %3 ;\n" | ||
499 | " addl $64, %4 ;\n" | ||
500 | " addl $64, %5 ;\n" | ||
501 | " decl %0 ;\n" | ||
502 | " jnz 1b ;\n" | ||
503 | : "+r" (lines), | ||
504 | "+r" (p1), "+r" (p2), "+r" (p3) | ||
505 | : "r" (p4), "r" (p5) | ||
506 | : "memory"); | ||
507 | |||
508 | /* p4 and p5 were modified, and now the variables are dead. | ||
509 | Clobber them just to be sure nobody does something stupid | ||
510 | like assuming they have some legal value. */ | ||
511 | asm("" : "=r" (p4), "=r" (p5)); | ||
512 | |||
513 | kernel_fpu_end(); | ||
514 | } | ||
515 | |||
/*
 * Template named "pII_mmx" that plugs the xor_pII_mmx_2..5 routines into
 * the do_2..do_5 slots.  struct xor_block_template is declared outside
 * this file.
 */
516 | static struct xor_block_template xor_block_pII_mmx = { | ||
517 | .name = "pII_mmx", | ||
518 | .do_2 = xor_pII_mmx_2, | ||
519 | .do_3 = xor_pII_mmx_3, | ||
520 | .do_4 = xor_pII_mmx_4, | ||
521 | .do_5 = xor_pII_mmx_5, | ||
522 | }; | ||
523 | |||
/*
 * Template named "p5_mmx" that plugs the xor_p5_mmx_2..5 routines into
 * the do_2..do_5 slots.  struct xor_block_template is declared outside
 * this file.
 */
524 | static struct xor_block_template xor_block_p5_mmx = { | ||
525 | .name = "p5_mmx", | ||
526 | .do_2 = xor_p5_mmx_2, | ||
527 | .do_3 = xor_p5_mmx_3, | ||
528 | .do_4 = xor_p5_mmx_4, | ||
529 | .do_5 = xor_p5_mmx_5, | ||
530 | }; | ||
531 | |||
532 | /* | ||
533 | * Cache avoiding checksumming functions utilizing KNI instructions | ||
534 | * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) | ||
535 | */ | ||
536 | |||
/*
 * XMMS_SAVE - prologue for the SSE routines.
 *
 * Disables preemption, stores the current CR0 value in the local `cr0`,
 * calls clts() (presumably clearing CR0.TS so that SSE instructions do
 * not fault - confirm against its definition), then copies xmm0-xmm3 into
 * the local buffer `xmm_save` at offsets 0x00, 0x10, 0x20 and 0x30.
 *
 * The expanding function must declare both `cr0` and an `xmm_save`
 * buffer of at least 64 bytes.  Must be paired with XMMS_RESTORE.
 */
537 | #define XMMS_SAVE \ | ||
538 | do { \ | ||
539 | preempt_disable(); \ | ||
540 | cr0 = read_cr0(); \ | ||
541 | clts(); \ | ||
542 | asm volatile( \ | ||
543 | "movups %%xmm0,(%0) ;\n\t" \ | ||
544 | "movups %%xmm1,0x10(%0) ;\n\t" \ | ||
545 | "movups %%xmm2,0x20(%0) ;\n\t" \ | ||
546 | "movups %%xmm3,0x30(%0) ;\n\t" \ | ||
547 | : \ | ||
548 | : "r" (xmm_save) \ | ||
549 | : "memory"); \ | ||
550 | } while (0) | ||
551 | |||
/*
 * XMMS_RESTORE - epilogue for the SSE routines; undoes XMMS_SAVE.
 *
 * Issues sfence, reloads xmm0-xmm3 from the local buffer `xmm_save`,
 * writes the value held in the local `cr0` back to CR0 and re-enables
 * preemption.  Relies on the same `cr0` and `xmm_save` locals that
 * XMMS_SAVE filled in.
 */
552 | #define XMMS_RESTORE \ | ||
553 | do { \ | ||
554 | asm volatile( \ | ||
555 | "sfence ;\n\t" \ | ||
556 | "movups (%0),%%xmm0 ;\n\t" \ | ||
557 | "movups 0x10(%0),%%xmm1 ;\n\t" \ | ||
558 | "movups 0x20(%0),%%xmm2 ;\n\t" \ | ||
559 | "movups 0x30(%0),%%xmm3 ;\n\t" \ | ||
560 | : \ | ||
561 | : "r" (xmm_save) \ | ||
562 | : "memory"); \ | ||
563 | write_cr0(cr0); \ | ||
564 | preempt_enable(); \ | ||
565 | } while (0) | ||
566 | |||
/*
 * Helpers for the SSE routines.
 *
 * ALIGN16 requests 16-byte alignment for a declaration.
 * OFFS(x) is the byte offset of 16-byte unit x; PF_OFFS(x) is the same
 * offset plus 256 bytes.
 * LD/ST move unit x between the buffer at asm operand %1 and xmm
 * register y using movaps.
 * PF0..PF5 emit prefetchnta for PF_OFFS(x) relative to operands %1..%6.
 * XO1..XO5 xorps unit x from operands %2..%6 into xmm register y.
 *
 * Per the x86 ISA, movaps and xorps with a memory operand require
 * 16-byte-aligned addresses, so the buffers are assumed to be 16-byte
 * aligned (not checked here).
 */
567 | #define ALIGN16 __attribute__((aligned(16))) | ||
568 | |||
569 | #define OFFS(x) "16*("#x")" | ||
570 | #define PF_OFFS(x) "256+16*("#x")" | ||
571 | #define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n" | ||
572 | #define LD(x, y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n" | ||
573 | #define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n" | ||
574 | #define PF1(x) " prefetchnta "PF_OFFS(x)"(%2) ;\n" | ||
575 | #define PF2(x) " prefetchnta "PF_OFFS(x)"(%3) ;\n" | ||
576 | #define PF3(x) " prefetchnta "PF_OFFS(x)"(%4) ;\n" | ||
577 | #define PF4(x) " prefetchnta "PF_OFFS(x)"(%5) ;\n" | ||
578 | #define PF5(x) " prefetchnta "PF_OFFS(x)"(%6) ;\n" | ||
579 | #define XO1(x, y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n" | ||
580 | #define XO2(x, y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n" | ||
581 | #define XO3(x, y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n" | ||
582 | #define XO4(x, y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n" | ||
583 | #define XO5(x, y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n" | ||
584 | |||
585 | |||
/*
 * xor_sse_2 - XOR the buffer at p2 into the buffer at p1 (p1 ^= p2)
 * using SSE registers xmm0-xmm3.
 *
 * bytes: length in bytes; bytes >> 8 loop passes of 256 bytes each are
 *        run, so any remainder below 256 bytes is not processed.
 * p1:    destination and first source, updated in place.
 * p2:    second source.
 *
 * Each pass expands BLOCK four times; one BLOCK moves four 16-byte units
 * and issues prefetchnta hints 256 bytes or more past the current
 * position in each buffer.  The counter is tested only after the loop
 * body, so the body always runs at least once; presumably callers
 * guarantee bytes >= 256 (TODO confirm).
 */
586 | static void | ||
587 | xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) | ||
588 | { | ||
589 | unsigned long lines = bytes >> 8; | ||
/* xmm_save and cr0 are not used directly here; XMMS_SAVE and XMMS_RESTORE refer to them by name. */
590 | char xmm_save[16*4] ALIGN16; | ||
591 | int cr0; | ||
592 | |||
593 | XMMS_SAVE; | ||
594 | |||
595 | asm volatile( | ||
596 | #undef BLOCK | ||
597 | #define BLOCK(i) \ | ||
598 | LD(i, 0) \ | ||
599 | LD(i + 1, 1) \ | ||
600 | PF1(i) \ | ||
601 | PF1(i + 2) \ | ||
602 | LD(i + 2, 2) \ | ||
603 | LD(i + 3, 3) \ | ||
604 | PF0(i + 4) \ | ||
605 | PF0(i + 6) \ | ||
606 | XO1(i, 0) \ | ||
607 | XO1(i + 1, 1) \ | ||
608 | XO1(i + 2, 2) \ | ||
609 | XO1(i + 3, 3) \ | ||
610 | ST(i, 0) \ | ||
611 | ST(i + 1, 1) \ | ||
612 | ST(i + 2, 2) \ | ||
613 | ST(i + 3, 3) \ | ||
614 | |||
615 | |||
616 | PF0(0) | ||
617 | PF0(2) | ||
618 | |||
619 | " .align 32 ;\n" | ||
620 | " 1: ;\n" | ||
621 | |||
622 | BLOCK(0) | ||
623 | BLOCK(4) | ||
624 | BLOCK(8) | ||
625 | BLOCK(12) | ||
626 | |||
627 | " addl $256, %1 ;\n" | ||
628 | " addl $256, %2 ;\n" | ||
629 | " decl %0 ;\n" | ||
630 | " jnz 1b ;\n" | ||
631 | : "+r" (lines), | ||
632 | "+r" (p1), "+r" (p2) | ||
633 | : | ||
634 | : "memory"); | ||
635 | |||
636 | XMMS_RESTORE; | ||
637 | } | ||
638 | |||
/*
 * xor_sse_3 - XOR the buffers at p2 and p3 into p1 (p1 ^= p2 ^ p3)
 * using SSE registers xmm0-xmm3.
 *
 * bytes: length in bytes; bytes >> 8 loop passes of 256 bytes each are
 *        run, so any remainder below 256 bytes is not processed.
 * p1:    destination and first source, updated in place.
 * p2,p3: additional sources.
 *
 * Each pass expands BLOCK four times; one BLOCK moves four 16-byte units
 * and issues prefetchnta hints 256 bytes or more past the current
 * position in each buffer.  The counter is tested only after the loop
 * body, so the body always runs at least once; presumably callers
 * guarantee bytes >= 256 (TODO confirm).
 */
639 | static void | ||
640 | xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
641 | unsigned long *p3) | ||
642 | { | ||
643 | unsigned long lines = bytes >> 8; | ||
/* xmm_save and cr0 are not used directly here; XMMS_SAVE and XMMS_RESTORE refer to them by name. */
644 | char xmm_save[16*4] ALIGN16; | ||
645 | int cr0; | ||
646 | |||
647 | XMMS_SAVE; | ||
648 | |||
649 | asm volatile( | ||
650 | #undef BLOCK | ||
651 | #define BLOCK(i) \ | ||
652 | PF1(i) \ | ||
653 | PF1(i + 2) \ | ||
654 | LD(i,0) \ | ||
655 | LD(i + 1, 1) \ | ||
656 | LD(i + 2, 2) \ | ||
657 | LD(i + 3, 3) \ | ||
658 | PF2(i) \ | ||
659 | PF2(i + 2) \ | ||
660 | PF0(i + 4) \ | ||
661 | PF0(i + 6) \ | ||
662 | XO1(i,0) \ | ||
663 | XO1(i + 1, 1) \ | ||
664 | XO1(i + 2, 2) \ | ||
665 | XO1(i + 3, 3) \ | ||
666 | XO2(i,0) \ | ||
667 | XO2(i + 1, 1) \ | ||
668 | XO2(i + 2, 2) \ | ||
669 | XO2(i + 3, 3) \ | ||
670 | ST(i,0) \ | ||
671 | ST(i + 1, 1) \ | ||
672 | ST(i + 2, 2) \ | ||
673 | ST(i + 3, 3) \ | ||
674 | |||
675 | |||
676 | PF0(0) | ||
677 | PF0(2) | ||
678 | |||
679 | " .align 32 ;\n" | ||
680 | " 1: ;\n" | ||
681 | |||
682 | BLOCK(0) | ||
683 | BLOCK(4) | ||
684 | BLOCK(8) | ||
685 | BLOCK(12) | ||
686 | |||
687 | " addl $256, %1 ;\n" | ||
688 | " addl $256, %2 ;\n" | ||
689 | " addl $256, %3 ;\n" | ||
690 | " decl %0 ;\n" | ||
691 | " jnz 1b ;\n" | ||
692 | : "+r" (lines), | ||
693 | "+r" (p1), "+r"(p2), "+r"(p3) | ||
694 | : | ||
695 | : "memory" ); | ||
696 | |||
697 | XMMS_RESTORE; | ||
698 | } | ||
699 | |||
/*
 * xor_sse_4 - XOR the buffers at p2, p3 and p4 into p1
 * (p1 ^= p2 ^ p3 ^ p4) using SSE registers xmm0-xmm3.
 *
 * bytes:    length in bytes; bytes >> 8 loop passes of 256 bytes each
 *           are run, so any remainder below 256 bytes is not processed.
 * p1:       destination and first source, updated in place.
 * p2,p3,p4: additional sources.
 *
 * Each pass expands BLOCK four times; one BLOCK moves four 16-byte units
 * and issues prefetchnta hints 256 bytes or more past the current
 * position in each buffer.  The counter is tested only after the loop
 * body, so the body always runs at least once; presumably callers
 * guarantee bytes >= 256 (TODO confirm).
 */
700 | static void | ||
701 | xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
702 | unsigned long *p3, unsigned long *p4) | ||
703 | { | ||
704 | unsigned long lines = bytes >> 8; | ||
/* xmm_save and cr0 are not used directly here; XMMS_SAVE and XMMS_RESTORE refer to them by name. */
705 | char xmm_save[16*4] ALIGN16; | ||
706 | int cr0; | ||
707 | |||
708 | XMMS_SAVE; | ||
709 | |||
710 | asm volatile( | ||
711 | #undef BLOCK | ||
712 | #define BLOCK(i) \ | ||
713 | PF1(i) \ | ||
714 | PF1(i + 2) \ | ||
715 | LD(i,0) \ | ||
716 | LD(i + 1, 1) \ | ||
717 | LD(i + 2, 2) \ | ||
718 | LD(i + 3, 3) \ | ||
719 | PF2(i) \ | ||
720 | PF2(i + 2) \ | ||
721 | XO1(i,0) \ | ||
722 | XO1(i + 1, 1) \ | ||
723 | XO1(i + 2, 2) \ | ||
724 | XO1(i + 3, 3) \ | ||
725 | PF3(i) \ | ||
726 | PF3(i + 2) \ | ||
727 | PF0(i + 4) \ | ||
728 | PF0(i + 6) \ | ||
729 | XO2(i,0) \ | ||
730 | XO2(i + 1, 1) \ | ||
731 | XO2(i + 2, 2) \ | ||
732 | XO2(i + 3, 3) \ | ||
733 | XO3(i,0) \ | ||
734 | XO3(i + 1, 1) \ | ||
735 | XO3(i + 2, 2) \ | ||
736 | XO3(i + 3, 3) \ | ||
737 | ST(i,0) \ | ||
738 | ST(i + 1, 1) \ | ||
739 | ST(i + 2, 2) \ | ||
740 | ST(i + 3, 3) \ | ||
741 | |||
742 | |||
743 | PF0(0) | ||
744 | PF0(2) | ||
745 | |||
746 | " .align 32 ;\n" | ||
747 | " 1: ;\n" | ||
748 | |||
749 | BLOCK(0) | ||
750 | BLOCK(4) | ||
751 | BLOCK(8) | ||
752 | BLOCK(12) | ||
753 | |||
754 | " addl $256, %1 ;\n" | ||
755 | " addl $256, %2 ;\n" | ||
756 | " addl $256, %3 ;\n" | ||
757 | " addl $256, %4 ;\n" | ||
758 | " decl %0 ;\n" | ||
759 | " jnz 1b ;\n" | ||
760 | : "+r" (lines), | ||
761 | "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) | ||
762 | : | ||
763 | : "memory" ); | ||
764 | |||
765 | XMMS_RESTORE; | ||
766 | } | ||
767 | |||
/*
 * xor_sse_5 - XOR the buffers at p2..p5 into p1
 * (p1 ^= p2 ^ p3 ^ p4 ^ p5) using SSE registers xmm0-xmm3.
 *
 * bytes:  length in bytes; bytes >> 8 loop passes of 256 bytes each are
 *         run, so any remainder below 256 bytes is not processed.
 * p1:     destination and first source, updated in place.
 * p2..p5: additional sources.
 *
 * Each pass expands BLOCK four times; one BLOCK moves four 16-byte units
 * and issues prefetchnta hints 256 bytes or more past the current
 * position in each buffer.  p4 and p5 are passed as plain "r" inputs
 * although the asm adds 256 to them on every pass; the empty asm
 * statements before and after the main one exist solely to keep that
 * safe (see the comments next to them).  The counter is tested only
 * after the loop body, so the body always runs at least once; presumably
 * callers guarantee bytes >= 256 (TODO confirm).
 */
768 | static void | ||
769 | xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
770 | unsigned long *p3, unsigned long *p4, unsigned long *p5) | ||
771 | { | ||
772 | unsigned long lines = bytes >> 8; | ||
/* xmm_save and cr0 are not used directly here; XMMS_SAVE and XMMS_RESTORE refer to them by name. */
773 | char xmm_save[16*4] ALIGN16; | ||
774 | int cr0; | ||
775 | |||
776 | XMMS_SAVE; | ||
777 | |||
778 | /* Make sure GCC forgets anything it knows about p4 or p5, | ||
779 | such that it won't pass to the asm volatile below a | ||
780 | register that is shared with any other variable. That's | ||
781 | because we modify p4 and p5 there, but we can't mark them | ||
782 | as read/write, otherwise we'd overflow the 10-asm-operands | ||
783 | limit of GCC < 3.1. */ | ||
784 | asm("" : "+r" (p4), "+r" (p5)); | ||
785 | |||
786 | asm volatile( | ||
787 | #undef BLOCK | ||
788 | #define BLOCK(i) \ | ||
789 | PF1(i) \ | ||
790 | PF1(i + 2) \ | ||
791 | LD(i,0) \ | ||
792 | LD(i + 1, 1) \ | ||
793 | LD(i + 2, 2) \ | ||
794 | LD(i + 3, 3) \ | ||
795 | PF2(i) \ | ||
796 | PF2(i + 2) \ | ||
797 | XO1(i,0) \ | ||
798 | XO1(i + 1, 1) \ | ||
799 | XO1(i + 2, 2) \ | ||
800 | XO1(i + 3, 3) \ | ||
801 | PF3(i) \ | ||
802 | PF3(i + 2) \ | ||
803 | XO2(i,0) \ | ||
804 | XO2(i + 1, 1) \ | ||
805 | XO2(i + 2, 2) \ | ||
806 | XO2(i + 3, 3) \ | ||
807 | PF4(i) \ | ||
808 | PF4(i + 2) \ | ||
809 | PF0(i + 4) \ | ||
810 | PF0(i + 6) \ | ||
811 | XO3(i,0) \ | ||
812 | XO3(i + 1, 1) \ | ||
813 | XO3(i + 2, 2) \ | ||
814 | XO3(i + 3, 3) \ | ||
815 | XO4(i,0) \ | ||
816 | XO4(i + 1, 1) \ | ||
817 | XO4(i + 2, 2) \ | ||
818 | XO4(i + 3, 3) \ | ||
819 | ST(i,0) \ | ||
820 | ST(i + 1, 1) \ | ||
821 | ST(i + 2, 2) \ | ||
822 | ST(i + 3, 3) \ | ||
823 | |||
824 | |||
825 | PF0(0) | ||
826 | PF0(2) | ||
827 | |||
828 | " .align 32 ;\n" | ||
829 | " 1: ;\n" | ||
830 | |||
831 | BLOCK(0) | ||
832 | BLOCK(4) | ||
833 | BLOCK(8) | ||
834 | BLOCK(12) | ||
835 | |||
836 | " addl $256, %1 ;\n" | ||
837 | " addl $256, %2 ;\n" | ||
838 | " addl $256, %3 ;\n" | ||
839 | " addl $256, %4 ;\n" | ||
840 | " addl $256, %5 ;\n" | ||
841 | " decl %0 ;\n" | ||
842 | " jnz 1b ;\n" | ||
843 | : "+r" (lines), | ||
844 | "+r" (p1), "+r" (p2), "+r" (p3) | ||
845 | : "r" (p4), "r" (p5) | ||
846 | : "memory"); | ||
847 | |||
848 | /* p4 and p5 were modified, and now the variables are dead. | ||
849 | Clobber them just to be sure nobody does something stupid | ||
850 | like assuming they have some legal value. */ | ||
851 | asm("" : "=r" (p4), "=r" (p5)); | ||
852 | |||
853 | XMMS_RESTORE; | ||
854 | } | ||
855 | |||
/*
 * Template named "pIII_sse" that plugs the xor_sse_2..5 routines into
 * the do_2..do_5 slots.  struct xor_block_template is declared outside
 * this file.
 */
856 | static struct xor_block_template xor_block_pIII_sse = { | ||
857 | .name = "pIII_sse", | ||
858 | .do_2 = xor_sse_2, | ||
859 | .do_3 = xor_sse_3, | ||
860 | .do_4 = xor_sse_4, | ||
861 | .do_5 = xor_sse_5, | ||
862 | }; | ||
863 | |||
864 | /* Also try the generic routines. */ | ||
865 | #include <asm-generic/xor.h> | ||
866 | |||
/*
 * XOR_TRY_TEMPLATES - replaces any earlier definition of this macro.
 *
 * Passes each candidate template to xor_speed(): the four xor_block_*regs*
 * templates (presumably supplied by <asm-generic/xor.h>) unconditionally,
 * the SSE template only when cpu_has_xmm is true, and both MMX templates
 * only when cpu_has_mmx is true.  xor_speed() is defined outside this
 * file; presumably it benchmarks the template - confirm at its definition.
 */
867 | #undef XOR_TRY_TEMPLATES | ||
868 | #define XOR_TRY_TEMPLATES \ | ||
869 | do { \ | ||
870 | xor_speed(&xor_block_8regs); \ | ||
871 | xor_speed(&xor_block_8regs_p); \ | ||
872 | xor_speed(&xor_block_32regs); \ | ||
873 | xor_speed(&xor_block_32regs_p); \ | ||
874 | if (cpu_has_xmm) \ | ||
875 | xor_speed(&xor_block_pIII_sse); \ | ||
876 | if (cpu_has_mmx) { \ | ||
877 | xor_speed(&xor_block_pII_mmx); \ | ||
878 | xor_speed(&xor_block_p5_mmx); \ | ||
879 | } \ | ||
880 | } while (0) | ||
881 | |||
/*
 * XOR_SELECT_TEMPLATE(FASTEST) evaluates to &xor_block_pIII_sse whenever
 * cpu_has_xmm is true, regardless of FASTEST; otherwise it evaluates to
 * FASTEST unchanged.
 */
882 | /* We force the use of the SSE xor block because it can write around L2. | ||
883 | We may also be able to load into the L1 only depending on how the cpu | ||
884 | deals with a load to a line that is being prefetched. */ | ||
885 | #define XOR_SELECT_TEMPLATE(FASTEST) \ | ||
886 | (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST) | ||
887 | |||
888 | #endif /* ASM_X86__XOR_32_H */ | ||