diff options
author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 18:20:36 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 18:20:36 -0400 |
commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) | |
tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /include/asm-i386/xor.h |
Linux-2.6.12-rc2 (tag: v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'include/asm-i386/xor.h')
-rw-r--r-- | include/asm-i386/xor.h | 883 |
1 files changed, 883 insertions, 0 deletions
diff --git a/include/asm-i386/xor.h b/include/asm-i386/xor.h
new file mode 100644
index 000000000000..f80e2dbe1b56
--- /dev/null
+++ b/include/asm-i386/xor.h
@@ -0,0 +1,883 @@
/*
 * include/asm-i386/xor.h
 *
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
15 | |||
/*
 * High-speed RAID5 checksumming functions utilizing MMX instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */
20 | |||
/*
 * Assembler-text builders for the pII MMX routines.  x is a quadword index
 * (byte offset 8*x) and y selects MMX register %mm<y>.
 *   LD(x,y):  load quadword x of the buffer in asm operand %1 into %mm<y>
 *   ST(x,y):  store %mm<y> to quadword x of the buffer in asm operand %1
 *   XOn(x,y): pxor quadword x of the buffer in asm operand %(n+1) into %mm<y>
 * Arguments are stringified verbatim, so pass them without inner spaces.
 */
#define LD(x, y)	" movq 8*(" #x ")(%1), %%mm" #y " ;\n"
#define ST(x, y)	" movq %%mm" #y ", 8*(" #x ")(%1) ;\n"
#define XO1(x, y)	" pxor 8*(" #x ")(%2), %%mm" #y " ;\n"
#define XO2(x, y)	" pxor 8*(" #x ")(%3), %%mm" #y " ;\n"
#define XO3(x, y)	" pxor 8*(" #x ")(%4), %%mm" #y " ;\n"
#define XO4(x, y)	" pxor 8*(" #x ")(%5), %%mm" #y " ;\n"
27 | |||
28 | #include <asm/i387.h> | ||
29 | |||
30 | static void | ||
31 | xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) | ||
32 | { | ||
33 | unsigned long lines = bytes >> 7; | ||
34 | |||
35 | kernel_fpu_begin(); | ||
36 | |||
37 | __asm__ __volatile__ ( | ||
38 | #undef BLOCK | ||
39 | #define BLOCK(i) \ | ||
40 | LD(i,0) \ | ||
41 | LD(i+1,1) \ | ||
42 | LD(i+2,2) \ | ||
43 | LD(i+3,3) \ | ||
44 | XO1(i,0) \ | ||
45 | ST(i,0) \ | ||
46 | XO1(i+1,1) \ | ||
47 | ST(i+1,1) \ | ||
48 | XO1(i+2,2) \ | ||
49 | ST(i+2,2) \ | ||
50 | XO1(i+3,3) \ | ||
51 | ST(i+3,3) | ||
52 | |||
53 | " .align 32 ;\n" | ||
54 | " 1: ;\n" | ||
55 | |||
56 | BLOCK(0) | ||
57 | BLOCK(4) | ||
58 | BLOCK(8) | ||
59 | BLOCK(12) | ||
60 | |||
61 | " addl $128, %1 ;\n" | ||
62 | " addl $128, %2 ;\n" | ||
63 | " decl %0 ;\n" | ||
64 | " jnz 1b ;\n" | ||
65 | : "+r" (lines), | ||
66 | "+r" (p1), "+r" (p2) | ||
67 | : | ||
68 | : "memory"); | ||
69 | |||
70 | kernel_fpu_end(); | ||
71 | } | ||
72 | |||
73 | static void | ||
74 | xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
75 | unsigned long *p3) | ||
76 | { | ||
77 | unsigned long lines = bytes >> 7; | ||
78 | |||
79 | kernel_fpu_begin(); | ||
80 | |||
81 | __asm__ __volatile__ ( | ||
82 | #undef BLOCK | ||
83 | #define BLOCK(i) \ | ||
84 | LD(i,0) \ | ||
85 | LD(i+1,1) \ | ||
86 | LD(i+2,2) \ | ||
87 | LD(i+3,3) \ | ||
88 | XO1(i,0) \ | ||
89 | XO1(i+1,1) \ | ||
90 | XO1(i+2,2) \ | ||
91 | XO1(i+3,3) \ | ||
92 | XO2(i,0) \ | ||
93 | ST(i,0) \ | ||
94 | XO2(i+1,1) \ | ||
95 | ST(i+1,1) \ | ||
96 | XO2(i+2,2) \ | ||
97 | ST(i+2,2) \ | ||
98 | XO2(i+3,3) \ | ||
99 | ST(i+3,3) | ||
100 | |||
101 | " .align 32 ;\n" | ||
102 | " 1: ;\n" | ||
103 | |||
104 | BLOCK(0) | ||
105 | BLOCK(4) | ||
106 | BLOCK(8) | ||
107 | BLOCK(12) | ||
108 | |||
109 | " addl $128, %1 ;\n" | ||
110 | " addl $128, %2 ;\n" | ||
111 | " addl $128, %3 ;\n" | ||
112 | " decl %0 ;\n" | ||
113 | " jnz 1b ;\n" | ||
114 | : "+r" (lines), | ||
115 | "+r" (p1), "+r" (p2), "+r" (p3) | ||
116 | : | ||
117 | : "memory"); | ||
118 | |||
119 | kernel_fpu_end(); | ||
120 | } | ||
121 | |||
122 | static void | ||
123 | xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
124 | unsigned long *p3, unsigned long *p4) | ||
125 | { | ||
126 | unsigned long lines = bytes >> 7; | ||
127 | |||
128 | kernel_fpu_begin(); | ||
129 | |||
130 | __asm__ __volatile__ ( | ||
131 | #undef BLOCK | ||
132 | #define BLOCK(i) \ | ||
133 | LD(i,0) \ | ||
134 | LD(i+1,1) \ | ||
135 | LD(i+2,2) \ | ||
136 | LD(i+3,3) \ | ||
137 | XO1(i,0) \ | ||
138 | XO1(i+1,1) \ | ||
139 | XO1(i+2,2) \ | ||
140 | XO1(i+3,3) \ | ||
141 | XO2(i,0) \ | ||
142 | XO2(i+1,1) \ | ||
143 | XO2(i+2,2) \ | ||
144 | XO2(i+3,3) \ | ||
145 | XO3(i,0) \ | ||
146 | ST(i,0) \ | ||
147 | XO3(i+1,1) \ | ||
148 | ST(i+1,1) \ | ||
149 | XO3(i+2,2) \ | ||
150 | ST(i+2,2) \ | ||
151 | XO3(i+3,3) \ | ||
152 | ST(i+3,3) | ||
153 | |||
154 | " .align 32 ;\n" | ||
155 | " 1: ;\n" | ||
156 | |||
157 | BLOCK(0) | ||
158 | BLOCK(4) | ||
159 | BLOCK(8) | ||
160 | BLOCK(12) | ||
161 | |||
162 | " addl $128, %1 ;\n" | ||
163 | " addl $128, %2 ;\n" | ||
164 | " addl $128, %3 ;\n" | ||
165 | " addl $128, %4 ;\n" | ||
166 | " decl %0 ;\n" | ||
167 | " jnz 1b ;\n" | ||
168 | : "+r" (lines), | ||
169 | "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) | ||
170 | : | ||
171 | : "memory"); | ||
172 | |||
173 | kernel_fpu_end(); | ||
174 | } | ||
175 | |||
176 | |||
177 | static void | ||
178 | xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
179 | unsigned long *p3, unsigned long *p4, unsigned long *p5) | ||
180 | { | ||
181 | unsigned long lines = bytes >> 7; | ||
182 | |||
183 | kernel_fpu_begin(); | ||
184 | |||
185 | /* Make sure GCC forgets anything it knows about p4 or p5, | ||
186 | such that it won't pass to the asm volatile below a | ||
187 | register that is shared with any other variable. That's | ||
188 | because we modify p4 and p5 there, but we can't mark them | ||
189 | as read/write, otherwise we'd overflow the 10-asm-operands | ||
190 | limit of GCC < 3.1. */ | ||
191 | __asm__ ("" : "+r" (p4), "+r" (p5)); | ||
192 | |||
193 | __asm__ __volatile__ ( | ||
194 | #undef BLOCK | ||
195 | #define BLOCK(i) \ | ||
196 | LD(i,0) \ | ||
197 | LD(i+1,1) \ | ||
198 | LD(i+2,2) \ | ||
199 | LD(i+3,3) \ | ||
200 | XO1(i,0) \ | ||
201 | XO1(i+1,1) \ | ||
202 | XO1(i+2,2) \ | ||
203 | XO1(i+3,3) \ | ||
204 | XO2(i,0) \ | ||
205 | XO2(i+1,1) \ | ||
206 | XO2(i+2,2) \ | ||
207 | XO2(i+3,3) \ | ||
208 | XO3(i,0) \ | ||
209 | XO3(i+1,1) \ | ||
210 | XO3(i+2,2) \ | ||
211 | XO3(i+3,3) \ | ||
212 | XO4(i,0) \ | ||
213 | ST(i,0) \ | ||
214 | XO4(i+1,1) \ | ||
215 | ST(i+1,1) \ | ||
216 | XO4(i+2,2) \ | ||
217 | ST(i+2,2) \ | ||
218 | XO4(i+3,3) \ | ||
219 | ST(i+3,3) | ||
220 | |||
221 | " .align 32 ;\n" | ||
222 | " 1: ;\n" | ||
223 | |||
224 | BLOCK(0) | ||
225 | BLOCK(4) | ||
226 | BLOCK(8) | ||
227 | BLOCK(12) | ||
228 | |||
229 | " addl $128, %1 ;\n" | ||
230 | " addl $128, %2 ;\n" | ||
231 | " addl $128, %3 ;\n" | ||
232 | " addl $128, %4 ;\n" | ||
233 | " addl $128, %5 ;\n" | ||
234 | " decl %0 ;\n" | ||
235 | " jnz 1b ;\n" | ||
236 | : "+r" (lines), | ||
237 | "+r" (p1), "+r" (p2), "+r" (p3) | ||
238 | : "r" (p4), "r" (p5) | ||
239 | : "memory"); | ||
240 | |||
241 | /* p4 and p5 were modified, and now the variables are dead. | ||
242 | Clobber them just to be sure nobody does something stupid | ||
243 | like assuming they have some legal value. */ | ||
244 | __asm__ ("" : "=r" (p4), "=r" (p5)); | ||
245 | |||
246 | kernel_fpu_end(); | ||
247 | } | ||
248 | |||
/*
 * The pII MMX string-builder macros are finished with; drop them so the
 * same names can be redefined for the SSE routines further down.
 */
#undef BLOCK
#undef LD
#undef ST
#undef XO1
#undef XO2
#undef XO3
#undef XO4
256 | |||
/*
 * xor_p5_mmx_2 - XOR one source buffer into a destination using MMX.
 * @bytes: buffer length in bytes; only whole 64-byte lines (bytes >> 6)
 *         are processed, any remainder is left untouched.
 * @p1:    destination; every quadword becomes p1 ^ p2.
 * @p2:    source.
 *
 * Each loop pass handles 64 bytes using all eight MMX registers, with the
 * loads, XORs and stores interleaved by hand.  The MMX section is bracketed
 * by kernel_fpu_begin()/kernel_fpu_end().
 */
static void
xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 6;

	/*
	 * The asm loop only tests its counter at the bottom (decl/jnz), so a
	 * zero count would wrap around and run far beyond the buffers.
	 */
	if (!lines)
		return;

	kernel_fpu_begin();

	__asm__ __volatile__ (
	" .align 32 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" movq %%mm0, (%1) ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	kernel_fpu_end();
}
303 | |||
/*
 * xor_p5_mmx_3 - XOR two source buffers into a destination using MMX.
 * @bytes: buffer length in bytes; only whole 64-byte lines (bytes >> 6)
 *         are processed, any remainder is left untouched.
 * @p1:    destination; every quadword becomes p1 ^ p2 ^ p3.
 * @p2:    first source.
 * @p3:    second source.
 *
 * Each loop pass handles 64 bytes using all eight MMX registers, with the
 * loads, XORs and stores interleaved by hand.  The MMX section is bracketed
 * by kernel_fpu_begin()/kernel_fpu_end().
 */
static void
xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3)
{
	unsigned long lines = bytes >> 6;

	/*
	 * The asm loop only tests its counter at the bottom (decl/jnz), so a
	 * zero count would wrap around and run far beyond the buffers.
	 */
	if (!lines)
		return;

	kernel_fpu_begin();

	__asm__ __volatile__ (
	" .align 32,0x90 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" pxor (%3), %%mm0 ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" movq %%mm0, (%1) ;\n"
	" pxor 8(%3), %%mm1 ;\n"
	" pxor 16(%3), %%mm2 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" pxor 24(%3), %%mm3 ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" pxor 32(%3), %%mm4 ;\n"
	" pxor 40(%3), %%mm5 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" pxor 48(%3), %%mm6 ;\n"
	" pxor 56(%3), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" addl $64, %3 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory" );

	kernel_fpu_end();
}
360 | |||
/*
 * xor_p5_mmx_4 - XOR three source buffers into a destination using MMX.
 * @bytes: buffer length in bytes; only whole 64-byte lines (bytes >> 6)
 *         are processed, any remainder is left untouched.
 * @p1:    destination; every quadword becomes p1 ^ p2 ^ p3 ^ p4.
 * @p2:    first source.
 * @p3:    second source.
 * @p4:    third source.
 *
 * Each loop pass handles 64 bytes using all eight MMX registers, with the
 * loads, XORs and stores interleaved by hand.  The MMX section is bracketed
 * by kernel_fpu_begin()/kernel_fpu_end().
 */
static void
xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 6;

	/*
	 * The asm loop only tests its counter at the bottom (decl/jnz), so a
	 * zero count would wrap around and run far beyond the buffers.
	 */
	if (!lines)
		return;

	kernel_fpu_begin();

	__asm__ __volatile__ (
	" .align 32,0x90 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" pxor (%3), %%mm0 ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" pxor 8(%3), %%mm1 ;\n"
	" pxor (%4), %%mm0 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" pxor 16(%3), %%mm2 ;\n"
	" pxor 8(%4), %%mm1 ;\n"
	" movq %%mm0, (%1) ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" pxor 16(%4), %%mm2 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" pxor 24(%3), %%mm3 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" pxor 32(%3), %%mm4 ;\n"
	" pxor 24(%4), %%mm3 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" pxor 40(%3), %%mm5 ;\n"
	" pxor 32(%4), %%mm4 ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" pxor 40(%4), %%mm5 ;\n"
	" pxor 48(%3), %%mm6 ;\n"
	" pxor 56(%3), %%mm7 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 48(%4), %%mm6 ;\n"
	" pxor 56(%4), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" addl $64, %3 ;\n"
	" addl $64, %4 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	kernel_fpu_end();
}
426 | |||
/*
 * xor_p5_mmx_5 - XOR four source buffers into a destination using MMX.
 * @bytes: buffer length in bytes; only whole 64-byte lines (bytes >> 6)
 *         are processed, any remainder is left untouched.
 * @p1:    destination; every quadword becomes p1 ^ p2 ^ p3 ^ p4 ^ p5.
 * @p2:    first source.
 * @p3:    second source.
 * @p4:    third source.
 * @p5:    fourth source.
 *
 * Each loop pass handles 64 bytes using all eight MMX registers, with the
 * loads, XORs and stores interleaved by hand.  The MMX section is bracketed
 * by kernel_fpu_begin()/kernel_fpu_end().
 */
static void
xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 6;

	/*
	 * The asm loop only tests its counter at the bottom (decl/jnz), so a
	 * zero count would wrap around and run far beyond the buffers.
	 */
	if (!lines)
		return;

	kernel_fpu_begin();

	/* Make sure GCC forgets anything it knows about p4 or p5,
	   such that it won't pass to the asm volatile below a
	   register that is shared with any other variable.  That's
	   because we modify p4 and p5 there, but we can't mark them
	   as read/write, otherwise we'd overflow the 10-asm-operands
	   limit of GCC < 3.1.  */
	__asm__ ("" : "+r" (p4), "+r" (p5));

	__asm__ __volatile__ (
	" .align 32,0x90 ;\n"
	" 1: ;\n"
	" movq (%1), %%mm0 ;\n"
	" movq 8(%1), %%mm1 ;\n"
	" pxor (%2), %%mm0 ;\n"
	" pxor 8(%2), %%mm1 ;\n"
	" movq 16(%1), %%mm2 ;\n"
	" pxor (%3), %%mm0 ;\n"
	" pxor 8(%3), %%mm1 ;\n"
	" pxor 16(%2), %%mm2 ;\n"
	" pxor (%4), %%mm0 ;\n"
	" pxor 8(%4), %%mm1 ;\n"
	" pxor 16(%3), %%mm2 ;\n"
	" movq 24(%1), %%mm3 ;\n"
	" pxor (%5), %%mm0 ;\n"
	" pxor 8(%5), %%mm1 ;\n"
	" movq %%mm0, (%1) ;\n"
	" pxor 16(%4), %%mm2 ;\n"
	" pxor 24(%2), %%mm3 ;\n"
	" movq %%mm1, 8(%1) ;\n"
	" pxor 16(%5), %%mm2 ;\n"
	" pxor 24(%3), %%mm3 ;\n"
	" movq 32(%1), %%mm4 ;\n"
	" movq %%mm2, 16(%1) ;\n"
	" pxor 24(%4), %%mm3 ;\n"
	" pxor 32(%2), %%mm4 ;\n"
	" movq 40(%1), %%mm5 ;\n"
	" pxor 24(%5), %%mm3 ;\n"
	" pxor 32(%3), %%mm4 ;\n"
	" pxor 40(%2), %%mm5 ;\n"
	" movq %%mm3, 24(%1) ;\n"
	" pxor 32(%4), %%mm4 ;\n"
	" pxor 40(%3), %%mm5 ;\n"
	" movq 48(%1), %%mm6 ;\n"
	" movq 56(%1), %%mm7 ;\n"
	" pxor 32(%5), %%mm4 ;\n"
	" pxor 40(%4), %%mm5 ;\n"
	" pxor 48(%2), %%mm6 ;\n"
	" pxor 56(%2), %%mm7 ;\n"
	" movq %%mm4, 32(%1) ;\n"
	" pxor 48(%3), %%mm6 ;\n"
	" pxor 56(%3), %%mm7 ;\n"
	" pxor 40(%5), %%mm5 ;\n"
	" pxor 48(%4), %%mm6 ;\n"
	" pxor 56(%4), %%mm7 ;\n"
	" movq %%mm5, 40(%1) ;\n"
	" pxor 48(%5), %%mm6 ;\n"
	" pxor 56(%5), %%mm7 ;\n"
	" movq %%mm6, 48(%1) ;\n"
	" movq %%mm7, 56(%1) ;\n"

	" addl $64, %1 ;\n"
	" addl $64, %2 ;\n"
	" addl $64, %3 ;\n"
	" addl $64, %4 ;\n"
	" addl $64, %5 ;\n"
	" decl %0 ;\n"
	" jnz 1b ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/* p4 and p5 were modified, and now the variables are dead.
	   Clobber them just to be sure nobody does something stupid
	   like assuming they have some legal value.  */
	__asm__ ("" : "=r" (p4), "=r" (p5));

	kernel_fpu_end();
}
514 | |||
515 | static struct xor_block_template xor_block_pII_mmx = { | ||
516 | .name = "pII_mmx", | ||
517 | .do_2 = xor_pII_mmx_2, | ||
518 | .do_3 = xor_pII_mmx_3, | ||
519 | .do_4 = xor_pII_mmx_4, | ||
520 | .do_5 = xor_pII_mmx_5, | ||
521 | }; | ||
522 | |||
523 | static struct xor_block_template xor_block_p5_mmx = { | ||
524 | .name = "p5_mmx", | ||
525 | .do_2 = xor_p5_mmx_2, | ||
526 | .do_3 = xor_p5_mmx_3, | ||
527 | .do_4 = xor_p5_mmx_4, | ||
528 | .do_5 = xor_p5_mmx_5, | ||
529 | }; | ||
530 | |||
/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */
535 | |||
/*
 * XMMS_SAVE - enter an SSE section.
 *
 * Disables preemption, copies CR0 into the caller's local `cr0`, executes
 * clts, and spills xmm0-xmm3 into the caller's local `xmm_save` buffer
 * (four 16-byte slots).  Both locals must be in scope at the point of use.
 * Pair with XMMS_RESTORE.
 */
#define XMMS_SAVE						\
	do {							\
		preempt_disable();				\
		__asm__ __volatile__ (				\
			"movl %%cr0,%0 ;\n\t"			\
			"clts ;\n\t"				\
			"movups %%xmm0,(%1) ;\n\t"		\
			"movups %%xmm1,0x10(%1) ;\n\t"		\
			"movups %%xmm2,0x20(%1) ;\n\t"		\
			"movups %%xmm3,0x30(%1) ;\n\t"		\
			: "=&r" (cr0)				\
			: "r" (xmm_save)			\
			: "memory");				\
	} while (0)
549 | |||
/*
 * XMMS_RESTORE - leave an SSE section opened with XMMS_SAVE.
 *
 * Issues sfence, reloads xmm0-xmm3 from the caller's `xmm_save` buffer,
 * writes the saved `cr0` value back to CR0, and re-enables preemption.
 */
#define XMMS_RESTORE						\
	do {							\
		__asm__ __volatile__ (				\
			"sfence ;\n\t"				\
			"movups (%1),%%xmm0 ;\n\t"		\
			"movups 0x10(%1),%%xmm1 ;\n\t"		\
			"movups 0x20(%1),%%xmm2 ;\n\t"		\
			"movups 0x30(%1),%%xmm3 ;\n\t"		\
			"movl %0,%%cr0 ;\n\t"			\
			:					\
			: "r" (cr0), "r" (xmm_save)		\
			: "memory");				\
		preempt_enable();				\
	} while (0)
563 | |||
/* Request 16-byte alignment for a variable (used for the xmm save area). */
#define ALIGN16 __attribute__((aligned(16)))

/*
 * Assembler-text builders for the SSE routines.  x is a 16-byte slot index
 * and y selects register %xmm<y>.
 *   OFFS(x):    byte offset of slot x
 *   PF_OFFS(x): the same offset, 256 bytes further on (prefetch distance)
 *   LD / ST:    movaps between slot x of asm operand %1 and %xmm<y>
 *   PFn(x):     prefetchnta of slot x, 256 bytes ahead, in operand %(n+1)
 *   XOn(x,y):   xorps slot x of operand %(n+1) into %xmm<y>
 * Arguments are stringified verbatim, so pass them without inner spaces.
 */
#define OFFS(x)		"16*(" #x ")"
#define PF_OFFS(x)	"256+16*(" #x ")"

#define LD(x, y)	" movaps " OFFS(x) "(%1), %%xmm" #y " ;\n"
#define ST(x, y)	" movaps %%xmm" #y ", " OFFS(x) "(%1) ;\n"

#define PF0(x)		" prefetchnta " PF_OFFS(x) "(%1) ;\n"
#define PF1(x)		" prefetchnta " PF_OFFS(x) "(%2) ;\n"
#define PF2(x)		" prefetchnta " PF_OFFS(x) "(%3) ;\n"
#define PF3(x)		" prefetchnta " PF_OFFS(x) "(%4) ;\n"
#define PF4(x)		" prefetchnta " PF_OFFS(x) "(%5) ;\n"
#define PF5(x)		" prefetchnta " PF_OFFS(x) "(%6) ;\n"

#define XO1(x, y)	" xorps " OFFS(x) "(%2), %%xmm" #y " ;\n"
#define XO2(x, y)	" xorps " OFFS(x) "(%3), %%xmm" #y " ;\n"
#define XO3(x, y)	" xorps " OFFS(x) "(%4), %%xmm" #y " ;\n"
#define XO4(x, y)	" xorps " OFFS(x) "(%5), %%xmm" #y " ;\n"
#define XO5(x, y)	" xorps " OFFS(x) "(%6), %%xmm" #y " ;\n"
581 | |||
582 | |||
583 | static void | ||
584 | xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) | ||
585 | { | ||
586 | unsigned long lines = bytes >> 8; | ||
587 | char xmm_save[16*4] ALIGN16; | ||
588 | int cr0; | ||
589 | |||
590 | XMMS_SAVE; | ||
591 | |||
592 | __asm__ __volatile__ ( | ||
593 | #undef BLOCK | ||
594 | #define BLOCK(i) \ | ||
595 | LD(i,0) \ | ||
596 | LD(i+1,1) \ | ||
597 | PF1(i) \ | ||
598 | PF1(i+2) \ | ||
599 | LD(i+2,2) \ | ||
600 | LD(i+3,3) \ | ||
601 | PF0(i+4) \ | ||
602 | PF0(i+6) \ | ||
603 | XO1(i,0) \ | ||
604 | XO1(i+1,1) \ | ||
605 | XO1(i+2,2) \ | ||
606 | XO1(i+3,3) \ | ||
607 | ST(i,0) \ | ||
608 | ST(i+1,1) \ | ||
609 | ST(i+2,2) \ | ||
610 | ST(i+3,3) \ | ||
611 | |||
612 | |||
613 | PF0(0) | ||
614 | PF0(2) | ||
615 | |||
616 | " .align 32 ;\n" | ||
617 | " 1: ;\n" | ||
618 | |||
619 | BLOCK(0) | ||
620 | BLOCK(4) | ||
621 | BLOCK(8) | ||
622 | BLOCK(12) | ||
623 | |||
624 | " addl $256, %1 ;\n" | ||
625 | " addl $256, %2 ;\n" | ||
626 | " decl %0 ;\n" | ||
627 | " jnz 1b ;\n" | ||
628 | : "+r" (lines), | ||
629 | "+r" (p1), "+r" (p2) | ||
630 | : | ||
631 | : "memory"); | ||
632 | |||
633 | XMMS_RESTORE; | ||
634 | } | ||
635 | |||
636 | static void | ||
637 | xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
638 | unsigned long *p3) | ||
639 | { | ||
640 | unsigned long lines = bytes >> 8; | ||
641 | char xmm_save[16*4] ALIGN16; | ||
642 | int cr0; | ||
643 | |||
644 | XMMS_SAVE; | ||
645 | |||
646 | __asm__ __volatile__ ( | ||
647 | #undef BLOCK | ||
648 | #define BLOCK(i) \ | ||
649 | PF1(i) \ | ||
650 | PF1(i+2) \ | ||
651 | LD(i,0) \ | ||
652 | LD(i+1,1) \ | ||
653 | LD(i+2,2) \ | ||
654 | LD(i+3,3) \ | ||
655 | PF2(i) \ | ||
656 | PF2(i+2) \ | ||
657 | PF0(i+4) \ | ||
658 | PF0(i+6) \ | ||
659 | XO1(i,0) \ | ||
660 | XO1(i+1,1) \ | ||
661 | XO1(i+2,2) \ | ||
662 | XO1(i+3,3) \ | ||
663 | XO2(i,0) \ | ||
664 | XO2(i+1,1) \ | ||
665 | XO2(i+2,2) \ | ||
666 | XO2(i+3,3) \ | ||
667 | ST(i,0) \ | ||
668 | ST(i+1,1) \ | ||
669 | ST(i+2,2) \ | ||
670 | ST(i+3,3) \ | ||
671 | |||
672 | |||
673 | PF0(0) | ||
674 | PF0(2) | ||
675 | |||
676 | " .align 32 ;\n" | ||
677 | " 1: ;\n" | ||
678 | |||
679 | BLOCK(0) | ||
680 | BLOCK(4) | ||
681 | BLOCK(8) | ||
682 | BLOCK(12) | ||
683 | |||
684 | " addl $256, %1 ;\n" | ||
685 | " addl $256, %2 ;\n" | ||
686 | " addl $256, %3 ;\n" | ||
687 | " decl %0 ;\n" | ||
688 | " jnz 1b ;\n" | ||
689 | : "+r" (lines), | ||
690 | "+r" (p1), "+r"(p2), "+r"(p3) | ||
691 | : | ||
692 | : "memory" ); | ||
693 | |||
694 | XMMS_RESTORE; | ||
695 | } | ||
696 | |||
697 | static void | ||
698 | xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
699 | unsigned long *p3, unsigned long *p4) | ||
700 | { | ||
701 | unsigned long lines = bytes >> 8; | ||
702 | char xmm_save[16*4] ALIGN16; | ||
703 | int cr0; | ||
704 | |||
705 | XMMS_SAVE; | ||
706 | |||
707 | __asm__ __volatile__ ( | ||
708 | #undef BLOCK | ||
709 | #define BLOCK(i) \ | ||
710 | PF1(i) \ | ||
711 | PF1(i+2) \ | ||
712 | LD(i,0) \ | ||
713 | LD(i+1,1) \ | ||
714 | LD(i+2,2) \ | ||
715 | LD(i+3,3) \ | ||
716 | PF2(i) \ | ||
717 | PF2(i+2) \ | ||
718 | XO1(i,0) \ | ||
719 | XO1(i+1,1) \ | ||
720 | XO1(i+2,2) \ | ||
721 | XO1(i+3,3) \ | ||
722 | PF3(i) \ | ||
723 | PF3(i+2) \ | ||
724 | PF0(i+4) \ | ||
725 | PF0(i+6) \ | ||
726 | XO2(i,0) \ | ||
727 | XO2(i+1,1) \ | ||
728 | XO2(i+2,2) \ | ||
729 | XO2(i+3,3) \ | ||
730 | XO3(i,0) \ | ||
731 | XO3(i+1,1) \ | ||
732 | XO3(i+2,2) \ | ||
733 | XO3(i+3,3) \ | ||
734 | ST(i,0) \ | ||
735 | ST(i+1,1) \ | ||
736 | ST(i+2,2) \ | ||
737 | ST(i+3,3) \ | ||
738 | |||
739 | |||
740 | PF0(0) | ||
741 | PF0(2) | ||
742 | |||
743 | " .align 32 ;\n" | ||
744 | " 1: ;\n" | ||
745 | |||
746 | BLOCK(0) | ||
747 | BLOCK(4) | ||
748 | BLOCK(8) | ||
749 | BLOCK(12) | ||
750 | |||
751 | " addl $256, %1 ;\n" | ||
752 | " addl $256, %2 ;\n" | ||
753 | " addl $256, %3 ;\n" | ||
754 | " addl $256, %4 ;\n" | ||
755 | " decl %0 ;\n" | ||
756 | " jnz 1b ;\n" | ||
757 | : "+r" (lines), | ||
758 | "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) | ||
759 | : | ||
760 | : "memory" ); | ||
761 | |||
762 | XMMS_RESTORE; | ||
763 | } | ||
764 | |||
765 | static void | ||
766 | xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
767 | unsigned long *p3, unsigned long *p4, unsigned long *p5) | ||
768 | { | ||
769 | unsigned long lines = bytes >> 8; | ||
770 | char xmm_save[16*4] ALIGN16; | ||
771 | int cr0; | ||
772 | |||
773 | XMMS_SAVE; | ||
774 | |||
775 | /* Make sure GCC forgets anything it knows about p4 or p5, | ||
776 | such that it won't pass to the asm volatile below a | ||
777 | register that is shared with any other variable. That's | ||
778 | because we modify p4 and p5 there, but we can't mark them | ||
779 | as read/write, otherwise we'd overflow the 10-asm-operands | ||
780 | limit of GCC < 3.1. */ | ||
781 | __asm__ ("" : "+r" (p4), "+r" (p5)); | ||
782 | |||
783 | __asm__ __volatile__ ( | ||
784 | #undef BLOCK | ||
785 | #define BLOCK(i) \ | ||
786 | PF1(i) \ | ||
787 | PF1(i+2) \ | ||
788 | LD(i,0) \ | ||
789 | LD(i+1,1) \ | ||
790 | LD(i+2,2) \ | ||
791 | LD(i+3,3) \ | ||
792 | PF2(i) \ | ||
793 | PF2(i+2) \ | ||
794 | XO1(i,0) \ | ||
795 | XO1(i+1,1) \ | ||
796 | XO1(i+2,2) \ | ||
797 | XO1(i+3,3) \ | ||
798 | PF3(i) \ | ||
799 | PF3(i+2) \ | ||
800 | XO2(i,0) \ | ||
801 | XO2(i+1,1) \ | ||
802 | XO2(i+2,2) \ | ||
803 | XO2(i+3,3) \ | ||
804 | PF4(i) \ | ||
805 | PF4(i+2) \ | ||
806 | PF0(i+4) \ | ||
807 | PF0(i+6) \ | ||
808 | XO3(i,0) \ | ||
809 | XO3(i+1,1) \ | ||
810 | XO3(i+2,2) \ | ||
811 | XO3(i+3,3) \ | ||
812 | XO4(i,0) \ | ||
813 | XO4(i+1,1) \ | ||
814 | XO4(i+2,2) \ | ||
815 | XO4(i+3,3) \ | ||
816 | ST(i,0) \ | ||
817 | ST(i+1,1) \ | ||
818 | ST(i+2,2) \ | ||
819 | ST(i+3,3) \ | ||
820 | |||
821 | |||
822 | PF0(0) | ||
823 | PF0(2) | ||
824 | |||
825 | " .align 32 ;\n" | ||
826 | " 1: ;\n" | ||
827 | |||
828 | BLOCK(0) | ||
829 | BLOCK(4) | ||
830 | BLOCK(8) | ||
831 | BLOCK(12) | ||
832 | |||
833 | " addl $256, %1 ;\n" | ||
834 | " addl $256, %2 ;\n" | ||
835 | " addl $256, %3 ;\n" | ||
836 | " addl $256, %4 ;\n" | ||
837 | " addl $256, %5 ;\n" | ||
838 | " decl %0 ;\n" | ||
839 | " jnz 1b ;\n" | ||
840 | : "+r" (lines), | ||
841 | "+r" (p1), "+r" (p2), "+r" (p3) | ||
842 | : "r" (p4), "r" (p5) | ||
843 | : "memory"); | ||
844 | |||
845 | /* p4 and p5 were modified, and now the variables are dead. | ||
846 | Clobber them just to be sure nobody does something stupid | ||
847 | like assuming they have some legal value. */ | ||
848 | __asm__ ("" : "=r" (p4), "=r" (p5)); | ||
849 | |||
850 | XMMS_RESTORE; | ||
851 | } | ||
852 | |||
853 | static struct xor_block_template xor_block_pIII_sse = { | ||
854 | .name = "pIII_sse", | ||
855 | .do_2 = xor_sse_2, | ||
856 | .do_3 = xor_sse_3, | ||
857 | .do_4 = xor_sse_4, | ||
858 | .do_5 = xor_sse_5, | ||
859 | }; | ||
860 | |||
861 | /* Also try the generic routines. */ | ||
862 | #include <asm-generic/xor.h> | ||
863 | |||
/*
 * Replace any earlier XOR_TRY_TEMPLATES with the i386 candidate list.  Every
 * candidate is handed to xor_speed() (defined elsewhere; presumably it times
 * the template - confirm against the caller): the four generic templates
 * first, then the SSE one when cpu_has_xmm is set, then both MMX ones when
 * cpu_has_mmx is set.
 */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES					\
	do {							\
		xor_speed(&xor_block_8regs);			\
		xor_speed(&xor_block_8regs_p);			\
		xor_speed(&xor_block_32regs);			\
		xor_speed(&xor_block_32regs_p);			\
		if (cpu_has_xmm)				\
			xor_speed(&xor_block_pIII_sse);		\
		if (cpu_has_mmx) {				\
			xor_speed(&xor_block_pII_mmx);		\
			xor_speed(&xor_block_p5_mmx);		\
		}						\
	} while (0)
878 | |||
/*
 * XOR_SELECT_TEMPLATE - choose the template to use.
 * @FASTEST: pointer to the template the caller would otherwise pick.
 *
 * We force the use of the SSE xor block because it can write around L2.
 * We may also be able to load into the L1 only depending on how the cpu
 * deals with a load to a line that is being prefetched.
 *
 * Evaluates to &xor_block_pIII_sse when cpu_has_xmm is set, otherwise to
 * FASTEST.  The argument is parenthesized so that any expression may be
 * passed without changing how the conditional parses.
 */
#define XOR_SELECT_TEMPLATE(FASTEST) \
	(cpu_has_xmm ? &xor_block_pIII_sse : (FASTEST))