 include/asm-x86/xor_32.h | 494 ++++++++++++++++++++++++-----------------------
 1 file changed, 248 insertions(+), 246 deletions(-)

diff --git a/include/asm-x86/xor_32.h b/include/asm-x86/xor_32.h
index a41ef1bdd424..067b5c1835a3 100644
--- a/include/asm-x86/xor_32.h
+++ b/include/asm-x86/xor_32.h
@@ -16,12 +16,12 @@
  * Copyright (C) 1998 Ingo Molnar.
  */

-#define LD(x,y) " movq 8*("#x")(%1), %%mm"#y" ;\n"
-#define ST(x,y) " movq %%mm"#y", 8*("#x")(%1) ;\n"
-#define XO1(x,y) " pxor 8*("#x")(%2), %%mm"#y" ;\n"
-#define XO2(x,y) " pxor 8*("#x")(%3), %%mm"#y" ;\n"
-#define XO3(x,y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"
-#define XO4(x,y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"
+#define LD(x, y) " movq 8*("#x")(%1), %%mm"#y" ;\n"
+#define ST(x, y) " movq %%mm"#y", 8*("#x")(%1) ;\n"
+#define XO1(x, y) " pxor 8*("#x")(%2), %%mm"#y" ;\n"
+#define XO2(x, y) " pxor 8*("#x")(%3), %%mm"#y" ;\n"
+#define XO3(x, y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"
+#define XO4(x, y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"

 #include <asm/i387.h>

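A note on the macros being cleaned up here: "#x" stringifies the macro argument, and adjacent string literals are concatenated, so each invocation expands to one line of the asm template at compile time. A standalone sketch of the same pattern (illustration only, not part of the patch):

#include <stdio.h>

/* Same stringify-and-concatenate trick as the LD/ST/XO macros above. */
#define LD(x, y) " movq 8*("#x")(%1), %%mm"#y" ;\n"

int main(void)
{
        /* LD(2, 0) becomes the single literal " movq 8*(2)(%1), %%mm0 ;\n" */
        printf("%s", LD(2, 0));
        return 0;
}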
@@ -32,24 +32,24 @@ xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)

 kernel_fpu_begin();

-__asm__ __volatile__ (
+asm volatile(
 #undef BLOCK
 #define BLOCK(i) \
-LD(i,0) \
-LD(i+1,1) \
-LD(i+2,2) \
-LD(i+3,3) \
-XO1(i,0) \
-ST(i,0) \
-XO1(i+1,1) \
-ST(i+1,1) \
-XO1(i+2,2) \
-ST(i+2,2) \
-XO1(i+3,3) \
-ST(i+3,3)
+LD(i, 0) \
+LD(i + 1, 1) \
+LD(i + 2, 2) \
+LD(i + 3, 3) \
+XO1(i, 0) \
+ST(i, 0) \
+XO1(i+1, 1) \
+ST(i+1, 1) \
+XO1(i + 2, 2) \
+ST(i + 2, 2) \
+XO1(i + 3, 3) \
+ST(i + 3, 3)

 " .align 32 ;\n"
 " 1: ;\n"

 BLOCK(0)
 BLOCK(4)
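All of the two-source routines in this file compute the same result as the plain C reference loop below (a sketch for orientation, not kernel code); the MMX versions just do it 64 bits at a time through mm0-mm3, unrolled so each BLOCK group covers a run of quadwords:

#include <stddef.h>

/* XOR the second source buffer into the first, word by word. */
static void xor_2_ref(size_t bytes, unsigned long *p1, const unsigned long *p2)
{
        size_t i, words = bytes / sizeof(unsigned long);

        for (i = 0; i < words; i++)
                p1[i] ^= p2[i];
}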
@@ -76,25 +76,25 @@ xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,

 kernel_fpu_begin();

-__asm__ __volatile__ (
+asm volatile(
 #undef BLOCK
 #define BLOCK(i) \
-LD(i,0) \
-LD(i+1,1) \
-LD(i+2,2) \
-LD(i+3,3) \
-XO1(i,0) \
-XO1(i+1,1) \
-XO1(i+2,2) \
-XO1(i+3,3) \
-XO2(i,0) \
-ST(i,0) \
-XO2(i+1,1) \
-ST(i+1,1) \
-XO2(i+2,2) \
-ST(i+2,2) \
-XO2(i+3,3) \
-ST(i+3,3)
+LD(i, 0) \
+LD(i + 1, 1) \
+LD(i + 2, 2) \
+LD(i + 3, 3) \
+XO1(i, 0) \
+XO1(i + 1, 1) \
+XO1(i + 2, 2) \
+XO1(i + 3, 3) \
+XO2(i, 0) \
+ST(i, 0) \
+XO2(i + 1, 1) \
+ST(i + 1, 1) \
+XO2(i + 2, 2) \
+ST(i + 2, 2) \
+XO2(i + 3, 3) \
+ST(i + 3, 3)

 " .align 32 ;\n"
 " 1: ;\n"
@@ -125,29 +125,29 @@ xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,

 kernel_fpu_begin();

-__asm__ __volatile__ (
+asm volatile(
 #undef BLOCK
 #define BLOCK(i) \
-LD(i,0) \
-LD(i+1,1) \
-LD(i+2,2) \
-LD(i+3,3) \
-XO1(i,0) \
-XO1(i+1,1) \
-XO1(i+2,2) \
-XO1(i+3,3) \
-XO2(i,0) \
-XO2(i+1,1) \
-XO2(i+2,2) \
-XO2(i+3,3) \
-XO3(i,0) \
-ST(i,0) \
-XO3(i+1,1) \
-ST(i+1,1) \
-XO3(i+2,2) \
-ST(i+2,2) \
-XO3(i+3,3) \
-ST(i+3,3)
+LD(i, 0) \
+LD(i + 1, 1) \
+LD(i + 2, 2) \
+LD(i + 3, 3) \
+XO1(i, 0) \
+XO1(i + 1, 1) \
+XO1(i + 2, 2) \
+XO1(i + 3, 3) \
+XO2(i, 0) \
+XO2(i + 1, 1) \
+XO2(i + 2, 2) \
+XO2(i + 3, 3) \
+XO3(i, 0) \
+ST(i, 0) \
+XO3(i + 1, 1) \
+ST(i + 1, 1) \
+XO3(i + 2, 2) \
+ST(i + 2, 2) \
+XO3(i + 3, 3) \
+ST(i + 3, 3)

 " .align 32 ;\n"
 " 1: ;\n"
@@ -186,35 +186,35 @@ xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 because we modify p4 and p5 there, but we can't mark them
 as read/write, otherwise we'd overflow the 10-asm-operands
 limit of GCC < 3.1. */
-__asm__ ("" : "+r" (p4), "+r" (p5));
+asm("" : "+r" (p4), "+r" (p5));

-__asm__ __volatile__ (
+asm volatile(
 #undef BLOCK
 #define BLOCK(i) \
-LD(i,0) \
-LD(i+1,1) \
-LD(i+2,2) \
-LD(i+3,3) \
-XO1(i,0) \
-XO1(i+1,1) \
-XO1(i+2,2) \
-XO1(i+3,3) \
-XO2(i,0) \
-XO2(i+1,1) \
-XO2(i+2,2) \
-XO2(i+3,3) \
-XO3(i,0) \
-XO3(i+1,1) \
-XO3(i+2,2) \
-XO3(i+3,3) \
-XO4(i,0) \
-ST(i,0) \
-XO4(i+1,1) \
-ST(i+1,1) \
-XO4(i+2,2) \
-ST(i+2,2) \
-XO4(i+3,3) \
-ST(i+3,3)
+LD(i, 0) \
+LD(i + 1, 1) \
+LD(i + 2, 2) \
+LD(i + 3, 3) \
+XO1(i, 0) \
+XO1(i + 1, 1) \
+XO1(i + 2, 2) \
+XO1(i + 3, 3) \
+XO2(i, 0) \
+XO2(i + 1, 1) \
+XO2(i + 2, 2) \
+XO2(i + 3, 3) \
+XO3(i, 0) \
+XO3(i + 1, 1) \
+XO3(i + 2, 2) \
+XO3(i + 3, 3) \
+XO4(i, 0) \
+ST(i, 0) \
+XO4(i + 1, 1) \
+ST(i + 1, 1) \
+XO4(i + 2, 2) \
+ST(i + 2, 2) \
+XO4(i + 3, 3) \
+ST(i + 3, 3)

 " .align 32 ;\n"
 " 1: ;\n"
@@ -233,13 +233,13 @@ xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 " jnz 1b ;\n"
 : "+r" (lines),
 "+r" (p1), "+r" (p2), "+r" (p3)
-: "r" (p4), "r" (p5)
+: "r" (p4), "r" (p5)
 : "memory");

 /* p4 and p5 were modified, and now the variables are dead.
 Clobber them just to be sure nobody does something stupid
 like assuming they have some legal value. */
-__asm__ ("" : "=r" (p4), "=r" (p5));
+asm("" : "=r" (p4), "=r" (p5));

 kernel_fpu_end();
 }
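The comment above describes a classic constraint trick, needed because xor_pII_mmx_5 wants more operands than old GCC allowed: an empty asm with "+r" makes the compiler treat p4 and p5 as read and written without spending read/write slots in the big asm, and a second empty asm with "=r" afterwards marks the stale values dead. A minimal standalone sketch (the helper name is hypothetical, not from the patch):

static void five_source_skeleton(unsigned long *p4, unsigned long *p5)
{
        /* Pretend p4/p5 may be modified here, without making them
         * read/write operands of the main asm. */
        asm("" : "+r" (p4), "+r" (p5));

        /* ... the main asm block would take "r" (p4), "r" (p5) as
         * plain inputs and advance them behind the compiler's back ... */

        /* Tell the compiler the old values are dead so nothing reuses
         * the stale pointers. */
        asm("" : "=r" (p4), "=r" (p5));
}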
@@ -259,7 +259,7 @@ xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)

 kernel_fpu_begin();

-__asm__ __volatile__ (
+asm volatile(
 " .align 32 ;\n"
 " 1: ;\n"
 " movq (%1), %%mm0 ;\n"
@@ -286,7 +286,7 @@ xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
 " pxor 56(%2), %%mm7 ;\n"
 " movq %%mm6, 48(%1) ;\n"
 " movq %%mm7, 56(%1) ;\n"
-
+
 " addl $64, %1 ;\n"
 " addl $64, %2 ;\n"
 " decl %0 ;\n"
@@ -307,7 +307,7 @@ xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,

 kernel_fpu_begin();

-__asm__ __volatile__ (
+asm volatile(
 " .align 32,0x90 ;\n"
 " 1: ;\n"
 " movq (%1), %%mm0 ;\n"
@@ -342,7 +342,7 @@ xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 " pxor 56(%3), %%mm7 ;\n"
 " movq %%mm6, 48(%1) ;\n"
 " movq %%mm7, 56(%1) ;\n"
-
+
 " addl $64, %1 ;\n"
 " addl $64, %2 ;\n"
 " addl $64, %3 ;\n"
@@ -364,7 +364,7 @@ xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,

 kernel_fpu_begin();

-__asm__ __volatile__ (
+asm volatile(
 " .align 32,0x90 ;\n"
 " 1: ;\n"
 " movq (%1), %%mm0 ;\n"
@@ -407,7 +407,7 @@ xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 " pxor 56(%4), %%mm7 ;\n"
 " movq %%mm6, 48(%1) ;\n"
 " movq %%mm7, 56(%1) ;\n"
-
+
 " addl $64, %1 ;\n"
 " addl $64, %2 ;\n"
 " addl $64, %3 ;\n"
@@ -436,9 +436,9 @@ xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 because we modify p4 and p5 there, but we can't mark them
 as read/write, otherwise we'd overflow the 10-asm-operands
 limit of GCC < 3.1. */
-__asm__ ("" : "+r" (p4), "+r" (p5));
+asm("" : "+r" (p4), "+r" (p5));

-__asm__ __volatile__ (
+asm volatile(
 " .align 32,0x90 ;\n"
 " 1: ;\n"
 " movq (%1), %%mm0 ;\n"
@@ -489,7 +489,7 @@ xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 " pxor 56(%5), %%mm7 ;\n"
 " movq %%mm6, 48(%1) ;\n"
 " movq %%mm7, 56(%1) ;\n"
-
+
 " addl $64, %1 ;\n"
 " addl $64, %2 ;\n"
 " addl $64, %3 ;\n"
@@ -505,7 +505,7 @@ xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 /* p4 and p5 were modified, and now the variables are dead.
 Clobber them just to be sure nobody does something stupid
 like assuming they have some legal value. */
-__asm__ ("" : "=r" (p4), "=r" (p5));
+asm("" : "=r" (p4), "=r" (p5));

 kernel_fpu_end();
 }
@@ -531,11 +531,12 @@ static struct xor_block_template xor_block_p5_mmx = {
  * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
  */

-#define XMMS_SAVE do { \
+#define XMMS_SAVE \
+do { \
 preempt_disable(); \
 cr0 = read_cr0(); \
 clts(); \
-__asm__ __volatile__ ( \
+asm volatile( \
 "movups %%xmm0,(%0) ;\n\t" \
 "movups %%xmm1,0x10(%0) ;\n\t" \
 "movups %%xmm2,0x20(%0) ;\n\t" \
@@ -543,10 +544,11 @@ static struct xor_block_template xor_block_p5_mmx = {
 : \
 : "r" (xmm_save) \
 : "memory"); \
-} while(0)
+} while (0)

-#define XMMS_RESTORE do { \
-__asm__ __volatile__ ( \
+#define XMMS_RESTORE \
+do { \
+asm volatile( \
 "sfence ;\n\t" \
 "movups (%0),%%xmm0 ;\n\t" \
 "movups 0x10(%0),%%xmm1 ;\n\t" \
@@ -557,76 +559,76 @@ static struct xor_block_template xor_block_p5_mmx = {
 : "memory"); \
 write_cr0(cr0); \
 preempt_enable(); \
-} while(0)
+} while (0)

 #define ALIGN16 __attribute__((aligned(16)))

 #define OFFS(x) "16*("#x")"
 #define PF_OFFS(x) "256+16*("#x")"
 #define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n"
-#define LD(x,y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
-#define ST(x,y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
+#define LD(x, y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
+#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
 #define PF1(x) " prefetchnta "PF_OFFS(x)"(%2) ;\n"
 #define PF2(x) " prefetchnta "PF_OFFS(x)"(%3) ;\n"
 #define PF3(x) " prefetchnta "PF_OFFS(x)"(%4) ;\n"
 #define PF4(x) " prefetchnta "PF_OFFS(x)"(%5) ;\n"
 #define PF5(x) " prefetchnta "PF_OFFS(x)"(%6) ;\n"
-#define XO1(x,y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
-#define XO2(x,y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
-#define XO3(x,y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
-#define XO4(x,y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
-#define XO5(x,y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
+#define XO1(x, y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
+#define XO2(x, y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
+#define XO3(x, y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
+#define XO4(x, y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
+#define XO5(x, y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"


 static void
 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
 {
 unsigned long lines = bytes >> 8;
 char xmm_save[16*4] ALIGN16;
 int cr0;

 XMMS_SAVE;

-__asm__ __volatile__ (
+asm volatile(
 #undef BLOCK
 #define BLOCK(i) \
-LD(i,0) \
-LD(i+1,1) \
+LD(i, 0) \
+LD(i + 1, 1) \
 PF1(i) \
-PF1(i+2) \
-LD(i+2,2) \
-LD(i+3,3) \
-PF0(i+4) \
-PF0(i+6) \
-XO1(i,0) \
-XO1(i+1,1) \
-XO1(i+2,2) \
-XO1(i+3,3) \
-ST(i,0) \
-ST(i+1,1) \
-ST(i+2,2) \
-ST(i+3,3) \
+PF1(i + 2) \
+LD(i + 2, 2) \
+LD(i + 3, 3) \
+PF0(i + 4) \
+PF0(i + 6) \
+XO1(i, 0) \
+XO1(i + 1, 1) \
+XO1(i + 2, 2) \
+XO1(i + 3, 3) \
+ST(i, 0) \
+ST(i + 1, 1) \
+ST(i + 2, 2) \
+ST(i + 3, 3) \


 PF0(0)
 PF0(2)

 " .align 32 ;\n"
 " 1: ;\n"

 BLOCK(0)
 BLOCK(4)
 BLOCK(8)
 BLOCK(12)

 " addl $256, %1 ;\n"
 " addl $256, %2 ;\n"
 " decl %0 ;\n"
 " jnz 1b ;\n"
 : "+r" (lines),
 "+r" (p1), "+r" (p2)
 :
 : "memory");

 XMMS_RESTORE;
 }
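Loop accounting for these SSE routines, for anyone checking the constants: each BLOCK(i) moves four 16-byte xmm registers (64 bytes), and the loop body runs BLOCK(0) through BLOCK(12), so one iteration covers 256 bytes, which is where lines = bytes >> 8 comes from. A quick standalone self-check (the buffer size is an example assumption):

#include <assert.h>

int main(void)
{
        unsigned long bytes = 4096;
        unsigned long lines = bytes >> 8;

        /* 4 BLOCKs x 4 registers x 16 bytes = 256 bytes per iteration */
        assert(lines * 4 * 4 * 16 == bytes);
        return 0;
}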
@@ -635,59 +637,59 @@ static void
 xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 unsigned long *p3)
 {
 unsigned long lines = bytes >> 8;
 char xmm_save[16*4] ALIGN16;
 int cr0;

 XMMS_SAVE;

-__asm__ __volatile__ (
+asm volatile(
 #undef BLOCK
 #define BLOCK(i) \
 PF1(i) \
-PF1(i+2) \
+PF1(i + 2) \
 LD(i,0) \
-LD(i+1,1) \
-LD(i+2,2) \
-LD(i+3,3) \
+LD(i + 1, 1) \
+LD(i + 2, 2) \
+LD(i + 3, 3) \
 PF2(i) \
-PF2(i+2) \
-PF0(i+4) \
-PF0(i+6) \
+PF2(i + 2) \
+PF0(i + 4) \
+PF0(i + 6) \
 XO1(i,0) \
-XO1(i+1,1) \
-XO1(i+2,2) \
-XO1(i+3,3) \
+XO1(i + 1, 1) \
+XO1(i + 2, 2) \
+XO1(i + 3, 3) \
 XO2(i,0) \
-XO2(i+1,1) \
-XO2(i+2,2) \
-XO2(i+3,3) \
+XO2(i + 1, 1) \
+XO2(i + 2, 2) \
+XO2(i + 3, 3) \
 ST(i,0) \
-ST(i+1,1) \
-ST(i+2,2) \
-ST(i+3,3) \
+ST(i + 1, 1) \
+ST(i + 2, 2) \
+ST(i + 3, 3) \


 PF0(0)
 PF0(2)

 " .align 32 ;\n"
 " 1: ;\n"

 BLOCK(0)
 BLOCK(4)
 BLOCK(8)
 BLOCK(12)

 " addl $256, %1 ;\n"
 " addl $256, %2 ;\n"
 " addl $256, %3 ;\n"
 " decl %0 ;\n"
 " jnz 1b ;\n"
 : "+r" (lines),
 "+r" (p1), "+r"(p2), "+r"(p3)
 :
 : "memory" );

 XMMS_RESTORE;
 }
@@ -696,66 +698,66 @@ static void
 xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 unsigned long *p3, unsigned long *p4)
 {
 unsigned long lines = bytes >> 8;
 char xmm_save[16*4] ALIGN16;
 int cr0;

 XMMS_SAVE;

-__asm__ __volatile__ (
+asm volatile(
 #undef BLOCK
 #define BLOCK(i) \
 PF1(i) \
-PF1(i+2) \
+PF1(i + 2) \
 LD(i,0) \
-LD(i+1,1) \
-LD(i+2,2) \
-LD(i+3,3) \
+LD(i + 1, 1) \
+LD(i + 2, 2) \
+LD(i + 3, 3) \
 PF2(i) \
-PF2(i+2) \
+PF2(i + 2) \
 XO1(i,0) \
-XO1(i+1,1) \
-XO1(i+2,2) \
-XO1(i+3,3) \
+XO1(i + 1, 1) \
+XO1(i + 2, 2) \
+XO1(i + 3, 3) \
 PF3(i) \
-PF3(i+2) \
-PF0(i+4) \
-PF0(i+6) \
+PF3(i + 2) \
+PF0(i + 4) \
+PF0(i + 6) \
 XO2(i,0) \
-XO2(i+1,1) \
-XO2(i+2,2) \
-XO2(i+3,3) \
+XO2(i + 1, 1) \
+XO2(i + 2, 2) \
+XO2(i + 3, 3) \
 XO3(i,0) \
-XO3(i+1,1) \
-XO3(i+2,2) \
-XO3(i+3,3) \
+XO3(i + 1, 1) \
+XO3(i + 2, 2) \
+XO3(i + 3, 3) \
 ST(i,0) \
-ST(i+1,1) \
-ST(i+2,2) \
-ST(i+3,3) \
+ST(i + 1, 1) \
+ST(i + 2, 2) \
+ST(i + 3, 3) \


 PF0(0)
 PF0(2)

 " .align 32 ;\n"
 " 1: ;\n"

 BLOCK(0)
 BLOCK(4)
 BLOCK(8)
 BLOCK(12)

 " addl $256, %1 ;\n"
 " addl $256, %2 ;\n"
 " addl $256, %3 ;\n"
 " addl $256, %4 ;\n"
 " decl %0 ;\n"
 " jnz 1b ;\n"
 : "+r" (lines),
 "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
 :
 : "memory" );

 XMMS_RESTORE;
 }
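The PF0..PF5 macros used throughout these routines issue prefetchnta at PF_OFFS(x) = 256+16*(x), i.e. one full loop iteration ahead of the current position. A portable sketch of the same idea using the GCC builtin (a simplification, not the kernel's code):

#include <stddef.h>

static void xor_2_prefetching(unsigned long *p1, const unsigned long *p2,
                              unsigned long lines)
{
        const size_t stride = 256 / sizeof(unsigned long);
        size_t i;

        while (lines--) {
                /* hint the next 256-byte chunk, low temporal locality */
                __builtin_prefetch((const char *)p1 + 256, 1, 0);
                __builtin_prefetch((const char *)p2 + 256, 0, 0);
                for (i = 0; i < stride; i++)
                        p1[i] ^= p2[i];
                p1 += stride;
                p2 += stride;
        }
}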
@@ -764,7 +766,7 @@ static void
 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 unsigned long *p3, unsigned long *p4, unsigned long *p5)
 {
-unsigned long lines = bytes >> 8;
+unsigned long lines = bytes >> 8;
 char xmm_save[16*4] ALIGN16;
 int cr0;

@@ -776,65 +778,65 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 because we modify p4 and p5 there, but we can't mark them
 as read/write, otherwise we'd overflow the 10-asm-operands
 limit of GCC < 3.1. */
-__asm__ ("" : "+r" (p4), "+r" (p5));
+asm("" : "+r" (p4), "+r" (p5));

-__asm__ __volatile__ (
+asm volatile(
 #undef BLOCK
 #define BLOCK(i) \
 PF1(i) \
-PF1(i+2) \
+PF1(i + 2) \
 LD(i,0) \
-LD(i+1,1) \
-LD(i+2,2) \
-LD(i+3,3) \
+LD(i + 1, 1) \
+LD(i + 2, 2) \
+LD(i + 3, 3) \
 PF2(i) \
-PF2(i+2) \
+PF2(i + 2) \
 XO1(i,0) \
-XO1(i+1,1) \
-XO1(i+2,2) \
-XO1(i+3,3) \
+XO1(i + 1, 1) \
+XO1(i + 2, 2) \
+XO1(i + 3, 3) \
 PF3(i) \
-PF3(i+2) \
+PF3(i + 2) \
 XO2(i,0) \
-XO2(i+1,1) \
-XO2(i+2,2) \
-XO2(i+3,3) \
+XO2(i + 1, 1) \
+XO2(i + 2, 2) \
+XO2(i + 3, 3) \
 PF4(i) \
-PF4(i+2) \
-PF0(i+4) \
-PF0(i+6) \
+PF4(i + 2) \
+PF0(i + 4) \
+PF0(i + 6) \
 XO3(i,0) \
-XO3(i+1,1) \
-XO3(i+2,2) \
-XO3(i+3,3) \
+XO3(i + 1, 1) \
+XO3(i + 2, 2) \
+XO3(i + 3, 3) \
 XO4(i,0) \
-XO4(i+1,1) \
-XO4(i+2,2) \
-XO4(i+3,3) \
+XO4(i + 1, 1) \
+XO4(i + 2, 2) \
+XO4(i + 3, 3) \
 ST(i,0) \
-ST(i+1,1) \
-ST(i+2,2) \
-ST(i+3,3) \
+ST(i + 1, 1) \
+ST(i + 2, 2) \
+ST(i + 3, 3) \


 PF0(0)
 PF0(2)

 " .align 32 ;\n"
 " 1: ;\n"

 BLOCK(0)
 BLOCK(4)
 BLOCK(8)
 BLOCK(12)

 " addl $256, %1 ;\n"
 " addl $256, %2 ;\n"
 " addl $256, %3 ;\n"
 " addl $256, %4 ;\n"
 " addl $256, %5 ;\n"
 " decl %0 ;\n"
 " jnz 1b ;\n"
 : "+r" (lines),
 "+r" (p1), "+r" (p2), "+r" (p3)
 : "r" (p4), "r" (p5)
@@ -843,17 +845,17 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 /* p4 and p5 were modified, and now the variables are dead.
 Clobber them just to be sure nobody does something stupid
 like assuming they have some legal value. */
-__asm__ ("" : "=r" (p4), "=r" (p5));
+asm("" : "=r" (p4), "=r" (p5));

 XMMS_RESTORE;
 }

 static struct xor_block_template xor_block_pIII_sse = {
 .name = "pIII_sse",
 .do_2 = xor_sse_2,
 .do_3 = xor_sse_3,
 .do_4 = xor_sse_4,
 .do_5 = xor_sse_5,
 };

 /* Also try the generic routines. */
@@ -861,21 +863,21 @@ static struct xor_block_template xor_block_pIII_sse = {

 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES \
-        do { \
-                xor_speed(&xor_block_8regs); \
-                xor_speed(&xor_block_8regs_p); \
-                xor_speed(&xor_block_32regs); \
-                xor_speed(&xor_block_32regs_p); \
-                if (cpu_has_xmm) \
-                        xor_speed(&xor_block_pIII_sse); \
-                if (cpu_has_mmx) { \
-                        xor_speed(&xor_block_pII_mmx); \
-                        xor_speed(&xor_block_p5_mmx); \
-                } \
-        } while (0)
+do { \
+        xor_speed(&xor_block_8regs); \
+        xor_speed(&xor_block_8regs_p); \
+        xor_speed(&xor_block_32regs); \
+        xor_speed(&xor_block_32regs_p); \
+        if (cpu_has_xmm) \
+                xor_speed(&xor_block_pIII_sse); \
+        if (cpu_has_mmx) { \
+                xor_speed(&xor_block_pII_mmx); \
+                xor_speed(&xor_block_p5_mmx); \
+        } \
+} while (0)

 /* We force the use of the SSE xor block because it can write around L2.
 We may also be able to load into the L1 only depending on how the cpu
 deals with a load to a line that is being prefetched. */
 #define XOR_SELECT_TEMPLATE(FASTEST) \
 (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
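The net effect of XOR_SELECT_TEMPLATE, restated as a function for clarity (a sketch; xor_block_template and xor_speed come from the kernel's RAID xor benchmarking code, simplified here): even if the XOR_TRY_TEMPLATES benchmark loop finds a faster template, SSE wins whenever it is available, for the cache reasons the comment gives.

struct xor_block_template; /* as declared by the kernel's raid xor code */
extern struct xor_block_template xor_block_pIII_sse;

static struct xor_block_template *
xor_select(struct xor_block_template *fastest, int has_xmm)
{
        /* Mirror of XOR_SELECT_TEMPLATE(FASTEST) */
        return has_xmm ? &xor_block_pIII_sse : fastest;
}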