aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/asm-x86/xor_32.h494
1 files changed, 248 insertions, 246 deletions
diff --git a/include/asm-x86/xor_32.h b/include/asm-x86/xor_32.h
index a41ef1bdd424..067b5c1835a3 100644
--- a/include/asm-x86/xor_32.h
+++ b/include/asm-x86/xor_32.h
@@ -16,12 +16,12 @@
16 * Copyright (C) 1998 Ingo Molnar. 16 * Copyright (C) 1998 Ingo Molnar.
17 */ 17 */
18 18
19#define LD(x,y) " movq 8*("#x")(%1), %%mm"#y" ;\n" 19#define LD(x, y) " movq 8*("#x")(%1), %%mm"#y" ;\n"
20#define ST(x,y) " movq %%mm"#y", 8*("#x")(%1) ;\n" 20#define ST(x, y) " movq %%mm"#y", 8*("#x")(%1) ;\n"
21#define XO1(x,y) " pxor 8*("#x")(%2), %%mm"#y" ;\n" 21#define XO1(x, y) " pxor 8*("#x")(%2), %%mm"#y" ;\n"
22#define XO2(x,y) " pxor 8*("#x")(%3), %%mm"#y" ;\n" 22#define XO2(x, y) " pxor 8*("#x")(%3), %%mm"#y" ;\n"
23#define XO3(x,y) " pxor 8*("#x")(%4), %%mm"#y" ;\n" 23#define XO3(x, y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"
24#define XO4(x,y) " pxor 8*("#x")(%5), %%mm"#y" ;\n" 24#define XO4(x, y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"
25 25
26#include <asm/i387.h> 26#include <asm/i387.h>
27 27
@@ -32,24 +32,24 @@ xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
32 32
33 kernel_fpu_begin(); 33 kernel_fpu_begin();
34 34
35 __asm__ __volatile__ ( 35 asm volatile(
36#undef BLOCK 36#undef BLOCK
37#define BLOCK(i) \ 37#define BLOCK(i) \
38 LD(i,0) \ 38 LD(i, 0) \
39 LD(i+1,1) \ 39 LD(i + 1, 1) \
40 LD(i+2,2) \ 40 LD(i + 2, 2) \
41 LD(i+3,3) \ 41 LD(i + 3, 3) \
42 XO1(i,0) \ 42 XO1(i, 0) \
43 ST(i,0) \ 43 ST(i, 0) \
44 XO1(i+1,1) \ 44 XO1(i+1, 1) \
45 ST(i+1,1) \ 45 ST(i+1, 1) \
46 XO1(i+2,2) \ 46 XO1(i + 2, 2) \
47 ST(i+2,2) \ 47 ST(i + 2, 2) \
48 XO1(i+3,3) \ 48 XO1(i + 3, 3) \
49 ST(i+3,3) 49 ST(i + 3, 3)
50 50
51 " .align 32 ;\n" 51 " .align 32 ;\n"
52 " 1: ;\n" 52 " 1: ;\n"
53 53
54 BLOCK(0) 54 BLOCK(0)
55 BLOCK(4) 55 BLOCK(4)
@@ -76,25 +76,25 @@ xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
76 76
77 kernel_fpu_begin(); 77 kernel_fpu_begin();
78 78
79 __asm__ __volatile__ ( 79 asm volatile(
80#undef BLOCK 80#undef BLOCK
81#define BLOCK(i) \ 81#define BLOCK(i) \
82 LD(i,0) \ 82 LD(i, 0) \
83 LD(i+1,1) \ 83 LD(i + 1, 1) \
84 LD(i+2,2) \ 84 LD(i + 2, 2) \
85 LD(i+3,3) \ 85 LD(i + 3, 3) \
86 XO1(i,0) \ 86 XO1(i, 0) \
87 XO1(i+1,1) \ 87 XO1(i + 1, 1) \
88 XO1(i+2,2) \ 88 XO1(i + 2, 2) \
89 XO1(i+3,3) \ 89 XO1(i + 3, 3) \
90 XO2(i,0) \ 90 XO2(i, 0) \
91 ST(i,0) \ 91 ST(i, 0) \
92 XO2(i+1,1) \ 92 XO2(i + 1, 1) \
93 ST(i+1,1) \ 93 ST(i + 1, 1) \
94 XO2(i+2,2) \ 94 XO2(i + 2, 2) \
95 ST(i+2,2) \ 95 ST(i + 2, 2) \
96 XO2(i+3,3) \ 96 XO2(i + 3, 3) \
97 ST(i+3,3) 97 ST(i + 3, 3)
98 98
99 " .align 32 ;\n" 99 " .align 32 ;\n"
100 " 1: ;\n" 100 " 1: ;\n"
@@ -125,29 +125,29 @@ xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
125 125
126 kernel_fpu_begin(); 126 kernel_fpu_begin();
127 127
128 __asm__ __volatile__ ( 128 asm volatile(
129#undef BLOCK 129#undef BLOCK
130#define BLOCK(i) \ 130#define BLOCK(i) \
131 LD(i,0) \ 131 LD(i, 0) \
132 LD(i+1,1) \ 132 LD(i + 1, 1) \
133 LD(i+2,2) \ 133 LD(i + 2, 2) \
134 LD(i+3,3) \ 134 LD(i + 3, 3) \
135 XO1(i,0) \ 135 XO1(i, 0) \
136 XO1(i+1,1) \ 136 XO1(i + 1, 1) \
137 XO1(i+2,2) \ 137 XO1(i + 2, 2) \
138 XO1(i+3,3) \ 138 XO1(i + 3, 3) \
139 XO2(i,0) \ 139 XO2(i, 0) \
140 XO2(i+1,1) \ 140 XO2(i + 1, 1) \
141 XO2(i+2,2) \ 141 XO2(i + 2, 2) \
142 XO2(i+3,3) \ 142 XO2(i + 3, 3) \
143 XO3(i,0) \ 143 XO3(i, 0) \
144 ST(i,0) \ 144 ST(i, 0) \
145 XO3(i+1,1) \ 145 XO3(i + 1, 1) \
146 ST(i+1,1) \ 146 ST(i + 1, 1) \
147 XO3(i+2,2) \ 147 XO3(i + 2, 2) \
148 ST(i+2,2) \ 148 ST(i + 2, 2) \
149 XO3(i+3,3) \ 149 XO3(i + 3, 3) \
150 ST(i+3,3) 150 ST(i + 3, 3)
151 151
152 " .align 32 ;\n" 152 " .align 32 ;\n"
153 " 1: ;\n" 153 " 1: ;\n"
@@ -186,35 +186,35 @@ xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
186 because we modify p4 and p5 there, but we can't mark them 186 because we modify p4 and p5 there, but we can't mark them
187 as read/write, otherwise we'd overflow the 10-asm-operands 187 as read/write, otherwise we'd overflow the 10-asm-operands
188 limit of GCC < 3.1. */ 188 limit of GCC < 3.1. */
189 __asm__ ("" : "+r" (p4), "+r" (p5)); 189 asm("" : "+r" (p4), "+r" (p5));
190 190
191 __asm__ __volatile__ ( 191 asm volatile(
192#undef BLOCK 192#undef BLOCK
193#define BLOCK(i) \ 193#define BLOCK(i) \
194 LD(i,0) \ 194 LD(i, 0) \
195 LD(i+1,1) \ 195 LD(i + 1, 1) \
196 LD(i+2,2) \ 196 LD(i + 2, 2) \
197 LD(i+3,3) \ 197 LD(i + 3, 3) \
198 XO1(i,0) \ 198 XO1(i, 0) \
199 XO1(i+1,1) \ 199 XO1(i + 1, 1) \
200 XO1(i+2,2) \ 200 XO1(i + 2, 2) \
201 XO1(i+3,3) \ 201 XO1(i + 3, 3) \
202 XO2(i,0) \ 202 XO2(i, 0) \
203 XO2(i+1,1) \ 203 XO2(i + 1, 1) \
204 XO2(i+2,2) \ 204 XO2(i + 2, 2) \
205 XO2(i+3,3) \ 205 XO2(i + 3, 3) \
206 XO3(i,0) \ 206 XO3(i, 0) \
207 XO3(i+1,1) \ 207 XO3(i + 1, 1) \
208 XO3(i+2,2) \ 208 XO3(i + 2, 2) \
209 XO3(i+3,3) \ 209 XO3(i + 3, 3) \
210 XO4(i,0) \ 210 XO4(i, 0) \
211 ST(i,0) \ 211 ST(i, 0) \
212 XO4(i+1,1) \ 212 XO4(i + 1, 1) \
213 ST(i+1,1) \ 213 ST(i + 1, 1) \
214 XO4(i+2,2) \ 214 XO4(i + 2, 2) \
215 ST(i+2,2) \ 215 ST(i + 2, 2) \
216 XO4(i+3,3) \ 216 XO4(i + 3, 3) \
217 ST(i+3,3) 217 ST(i + 3, 3)
218 218
219 " .align 32 ;\n" 219 " .align 32 ;\n"
220 " 1: ;\n" 220 " 1: ;\n"
@@ -233,13 +233,13 @@ xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
233 " jnz 1b ;\n" 233 " jnz 1b ;\n"
234 : "+r" (lines), 234 : "+r" (lines),
235 "+r" (p1), "+r" (p2), "+r" (p3) 235 "+r" (p1), "+r" (p2), "+r" (p3)
236 : "r" (p4), "r" (p5) 236 : "r" (p4), "r" (p5)
237 : "memory"); 237 : "memory");
238 238
239 /* p4 and p5 were modified, and now the variables are dead. 239 /* p4 and p5 were modified, and now the variables are dead.
240 Clobber them just to be sure nobody does something stupid 240 Clobber them just to be sure nobody does something stupid
241 like assuming they have some legal value. */ 241 like assuming they have some legal value. */
242 __asm__ ("" : "=r" (p4), "=r" (p5)); 242 asm("" : "=r" (p4), "=r" (p5));
243 243
244 kernel_fpu_end(); 244 kernel_fpu_end();
245} 245}
@@ -259,7 +259,7 @@ xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
259 259
260 kernel_fpu_begin(); 260 kernel_fpu_begin();
261 261
262 __asm__ __volatile__ ( 262 asm volatile(
263 " .align 32 ;\n" 263 " .align 32 ;\n"
264 " 1: ;\n" 264 " 1: ;\n"
265 " movq (%1), %%mm0 ;\n" 265 " movq (%1), %%mm0 ;\n"
@@ -286,7 +286,7 @@ xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
286 " pxor 56(%2), %%mm7 ;\n" 286 " pxor 56(%2), %%mm7 ;\n"
287 " movq %%mm6, 48(%1) ;\n" 287 " movq %%mm6, 48(%1) ;\n"
288 " movq %%mm7, 56(%1) ;\n" 288 " movq %%mm7, 56(%1) ;\n"
289 289
290 " addl $64, %1 ;\n" 290 " addl $64, %1 ;\n"
291 " addl $64, %2 ;\n" 291 " addl $64, %2 ;\n"
292 " decl %0 ;\n" 292 " decl %0 ;\n"
@@ -307,7 +307,7 @@ xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
307 307
308 kernel_fpu_begin(); 308 kernel_fpu_begin();
309 309
310 __asm__ __volatile__ ( 310 asm volatile(
311 " .align 32,0x90 ;\n" 311 " .align 32,0x90 ;\n"
312 " 1: ;\n" 312 " 1: ;\n"
313 " movq (%1), %%mm0 ;\n" 313 " movq (%1), %%mm0 ;\n"
@@ -342,7 +342,7 @@ xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
342 " pxor 56(%3), %%mm7 ;\n" 342 " pxor 56(%3), %%mm7 ;\n"
343 " movq %%mm6, 48(%1) ;\n" 343 " movq %%mm6, 48(%1) ;\n"
344 " movq %%mm7, 56(%1) ;\n" 344 " movq %%mm7, 56(%1) ;\n"
345 345
346 " addl $64, %1 ;\n" 346 " addl $64, %1 ;\n"
347 " addl $64, %2 ;\n" 347 " addl $64, %2 ;\n"
348 " addl $64, %3 ;\n" 348 " addl $64, %3 ;\n"
@@ -364,7 +364,7 @@ xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
364 364
365 kernel_fpu_begin(); 365 kernel_fpu_begin();
366 366
367 __asm__ __volatile__ ( 367 asm volatile(
368 " .align 32,0x90 ;\n" 368 " .align 32,0x90 ;\n"
369 " 1: ;\n" 369 " 1: ;\n"
370 " movq (%1), %%mm0 ;\n" 370 " movq (%1), %%mm0 ;\n"
@@ -407,7 +407,7 @@ xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
407 " pxor 56(%4), %%mm7 ;\n" 407 " pxor 56(%4), %%mm7 ;\n"
408 " movq %%mm6, 48(%1) ;\n" 408 " movq %%mm6, 48(%1) ;\n"
409 " movq %%mm7, 56(%1) ;\n" 409 " movq %%mm7, 56(%1) ;\n"
410 410
411 " addl $64, %1 ;\n" 411 " addl $64, %1 ;\n"
412 " addl $64, %2 ;\n" 412 " addl $64, %2 ;\n"
413 " addl $64, %3 ;\n" 413 " addl $64, %3 ;\n"
@@ -436,9 +436,9 @@ xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
436 because we modify p4 and p5 there, but we can't mark them 436 because we modify p4 and p5 there, but we can't mark them
437 as read/write, otherwise we'd overflow the 10-asm-operands 437 as read/write, otherwise we'd overflow the 10-asm-operands
438 limit of GCC < 3.1. */ 438 limit of GCC < 3.1. */
439 __asm__ ("" : "+r" (p4), "+r" (p5)); 439 asm("" : "+r" (p4), "+r" (p5));
440 440
441 __asm__ __volatile__ ( 441 asm volatile(
442 " .align 32,0x90 ;\n" 442 " .align 32,0x90 ;\n"
443 " 1: ;\n" 443 " 1: ;\n"
444 " movq (%1), %%mm0 ;\n" 444 " movq (%1), %%mm0 ;\n"
@@ -489,7 +489,7 @@ xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
489 " pxor 56(%5), %%mm7 ;\n" 489 " pxor 56(%5), %%mm7 ;\n"
490 " movq %%mm6, 48(%1) ;\n" 490 " movq %%mm6, 48(%1) ;\n"
491 " movq %%mm7, 56(%1) ;\n" 491 " movq %%mm7, 56(%1) ;\n"
492 492
493 " addl $64, %1 ;\n" 493 " addl $64, %1 ;\n"
494 " addl $64, %2 ;\n" 494 " addl $64, %2 ;\n"
495 " addl $64, %3 ;\n" 495 " addl $64, %3 ;\n"
@@ -505,7 +505,7 @@ xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
505 /* p4 and p5 were modified, and now the variables are dead. 505 /* p4 and p5 were modified, and now the variables are dead.
506 Clobber them just to be sure nobody does something stupid 506 Clobber them just to be sure nobody does something stupid
507 like assuming they have some legal value. */ 507 like assuming they have some legal value. */
508 __asm__ ("" : "=r" (p4), "=r" (p5)); 508 asm("" : "=r" (p4), "=r" (p5));
509 509
510 kernel_fpu_end(); 510 kernel_fpu_end();
511} 511}
@@ -531,11 +531,12 @@ static struct xor_block_template xor_block_p5_mmx = {
531 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) 531 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
532 */ 532 */
533 533
534#define XMMS_SAVE do { \ 534#define XMMS_SAVE \
535do { \
535 preempt_disable(); \ 536 preempt_disable(); \
536 cr0 = read_cr0(); \ 537 cr0 = read_cr0(); \
537 clts(); \ 538 clts(); \
538 __asm__ __volatile__ ( \ 539 asm volatile( \
539 "movups %%xmm0,(%0) ;\n\t" \ 540 "movups %%xmm0,(%0) ;\n\t" \
540 "movups %%xmm1,0x10(%0) ;\n\t" \ 541 "movups %%xmm1,0x10(%0) ;\n\t" \
541 "movups %%xmm2,0x20(%0) ;\n\t" \ 542 "movups %%xmm2,0x20(%0) ;\n\t" \
@@ -543,10 +544,11 @@ static struct xor_block_template xor_block_p5_mmx = {
543 : \ 544 : \
544 : "r" (xmm_save) \ 545 : "r" (xmm_save) \
545 : "memory"); \ 546 : "memory"); \
546} while(0) 547} while (0)
547 548
548#define XMMS_RESTORE do { \ 549#define XMMS_RESTORE \
549 __asm__ __volatile__ ( \ 550do { \
551 asm volatile( \
550 "sfence ;\n\t" \ 552 "sfence ;\n\t" \
551 "movups (%0),%%xmm0 ;\n\t" \ 553 "movups (%0),%%xmm0 ;\n\t" \
552 "movups 0x10(%0),%%xmm1 ;\n\t" \ 554 "movups 0x10(%0),%%xmm1 ;\n\t" \
@@ -557,76 +559,76 @@ static struct xor_block_template xor_block_p5_mmx = {
557 : "memory"); \ 559 : "memory"); \
558 write_cr0(cr0); \ 560 write_cr0(cr0); \
559 preempt_enable(); \ 561 preempt_enable(); \
560} while(0) 562} while (0)
561 563
562#define ALIGN16 __attribute__((aligned(16))) 564#define ALIGN16 __attribute__((aligned(16)))
563 565
564#define OFFS(x) "16*("#x")" 566#define OFFS(x) "16*("#x")"
565#define PF_OFFS(x) "256+16*("#x")" 567#define PF_OFFS(x) "256+16*("#x")"
566#define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n" 568#define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n"
567#define LD(x,y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n" 569#define LD(x, y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
568#define ST(x,y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n" 570#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
569#define PF1(x) " prefetchnta "PF_OFFS(x)"(%2) ;\n" 571#define PF1(x) " prefetchnta "PF_OFFS(x)"(%2) ;\n"
570#define PF2(x) " prefetchnta "PF_OFFS(x)"(%3) ;\n" 572#define PF2(x) " prefetchnta "PF_OFFS(x)"(%3) ;\n"
571#define PF3(x) " prefetchnta "PF_OFFS(x)"(%4) ;\n" 573#define PF3(x) " prefetchnta "PF_OFFS(x)"(%4) ;\n"
572#define PF4(x) " prefetchnta "PF_OFFS(x)"(%5) ;\n" 574#define PF4(x) " prefetchnta "PF_OFFS(x)"(%5) ;\n"
573#define PF5(x) " prefetchnta "PF_OFFS(x)"(%6) ;\n" 575#define PF5(x) " prefetchnta "PF_OFFS(x)"(%6) ;\n"
574#define XO1(x,y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n" 576#define XO1(x, y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
575#define XO2(x,y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n" 577#define XO2(x, y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
576#define XO3(x,y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n" 578#define XO3(x, y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
577#define XO4(x,y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n" 579#define XO4(x, y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
578#define XO5(x,y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n" 580#define XO5(x, y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
579 581
580 582
581static void 583static void
582xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) 584xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
583{ 585{
584 unsigned long lines = bytes >> 8; 586 unsigned long lines = bytes >> 8;
585 char xmm_save[16*4] ALIGN16; 587 char xmm_save[16*4] ALIGN16;
586 int cr0; 588 int cr0;
587 589
588 XMMS_SAVE; 590 XMMS_SAVE;
589 591
590 __asm__ __volatile__ ( 592 asm volatile(
591#undef BLOCK 593#undef BLOCK
592#define BLOCK(i) \ 594#define BLOCK(i) \
593 LD(i,0) \ 595 LD(i, 0) \
594 LD(i+1,1) \ 596 LD(i + 1, 1) \
595 PF1(i) \ 597 PF1(i) \
596 PF1(i+2) \ 598 PF1(i + 2) \
597 LD(i+2,2) \ 599 LD(i + 2, 2) \
598 LD(i+3,3) \ 600 LD(i + 3, 3) \
599 PF0(i+4) \ 601 PF0(i + 4) \
600 PF0(i+6) \ 602 PF0(i + 6) \
601 XO1(i,0) \ 603 XO1(i, 0) \
602 XO1(i+1,1) \ 604 XO1(i + 1, 1) \
603 XO1(i+2,2) \ 605 XO1(i + 2, 2) \
604 XO1(i+3,3) \ 606 XO1(i + 3, 3) \
605 ST(i,0) \ 607 ST(i, 0) \
606 ST(i+1,1) \ 608 ST(i + 1, 1) \
607 ST(i+2,2) \ 609 ST(i + 2, 2) \
608 ST(i+3,3) \ 610 ST(i + 3, 3) \
609 611
610 612
611 PF0(0) 613 PF0(0)
612 PF0(2) 614 PF0(2)
613 615
614 " .align 32 ;\n" 616 " .align 32 ;\n"
615 " 1: ;\n" 617 " 1: ;\n"
616 618
617 BLOCK(0) 619 BLOCK(0)
618 BLOCK(4) 620 BLOCK(4)
619 BLOCK(8) 621 BLOCK(8)
620 BLOCK(12) 622 BLOCK(12)
621 623
622 " addl $256, %1 ;\n" 624 " addl $256, %1 ;\n"
623 " addl $256, %2 ;\n" 625 " addl $256, %2 ;\n"
624 " decl %0 ;\n" 626 " decl %0 ;\n"
625 " jnz 1b ;\n" 627 " jnz 1b ;\n"
626 : "+r" (lines), 628 : "+r" (lines),
627 "+r" (p1), "+r" (p2) 629 "+r" (p1), "+r" (p2)
628 : 630 :
629 : "memory"); 631 : "memory");
630 632
631 XMMS_RESTORE; 633 XMMS_RESTORE;
632} 634}
@@ -635,59 +637,59 @@ static void
635xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, 637xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
636 unsigned long *p3) 638 unsigned long *p3)
637{ 639{
638 unsigned long lines = bytes >> 8; 640 unsigned long lines = bytes >> 8;
639 char xmm_save[16*4] ALIGN16; 641 char xmm_save[16*4] ALIGN16;
640 int cr0; 642 int cr0;
641 643
642 XMMS_SAVE; 644 XMMS_SAVE;
643 645
644 __asm__ __volatile__ ( 646 asm volatile(
645#undef BLOCK 647#undef BLOCK
646#define BLOCK(i) \ 648#define BLOCK(i) \
647 PF1(i) \ 649 PF1(i) \
648 PF1(i+2) \ 650 PF1(i + 2) \
649 LD(i,0) \ 651 LD(i,0) \
650 LD(i+1,1) \ 652 LD(i + 1, 1) \
651 LD(i+2,2) \ 653 LD(i + 2, 2) \
652 LD(i+3,3) \ 654 LD(i + 3, 3) \
653 PF2(i) \ 655 PF2(i) \
654 PF2(i+2) \ 656 PF2(i + 2) \
655 PF0(i+4) \ 657 PF0(i + 4) \
656 PF0(i+6) \ 658 PF0(i + 6) \
657 XO1(i,0) \ 659 XO1(i,0) \
658 XO1(i+1,1) \ 660 XO1(i + 1, 1) \
659 XO1(i+2,2) \ 661 XO1(i + 2, 2) \
660 XO1(i+3,3) \ 662 XO1(i + 3, 3) \
661 XO2(i,0) \ 663 XO2(i,0) \
662 XO2(i+1,1) \ 664 XO2(i + 1, 1) \
663 XO2(i+2,2) \ 665 XO2(i + 2, 2) \
664 XO2(i+3,3) \ 666 XO2(i + 3, 3) \
665 ST(i,0) \ 667 ST(i,0) \
666 ST(i+1,1) \ 668 ST(i + 1, 1) \
667 ST(i+2,2) \ 669 ST(i + 2, 2) \
668 ST(i+3,3) \ 670 ST(i + 3, 3) \
669 671
670 672
671 PF0(0) 673 PF0(0)
672 PF0(2) 674 PF0(2)
673 675
674 " .align 32 ;\n" 676 " .align 32 ;\n"
675 " 1: ;\n" 677 " 1: ;\n"
676 678
677 BLOCK(0) 679 BLOCK(0)
678 BLOCK(4) 680 BLOCK(4)
679 BLOCK(8) 681 BLOCK(8)
680 BLOCK(12) 682 BLOCK(12)
681 683
682 " addl $256, %1 ;\n" 684 " addl $256, %1 ;\n"
683 " addl $256, %2 ;\n" 685 " addl $256, %2 ;\n"
684 " addl $256, %3 ;\n" 686 " addl $256, %3 ;\n"
685 " decl %0 ;\n" 687 " decl %0 ;\n"
686 " jnz 1b ;\n" 688 " jnz 1b ;\n"
687 : "+r" (lines), 689 : "+r" (lines),
688 "+r" (p1), "+r"(p2), "+r"(p3) 690 "+r" (p1), "+r"(p2), "+r"(p3)
689 : 691 :
690 : "memory" ); 692 : "memory" );
691 693
692 XMMS_RESTORE; 694 XMMS_RESTORE;
693} 695}
@@ -696,66 +698,66 @@ static void
696xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, 698xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
697 unsigned long *p3, unsigned long *p4) 699 unsigned long *p3, unsigned long *p4)
698{ 700{
699 unsigned long lines = bytes >> 8; 701 unsigned long lines = bytes >> 8;
700 char xmm_save[16*4] ALIGN16; 702 char xmm_save[16*4] ALIGN16;
701 int cr0; 703 int cr0;
702 704
703 XMMS_SAVE; 705 XMMS_SAVE;
704 706
705 __asm__ __volatile__ ( 707 asm volatile(
706#undef BLOCK 708#undef BLOCK
707#define BLOCK(i) \ 709#define BLOCK(i) \
708 PF1(i) \ 710 PF1(i) \
709 PF1(i+2) \ 711 PF1(i + 2) \
710 LD(i,0) \ 712 LD(i,0) \
711 LD(i+1,1) \ 713 LD(i + 1, 1) \
712 LD(i+2,2) \ 714 LD(i + 2, 2) \
713 LD(i+3,3) \ 715 LD(i + 3, 3) \
714 PF2(i) \ 716 PF2(i) \
715 PF2(i+2) \ 717 PF2(i + 2) \
716 XO1(i,0) \ 718 XO1(i,0) \
717 XO1(i+1,1) \ 719 XO1(i + 1, 1) \
718 XO1(i+2,2) \ 720 XO1(i + 2, 2) \
719 XO1(i+3,3) \ 721 XO1(i + 3, 3) \
720 PF3(i) \ 722 PF3(i) \
721 PF3(i+2) \ 723 PF3(i + 2) \
722 PF0(i+4) \ 724 PF0(i + 4) \
723 PF0(i+6) \ 725 PF0(i + 6) \
724 XO2(i,0) \ 726 XO2(i,0) \
725 XO2(i+1,1) \ 727 XO2(i + 1, 1) \
726 XO2(i+2,2) \ 728 XO2(i + 2, 2) \
727 XO2(i+3,3) \ 729 XO2(i + 3, 3) \
728 XO3(i,0) \ 730 XO3(i,0) \
729 XO3(i+1,1) \ 731 XO3(i + 1, 1) \
730 XO3(i+2,2) \ 732 XO3(i + 2, 2) \
731 XO3(i+3,3) \ 733 XO3(i + 3, 3) \
732 ST(i,0) \ 734 ST(i,0) \
733 ST(i+1,1) \ 735 ST(i + 1, 1) \
734 ST(i+2,2) \ 736 ST(i + 2, 2) \
735 ST(i+3,3) \ 737 ST(i + 3, 3) \
736 738
737 739
738 PF0(0) 740 PF0(0)
739 PF0(2) 741 PF0(2)
740 742
741 " .align 32 ;\n" 743 " .align 32 ;\n"
742 " 1: ;\n" 744 " 1: ;\n"
743 745
744 BLOCK(0) 746 BLOCK(0)
745 BLOCK(4) 747 BLOCK(4)
746 BLOCK(8) 748 BLOCK(8)
747 BLOCK(12) 749 BLOCK(12)
748 750
749 " addl $256, %1 ;\n" 751 " addl $256, %1 ;\n"
750 " addl $256, %2 ;\n" 752 " addl $256, %2 ;\n"
751 " addl $256, %3 ;\n" 753 " addl $256, %3 ;\n"
752 " addl $256, %4 ;\n" 754 " addl $256, %4 ;\n"
753 " decl %0 ;\n" 755 " decl %0 ;\n"
754 " jnz 1b ;\n" 756 " jnz 1b ;\n"
755 : "+r" (lines), 757 : "+r" (lines),
756 "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) 758 "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
757 : 759 :
758 : "memory" ); 760 : "memory" );
759 761
760 XMMS_RESTORE; 762 XMMS_RESTORE;
761} 763}
@@ -764,7 +766,7 @@ static void
764xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, 766xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
765 unsigned long *p3, unsigned long *p4, unsigned long *p5) 767 unsigned long *p3, unsigned long *p4, unsigned long *p5)
766{ 768{
767 unsigned long lines = bytes >> 8; 769 unsigned long lines = bytes >> 8;
768 char xmm_save[16*4] ALIGN16; 770 char xmm_save[16*4] ALIGN16;
769 int cr0; 771 int cr0;
770 772
@@ -776,65 +778,65 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
776 because we modify p4 and p5 there, but we can't mark them 778 because we modify p4 and p5 there, but we can't mark them
777 as read/write, otherwise we'd overflow the 10-asm-operands 779 as read/write, otherwise we'd overflow the 10-asm-operands
778 limit of GCC < 3.1. */ 780 limit of GCC < 3.1. */
779 __asm__ ("" : "+r" (p4), "+r" (p5)); 781 asm("" : "+r" (p4), "+r" (p5));
780 782
781 __asm__ __volatile__ ( 783 asm volatile(
782#undef BLOCK 784#undef BLOCK
783#define BLOCK(i) \ 785#define BLOCK(i) \
784 PF1(i) \ 786 PF1(i) \
785 PF1(i+2) \ 787 PF1(i + 2) \
786 LD(i,0) \ 788 LD(i,0) \
787 LD(i+1,1) \ 789 LD(i + 1, 1) \
788 LD(i+2,2) \ 790 LD(i + 2, 2) \
789 LD(i+3,3) \ 791 LD(i + 3, 3) \
790 PF2(i) \ 792 PF2(i) \
791 PF2(i+2) \ 793 PF2(i + 2) \
792 XO1(i,0) \ 794 XO1(i,0) \
793 XO1(i+1,1) \ 795 XO1(i + 1, 1) \
794 XO1(i+2,2) \ 796 XO1(i + 2, 2) \
795 XO1(i+3,3) \ 797 XO1(i + 3, 3) \
796 PF3(i) \ 798 PF3(i) \
797 PF3(i+2) \ 799 PF3(i + 2) \
798 XO2(i,0) \ 800 XO2(i,0) \
799 XO2(i+1,1) \ 801 XO2(i + 1, 1) \
800 XO2(i+2,2) \ 802 XO2(i + 2, 2) \
801 XO2(i+3,3) \ 803 XO2(i + 3, 3) \
802 PF4(i) \ 804 PF4(i) \
803 PF4(i+2) \ 805 PF4(i + 2) \
804 PF0(i+4) \ 806 PF0(i + 4) \
805 PF0(i+6) \ 807 PF0(i + 6) \
806 XO3(i,0) \ 808 XO3(i,0) \
807 XO3(i+1,1) \ 809 XO3(i + 1, 1) \
808 XO3(i+2,2) \ 810 XO3(i + 2, 2) \
809 XO3(i+3,3) \ 811 XO3(i + 3, 3) \
810 XO4(i,0) \ 812 XO4(i,0) \
811 XO4(i+1,1) \ 813 XO4(i + 1, 1) \
812 XO4(i+2,2) \ 814 XO4(i + 2, 2) \
813 XO4(i+3,3) \ 815 XO4(i + 3, 3) \
814 ST(i,0) \ 816 ST(i,0) \
815 ST(i+1,1) \ 817 ST(i + 1, 1) \
816 ST(i+2,2) \ 818 ST(i + 2, 2) \
817 ST(i+3,3) \ 819 ST(i + 3, 3) \
818 820
819 821
820 PF0(0) 822 PF0(0)
821 PF0(2) 823 PF0(2)
822 824
823 " .align 32 ;\n" 825 " .align 32 ;\n"
824 " 1: ;\n" 826 " 1: ;\n"
825 827
826 BLOCK(0) 828 BLOCK(0)
827 BLOCK(4) 829 BLOCK(4)
828 BLOCK(8) 830 BLOCK(8)
829 BLOCK(12) 831 BLOCK(12)
830 832
831 " addl $256, %1 ;\n" 833 " addl $256, %1 ;\n"
832 " addl $256, %2 ;\n" 834 " addl $256, %2 ;\n"
833 " addl $256, %3 ;\n" 835 " addl $256, %3 ;\n"
834 " addl $256, %4 ;\n" 836 " addl $256, %4 ;\n"
835 " addl $256, %5 ;\n" 837 " addl $256, %5 ;\n"
836 " decl %0 ;\n" 838 " decl %0 ;\n"
837 " jnz 1b ;\n" 839 " jnz 1b ;\n"
838 : "+r" (lines), 840 : "+r" (lines),
839 "+r" (p1), "+r" (p2), "+r" (p3) 841 "+r" (p1), "+r" (p2), "+r" (p3)
840 : "r" (p4), "r" (p5) 842 : "r" (p4), "r" (p5)
@@ -843,17 +845,17 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
843 /* p4 and p5 were modified, and now the variables are dead. 845 /* p4 and p5 were modified, and now the variables are dead.
844 Clobber them just to be sure nobody does something stupid 846 Clobber them just to be sure nobody does something stupid
845 like assuming they have some legal value. */ 847 like assuming they have some legal value. */
846 __asm__ ("" : "=r" (p4), "=r" (p5)); 848 asm("" : "=r" (p4), "=r" (p5));
847 849
848 XMMS_RESTORE; 850 XMMS_RESTORE;
849} 851}
850 852
851static struct xor_block_template xor_block_pIII_sse = { 853static struct xor_block_template xor_block_pIII_sse = {
852 .name = "pIII_sse", 854 .name = "pIII_sse",
853 .do_2 = xor_sse_2, 855 .do_2 = xor_sse_2,
854 .do_3 = xor_sse_3, 856 .do_3 = xor_sse_3,
855 .do_4 = xor_sse_4, 857 .do_4 = xor_sse_4,
856 .do_5 = xor_sse_5, 858 .do_5 = xor_sse_5,
857}; 859};
858 860
859/* Also try the generic routines. */ 861/* Also try the generic routines. */
@@ -861,21 +863,21 @@ static struct xor_block_template xor_block_pIII_sse = {
861 863
862#undef XOR_TRY_TEMPLATES 864#undef XOR_TRY_TEMPLATES
863#define XOR_TRY_TEMPLATES \ 865#define XOR_TRY_TEMPLATES \
864 do { \ 866do { \
865 xor_speed(&xor_block_8regs); \ 867 xor_speed(&xor_block_8regs); \
866 xor_speed(&xor_block_8regs_p); \ 868 xor_speed(&xor_block_8regs_p); \
867 xor_speed(&xor_block_32regs); \ 869 xor_speed(&xor_block_32regs); \
868 xor_speed(&xor_block_32regs_p); \ 870 xor_speed(&xor_block_32regs_p); \
869 if (cpu_has_xmm) \ 871 if (cpu_has_xmm) \
870 xor_speed(&xor_block_pIII_sse); \ 872 xor_speed(&xor_block_pIII_sse); \
871 if (cpu_has_mmx) { \ 873 if (cpu_has_mmx) { \
872 xor_speed(&xor_block_pII_mmx); \ 874 xor_speed(&xor_block_pII_mmx); \
873 xor_speed(&xor_block_p5_mmx); \ 875 xor_speed(&xor_block_p5_mmx); \
874 } \ 876 } \
875 } while (0) 877} while (0)
876 878
877/* We force the use of the SSE xor block because it can write around L2. 879/* We force the use of the SSE xor block because it can write around L2.
878 We may also be able to load into the L1 only depending on how the cpu 880 We may also be able to load into the L1 only depending on how the cpu
879 deals with a load to a line that is being prefetched. */ 881 deals with a load to a line that is being prefetched. */
880#define XOR_SELECT_TEMPLATE(FASTEST) \ 882#define XOR_SELECT_TEMPLATE(FASTEST) \
881 (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST) 883 (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)