-rw-r--r--	arch/x86/include/asm/xor_32.h	| 56
-rw-r--r--	arch/x86/include/asm/xor_64.h	| 61
-rw-r--r--	arch/x86/include/asm/xor_avx.h	| 54
3 files changed, 29 insertions, 142 deletions
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index 454570891bdc..aabd5850bdb9 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -534,38 +534,6 @@ static struct xor_block_template xor_block_p5_mmx = {
  * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
  */
 
-#define XMMS_SAVE \
-do { \
-	preempt_disable(); \
-	cr0 = read_cr0(); \
-	clts(); \
-	asm volatile( \
-		"movups %%xmm0,(%0) ;\n\t" \
-		"movups %%xmm1,0x10(%0) ;\n\t" \
-		"movups %%xmm2,0x20(%0) ;\n\t" \
-		"movups %%xmm3,0x30(%0) ;\n\t" \
-		: \
-		: "r" (xmm_save) \
-		: "memory"); \
-} while (0)
-
-#define XMMS_RESTORE \
-do { \
-	asm volatile( \
-		"sfence ;\n\t" \
-		"movups (%0),%%xmm0 ;\n\t" \
-		"movups 0x10(%0),%%xmm1 ;\n\t" \
-		"movups 0x20(%0),%%xmm2 ;\n\t" \
-		"movups 0x30(%0),%%xmm3 ;\n\t" \
-		: \
-		: "r" (xmm_save) \
-		: "memory"); \
-	write_cr0(cr0); \
-	preempt_enable(); \
-} while (0)
-
-#define ALIGN16 __attribute__((aligned(16)))
-
 #define OFFS(x) "16*("#x")"
 #define PF_OFFS(x) "256+16*("#x")"
 #define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n"
@@ -587,10 +555,8 @@ static void
 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
 {
 	unsigned long lines = bytes >> 8;
-	char xmm_save[16*4] ALIGN16;
-	int cr0;
 
-	XMMS_SAVE;
+	kernel_fpu_begin();
 
 	asm volatile(
 #undef BLOCK
@@ -633,7 +599,7 @@ xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
 	:
 	: "memory");
 
-	XMMS_RESTORE;
+	kernel_fpu_end();
 }
 
 static void
@@ -641,10 +607,8 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	  unsigned long *p3)
 {
 	unsigned long lines = bytes >> 8;
-	char xmm_save[16*4] ALIGN16;
-	int cr0;
 
-	XMMS_SAVE;
+	kernel_fpu_begin();
 
 	asm volatile(
 #undef BLOCK
@@ -694,7 +658,7 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	:
 	: "memory" );
 
-	XMMS_RESTORE;
+	kernel_fpu_end();
 }
 
 static void
@@ -702,10 +666,8 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	  unsigned long *p3, unsigned long *p4)
 {
 	unsigned long lines = bytes >> 8;
-	char xmm_save[16*4] ALIGN16;
-	int cr0;
 
-	XMMS_SAVE;
+	kernel_fpu_begin();
 
 	asm volatile(
 #undef BLOCK
@@ -762,7 +724,7 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	:
 	: "memory" );
 
-	XMMS_RESTORE;
+	kernel_fpu_end();
 }
 
 static void
@@ -770,10 +732,8 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
 {
 	unsigned long lines = bytes >> 8;
-	char xmm_save[16*4] ALIGN16;
-	int cr0;
 
-	XMMS_SAVE;
+	kernel_fpu_begin();
 
 	/* Make sure GCC forgets anything it knows about p4 or p5,
 	   such that it won't pass to the asm volatile below a
@@ -850,7 +810,7 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	   like assuming they have some legal value. */
 	asm("" : "=r" (p4), "=r" (p5));
 
-	XMMS_RESTORE;
+	kernel_fpu_end();
 }
 
 static struct xor_block_template xor_block_pIII_sse = {
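
Every hunk in this file makes the same substitution: the open-coded preempt_disable()/clts()/XMM spill-and-reload bookkeeping gives way to kernel_fpu_begin()/kernel_fpu_end(). A minimal sketch of the resulting usage pattern follows; the helper name and the trivial 16-byte body are illustrative only and are not part of the patch:

static void xor_16bytes_sse(unsigned long *dst, const unsigned long *src)
{
	/* arranges for the user FPU/SSE state to be preserved and disables preemption */
	kernel_fpu_begin();
	asm volatile("movups (%0), %%xmm0\n\t"
		     "movups (%1), %%xmm1\n\t"
		     "pxor %%xmm1, %%xmm0\n\t"
		     "movups %%xmm0, (%0)\n\t"
		     : : "r" (dst), "r" (src) : "memory");
	/* restores the saved state and re-enables preemption */
	kernel_fpu_end();
}
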
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index b9b2323e90fe..5fc06d0b7eb5 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -34,41 +34,7 @@
  * no advantages to be gotten from x86-64 here anyways.
  */
 
-typedef struct {
-	unsigned long a, b;
-} __attribute__((aligned(16))) xmm_store_t;
-
-/* Doesn't use gcc to save the XMM registers, because there is no easy way to
-   tell it to do a clts before the register saving. */
-#define XMMS_SAVE \
-do { \
-	preempt_disable(); \
-	asm volatile( \
-		"movq %%cr0,%0 ;\n\t" \
-		"clts ;\n\t" \
-		"movups %%xmm0,(%1) ;\n\t" \
-		"movups %%xmm1,0x10(%1) ;\n\t" \
-		"movups %%xmm2,0x20(%1) ;\n\t" \
-		"movups %%xmm3,0x30(%1) ;\n\t" \
-		: "=&r" (cr0) \
-		: "r" (xmm_save) \
-		: "memory"); \
-} while (0)
-
-#define XMMS_RESTORE \
-do { \
-	asm volatile( \
-		"sfence ;\n\t" \
-		"movups (%1),%%xmm0 ;\n\t" \
-		"movups 0x10(%1),%%xmm1 ;\n\t" \
-		"movups 0x20(%1),%%xmm2 ;\n\t" \
-		"movups 0x30(%1),%%xmm3 ;\n\t" \
-		"movq %0,%%cr0 ;\n\t" \
-		: \
-		: "r" (cr0), "r" (xmm_save) \
-		: "memory"); \
-	preempt_enable(); \
-} while (0)
+#include <asm/i387.h>
 
 #define OFFS(x) "16*("#x")"
 #define PF_OFFS(x) "256+16*("#x")"
@@ -91,10 +57,8 @@ static void
 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
 {
 	unsigned int lines = bytes >> 8;
-	unsigned long cr0;
-	xmm_store_t xmm_save[4];
 
-	XMMS_SAVE;
+	kernel_fpu_begin();
 
 	asm volatile(
 #undef BLOCK
@@ -135,7 +99,7 @@ xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
 	: [inc] "r" (256UL)
 	: "memory");
 
-	XMMS_RESTORE;
+	kernel_fpu_end();
 }
 
 static void
@@ -143,11 +107,8 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	  unsigned long *p3)
 {
 	unsigned int lines = bytes >> 8;
-	xmm_store_t xmm_save[4];
-	unsigned long cr0;
-
-	XMMS_SAVE;
 
+	kernel_fpu_begin();
 	asm volatile(
 #undef BLOCK
 #define BLOCK(i) \
@@ -194,7 +155,7 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
 	: [inc] "r" (256UL)
 	: "memory");
-	XMMS_RESTORE;
+	kernel_fpu_end();
 }
 
 static void
@@ -202,10 +163,8 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	  unsigned long *p3, unsigned long *p4)
 {
 	unsigned int lines = bytes >> 8;
-	xmm_store_t xmm_save[4];
-	unsigned long cr0;
 
-	XMMS_SAVE;
+	kernel_fpu_begin();
 
 	asm volatile(
 #undef BLOCK
@@ -261,7 +220,7 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	: [inc] "r" (256UL)
 	: "memory" );
 
-	XMMS_RESTORE;
+	kernel_fpu_end();
 }
 
 static void
@@ -269,10 +228,8 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
 {
 	unsigned int lines = bytes >> 8;
-	xmm_store_t xmm_save[4];
-	unsigned long cr0;
 
-	XMMS_SAVE;
+	kernel_fpu_begin();
 
 	asm volatile(
 #undef BLOCK
@@ -336,7 +293,7 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	: [inc] "r" (256UL)
 	: "memory");
 
-	XMMS_RESTORE;
+	kernel_fpu_end();
 }
 
 static struct xor_block_template xor_block_sse = {
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
index 2510d35f480e..7ea79c5fa1f2 100644
--- a/arch/x86/include/asm/xor_avx.h
+++ b/arch/x86/include/asm/xor_avx.h
@@ -20,32 +20,6 @@
 #include <linux/compiler.h>
 #include <asm/i387.h>
 
-#define ALIGN32 __aligned(32)
-
-#define YMM_SAVED_REGS 4
-
-#define YMMS_SAVE \
-do { \
-	preempt_disable(); \
-	cr0 = read_cr0(); \
-	clts(); \
-	asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \
-	asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \
-	asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \
-	asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \
-} while (0);
-
-#define YMMS_RESTORE \
-do { \
-	asm volatile("sfence" : : : "memory"); \
-	asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \
-	asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \
-	asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \
-	asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \
-	write_cr0(cr0); \
-	preempt_enable(); \
-} while (0);
-
 #define BLOCK4(i) \
 	BLOCK(32 * i, 0) \
 	BLOCK(32 * (i + 1), 1) \
@@ -60,10 +34,9 @@ do { \
 
 static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
 {
-	unsigned long cr0, lines = bytes >> 9;
-	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+	unsigned long lines = bytes >> 9;
 
-	YMMS_SAVE
+	kernel_fpu_begin();
 
 	while (lines--) {
 #undef BLOCK
@@ -82,16 +55,15 @@ do { \
 		p1 = (unsigned long *)((uintptr_t)p1 + 512);
 	}
 
-	YMMS_RESTORE
+	kernel_fpu_end();
 }
 
 static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
 	unsigned long *p2)
 {
-	unsigned long cr0, lines = bytes >> 9;
-	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+	unsigned long lines = bytes >> 9;
 
-	YMMS_SAVE
+	kernel_fpu_begin();
 
 	while (lines--) {
 #undef BLOCK
@@ -113,16 +85,15 @@ do { \
 		p2 = (unsigned long *)((uintptr_t)p2 + 512);
 	}
 
-	YMMS_RESTORE
+	kernel_fpu_end();
 }
 
 static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
 	unsigned long *p2, unsigned long *p3)
 {
-	unsigned long cr0, lines = bytes >> 9;
-	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+	unsigned long lines = bytes >> 9;
 
-	YMMS_SAVE
+	kernel_fpu_begin();
 
 	while (lines--) {
 #undef BLOCK
@@ -147,16 +118,15 @@ do { \
 		p3 = (unsigned long *)((uintptr_t)p3 + 512);
 	}
 
-	YMMS_RESTORE
+	kernel_fpu_end();
 }
 
 static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
 	unsigned long *p2, unsigned long *p3, unsigned long *p4)
 {
-	unsigned long cr0, lines = bytes >> 9;
-	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+	unsigned long lines = bytes >> 9;
 
-	YMMS_SAVE
+	kernel_fpu_begin();
 
 	while (lines--) {
 #undef BLOCK
@@ -184,7 +154,7 @@ do { \
 		p4 = (unsigned long *)((uintptr_t)p4 + 512);
 	}
 
-	YMMS_RESTORE
+	kernel_fpu_end();
 }
 
 static struct xor_block_template xor_block_avx = {
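
The AVX variants follow the same pattern: kernel_fpu_begin() preserves the extended register state as well, so the YMMS_SAVE/YMMS_RESTORE spill of ymm0-ymm3 and the manual CR0 handling are no longer needed. A corresponding sketch (again with an illustrative helper name and body, not code from the patch):

static void xor_32bytes_avx(unsigned long *dst, const unsigned long *src)
{
	kernel_fpu_begin();
	asm volatile("vmovdqu (%0), %%ymm0\n\t"
		     "vmovdqu (%1), %%ymm1\n\t"
		     "vpxor %%ymm1, %%ymm0, %%ymm0\n\t"
		     "vmovdqu %%ymm0, (%0)\n\t"
		     : : "r" (dst), "r" (src) : "memory");
	kernel_fpu_end();
}

Note that kernel_fpu_begin()/kernel_fpu_end() save and restore the task's whole FPU context rather than only the four registers the removed macros spilled, which is what makes the simpler calling convention safe regardless of which registers the guarded code touches.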