aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/include/asm/xor_64.h
diff options
context:
space:
mode:
authorSuresh Siddha <suresh.b.siddha@intel.com>2012-08-24 17:13:00 -0400
committerH. Peter Anvin <hpa@linux.intel.com>2012-09-18 18:52:08 -0400
commit841e3604d35aa70d399146abdc526d8c89a2c2f5 (patch)
tree80d2266c21e5ae0b5f4097db3ee71888fc92bec1 /arch/x86/include/asm/xor_64.h
parent9c1c3fac53378c9782c18f80107965578d7b7167 (diff)
x86, fpu: always use kernel_fpu_begin/end() for in-kernel FPU usage
use kernel_fpu_begin/end() instead of unconditionally accessing cr0 and saving/restoring just the few used xmm/ymm registers. This has some advantages like: * If the task's FPU state is already active, then kernel_fpu_begin() will just save the user-state and avoiding the read/write of cr0. In general, cr0 accesses are much slower. * Manual save/restore of xmm/ymm registers will affect the 'modified' and the 'init' optimizations brought in the by xsaveopt/xrstor infrastructure. * Foward compatibility with future vector register extensions will be a problem if the xmm/ymm registers are manually saved and restored (corrupting the extended state of those vector registers). With this patch, there was no significant difference in the xor throughput using AVX, measured during boot. Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> Link: http://lkml.kernel.org/r/1345842782-24175-5-git-send-email-suresh.b.siddha@intel.com Cc: Jim Kukunas <james.t.kukunas@linux.intel.com> Cc: NeilBrown <neilb@suse.de> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Diffstat (limited to 'arch/x86/include/asm/xor_64.h')
-rw-r--r--arch/x86/include/asm/xor_64.h61
1 files changed, 9 insertions, 52 deletions
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index b9b2323e90fe..5fc06d0b7eb5 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -34,41 +34,7 @@
34 * no advantages to be gotten from x86-64 here anyways. 34 * no advantages to be gotten from x86-64 here anyways.
35 */ 35 */
36 36
37typedef struct { 37#include <asm/i387.h>
38 unsigned long a, b;
39} __attribute__((aligned(16))) xmm_store_t;
40
41/* Doesn't use gcc to save the XMM registers, because there is no easy way to
42 tell it to do a clts before the register saving. */
43#define XMMS_SAVE \
44do { \
45 preempt_disable(); \
46 asm volatile( \
47 "movq %%cr0,%0 ;\n\t" \
48 "clts ;\n\t" \
49 "movups %%xmm0,(%1) ;\n\t" \
50 "movups %%xmm1,0x10(%1) ;\n\t" \
51 "movups %%xmm2,0x20(%1) ;\n\t" \
52 "movups %%xmm3,0x30(%1) ;\n\t" \
53 : "=&r" (cr0) \
54 : "r" (xmm_save) \
55 : "memory"); \
56} while (0)
57
58#define XMMS_RESTORE \
59do { \
60 asm volatile( \
61 "sfence ;\n\t" \
62 "movups (%1),%%xmm0 ;\n\t" \
63 "movups 0x10(%1),%%xmm1 ;\n\t" \
64 "movups 0x20(%1),%%xmm2 ;\n\t" \
65 "movups 0x30(%1),%%xmm3 ;\n\t" \
66 "movq %0,%%cr0 ;\n\t" \
67 : \
68 : "r" (cr0), "r" (xmm_save) \
69 : "memory"); \
70 preempt_enable(); \
71} while (0)
72 38
73#define OFFS(x) "16*("#x")" 39#define OFFS(x) "16*("#x")"
74#define PF_OFFS(x) "256+16*("#x")" 40#define PF_OFFS(x) "256+16*("#x")"
@@ -91,10 +57,8 @@ static void
91xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) 57xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
92{ 58{
93 unsigned int lines = bytes >> 8; 59 unsigned int lines = bytes >> 8;
94 unsigned long cr0;
95 xmm_store_t xmm_save[4];
96 60
97 XMMS_SAVE; 61 kernel_fpu_begin();
98 62
99 asm volatile( 63 asm volatile(
100#undef BLOCK 64#undef BLOCK
@@ -135,7 +99,7 @@ xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
135 : [inc] "r" (256UL) 99 : [inc] "r" (256UL)
136 : "memory"); 100 : "memory");
137 101
138 XMMS_RESTORE; 102 kernel_fpu_end();
139} 103}
140 104
141static void 105static void
@@ -143,11 +107,8 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
143 unsigned long *p3) 107 unsigned long *p3)
144{ 108{
145 unsigned int lines = bytes >> 8; 109 unsigned int lines = bytes >> 8;
146 xmm_store_t xmm_save[4];
147 unsigned long cr0;
148
149 XMMS_SAVE;
150 110
111 kernel_fpu_begin();
151 asm volatile( 112 asm volatile(
152#undef BLOCK 113#undef BLOCK
153#define BLOCK(i) \ 114#define BLOCK(i) \
@@ -194,7 +155,7 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
194 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) 155 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
195 : [inc] "r" (256UL) 156 : [inc] "r" (256UL)
196 : "memory"); 157 : "memory");
197 XMMS_RESTORE; 158 kernel_fpu_end();
198} 159}
199 160
200static void 161static void
@@ -202,10 +163,8 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
202 unsigned long *p3, unsigned long *p4) 163 unsigned long *p3, unsigned long *p4)
203{ 164{
204 unsigned int lines = bytes >> 8; 165 unsigned int lines = bytes >> 8;
205 xmm_store_t xmm_save[4];
206 unsigned long cr0;
207 166
208 XMMS_SAVE; 167 kernel_fpu_begin();
209 168
210 asm volatile( 169 asm volatile(
211#undef BLOCK 170#undef BLOCK
@@ -261,7 +220,7 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
261 : [inc] "r" (256UL) 220 : [inc] "r" (256UL)
262 : "memory" ); 221 : "memory" );
263 222
264 XMMS_RESTORE; 223 kernel_fpu_end();
265} 224}
266 225
267static void 226static void
@@ -269,10 +228,8 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
269 unsigned long *p3, unsigned long *p4, unsigned long *p5) 228 unsigned long *p3, unsigned long *p4, unsigned long *p5)
270{ 229{
271 unsigned int lines = bytes >> 8; 230 unsigned int lines = bytes >> 8;
272 xmm_store_t xmm_save[4];
273 unsigned long cr0;
274 231
275 XMMS_SAVE; 232 kernel_fpu_begin();
276 233
277 asm volatile( 234 asm volatile(
278#undef BLOCK 235#undef BLOCK
@@ -336,7 +293,7 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
336 : [inc] "r" (256UL) 293 : [inc] "r" (256UL)
337 : "memory"); 294 : "memory");
338 295
339 XMMS_RESTORE; 296 kernel_fpu_end();
340} 297}
341 298
342static struct xor_block_template xor_block_sse = { 299static struct xor_block_template xor_block_sse = {