aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--arch/x86/lib/mmx_32.c197
1 files changed, 94 insertions, 103 deletions
diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c
index cc9b4a4450f3..c9f2d9ba8dd8 100644
--- a/arch/x86/lib/mmx_32.c
+++ b/arch/x86/lib/mmx_32.c
@@ -1,32 +1,30 @@
1#include <linux/types.h>
2#include <linux/string.h>
3#include <linux/sched.h>
4#include <linux/hardirq.h>
5#include <linux/module.h>
6
7#include <asm/asm.h>
8#include <asm/i387.h>
9
10
11/* 1/*
12 * MMX 3DNow! library helper functions 2 * MMX 3DNow! library helper functions
13 * 3 *
14 * To do: 4 * To do:
15 * We can use MMX just for prefetch in IRQ's. This may be a win. 5 * We can use MMX just for prefetch in IRQ's. This may be a win.
16 * (reported so on K6-III) 6 * (reported so on K6-III)
17 * We should use a better code neutral filler for the short jump 7 * We should use a better code neutral filler for the short jump
18 * leal ebx. [ebx] is apparently best for K6-2, but Cyrix ?? 8 * leal ebx. [ebx] is apparently best for K6-2, but Cyrix ??
19 * We also want to clobber the filler register so we don't get any 9 * We also want to clobber the filler register so we don't get any
20 * register forwarding stalls on the filler. 10 * register forwarding stalls on the filler.
21 * 11 *
22 * Add *user handling. Checksums are not a win with MMX on any CPU 12 * Add *user handling. Checksums are not a win with MMX on any CPU
23 * tested so far for any MMX solution figured. 13 * tested so far for any MMX solution figured.
24 * 14 *
25 * 22/09/2000 - Arjan van de Ven 15 * 22/09/2000 - Arjan van de Ven
26 * Improved for non-egineering-sample Athlons 16 * Improved for non-egineering-sample Athlons
27 * 17 *
28 */ 18 */
29 19#include <linux/hardirq.h>
20#include <linux/string.h>
21#include <linux/module.h>
22#include <linux/sched.h>
23#include <linux/types.h>
24
25#include <asm/i387.h>
26#include <asm/asm.h>
27
30void *_mmx_memcpy(void *to, const void *from, size_t len) 28void *_mmx_memcpy(void *to, const void *from, size_t len)
31{ 29{
32 void *p; 30 void *p;
@@ -51,12 +49,10 @@ void *_mmx_memcpy(void *to, const void *from, size_t len)
51 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ 49 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
52 " jmp 2b\n" 50 " jmp 2b\n"
53 ".previous\n" 51 ".previous\n"
54 _ASM_EXTABLE(1b,3b) 52 _ASM_EXTABLE(1b, 3b)
55 : : "r" (from) ); 53 : : "r" (from));
56 54
57 55 for ( ; i > 5; i--) {
58 for(; i>5; i--)
59 {
60 __asm__ __volatile__ ( 56 __asm__ __volatile__ (
61 "1: prefetch 320(%0)\n" 57 "1: prefetch 320(%0)\n"
62 "2: movq (%0), %%mm0\n" 58 "2: movq (%0), %%mm0\n"
@@ -79,14 +75,14 @@ void *_mmx_memcpy(void *to, const void *from, size_t len)
79 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ 75 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
80 " jmp 2b\n" 76 " jmp 2b\n"
81 ".previous\n" 77 ".previous\n"
82 _ASM_EXTABLE(1b,3b) 78 _ASM_EXTABLE(1b, 3b)
83 : : "r" (from), "r" (to) : "memory"); 79 : : "r" (from), "r" (to) : "memory");
84 from+=64; 80
85 to+=64; 81 from += 64;
82 to += 64;
86 } 83 }
87 84
88 for(; i>0; i--) 85 for ( ; i > 0; i--) {
89 {
90 __asm__ __volatile__ ( 86 __asm__ __volatile__ (
91 " movq (%0), %%mm0\n" 87 " movq (%0), %%mm0\n"
92 " movq 8(%0), %%mm1\n" 88 " movq 8(%0), %%mm1\n"
@@ -104,17 +100,20 @@ void *_mmx_memcpy(void *to, const void *from, size_t len)
104 " movq %%mm1, 40(%1)\n" 100 " movq %%mm1, 40(%1)\n"
105 " movq %%mm2, 48(%1)\n" 101 " movq %%mm2, 48(%1)\n"
106 " movq %%mm3, 56(%1)\n" 102 " movq %%mm3, 56(%1)\n"
107 : : "r" (from), "r" (to) : "memory"); 103 : : "r" (from), "r" (to) : "memory");
108 from+=64; 104
109 to+=64; 105 from += 64;
106 to += 64;
110 } 107 }
111 /* 108 /*
112 * Now do the tail of the block 109 * Now do the tail of the block:
113 */ 110 */
114 __memcpy(to, from, len&63); 111 __memcpy(to, from, len & 63);
115 kernel_fpu_end(); 112 kernel_fpu_end();
113
116 return p; 114 return p;
117} 115}
116EXPORT_SYMBOL(_mmx_memcpy);
118 117
119#ifdef CONFIG_MK7 118#ifdef CONFIG_MK7
120 119
@@ -128,13 +127,12 @@ static void fast_clear_page(void *page)
128 int i; 127 int i;
129 128
130 kernel_fpu_begin(); 129 kernel_fpu_begin();
131 130
132 __asm__ __volatile__ ( 131 __asm__ __volatile__ (
133 " pxor %%mm0, %%mm0\n" : : 132 " pxor %%mm0, %%mm0\n" : :
134 ); 133 );
135 134
136 for(i=0;i<4096/64;i++) 135 for (i = 0; i < 4096/64; i++) {
137 {
138 __asm__ __volatile__ ( 136 __asm__ __volatile__ (
139 " movntq %%mm0, (%0)\n" 137 " movntq %%mm0, (%0)\n"
140 " movntq %%mm0, 8(%0)\n" 138 " movntq %%mm0, 8(%0)\n"
@@ -145,14 +143,15 @@ static void fast_clear_page(void *page)
145 " movntq %%mm0, 48(%0)\n" 143 " movntq %%mm0, 48(%0)\n"
146 " movntq %%mm0, 56(%0)\n" 144 " movntq %%mm0, 56(%0)\n"
147 : : "r" (page) : "memory"); 145 : : "r" (page) : "memory");
148 page+=64; 146 page += 64;
149 } 147 }
150 /* since movntq is weakly-ordered, a "sfence" is needed to become 148
151 * ordered again. 149 /*
150 * Since movntq is weakly-ordered, a "sfence" is needed to become
151 * ordered again:
152 */ 152 */
153 __asm__ __volatile__ ( 153 __asm__ __volatile__("sfence\n"::);
154 " sfence \n" : : 154
155 );
156 kernel_fpu_end(); 155 kernel_fpu_end();
157} 156}
158 157
@@ -162,10 +161,11 @@ static void fast_copy_page(void *to, void *from)
162 161
163 kernel_fpu_begin(); 162 kernel_fpu_begin();
164 163
165 /* maybe the prefetch stuff can go before the expensive fnsave... 164 /*
165 * maybe the prefetch stuff can go before the expensive fnsave...
166 * but that is for later. -AV 166 * but that is for later. -AV
167 */ 167 */
168 __asm__ __volatile__ ( 168 __asm__ __volatile__(
169 "1: prefetch (%0)\n" 169 "1: prefetch (%0)\n"
170 " prefetch 64(%0)\n" 170 " prefetch 64(%0)\n"
171 " prefetch 128(%0)\n" 171 " prefetch 128(%0)\n"
@@ -176,11 +176,9 @@ static void fast_copy_page(void *to, void *from)
176 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ 176 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
177 " jmp 2b\n" 177 " jmp 2b\n"
178 ".previous\n" 178 ".previous\n"
179 _ASM_EXTABLE(1b,3b) 179 _ASM_EXTABLE(1b, 3b) : : "r" (from));
180 : : "r" (from) );
181 180
182 for(i=0; i<(4096-320)/64; i++) 181 for (i = 0; i < (4096-320)/64; i++) {
183 {
184 __asm__ __volatile__ ( 182 __asm__ __volatile__ (
185 "1: prefetch 320(%0)\n" 183 "1: prefetch 320(%0)\n"
186 "2: movq (%0), %%mm0\n" 184 "2: movq (%0), %%mm0\n"
@@ -203,13 +201,13 @@ static void fast_copy_page(void *to, void *from)
203 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ 201 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
204 " jmp 2b\n" 202 " jmp 2b\n"
205 ".previous\n" 203 ".previous\n"
206 _ASM_EXTABLE(1b,3b) 204 _ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory");
207 : : "r" (from), "r" (to) : "memory"); 205
208 from+=64; 206 from += 64;
209 to+=64; 207 to += 64;
210 } 208 }
211 for(i=(4096-320)/64; i<4096/64; i++) 209
212 { 210 for (i = (4096-320)/64; i < 4096/64; i++) {
213 __asm__ __volatile__ ( 211 __asm__ __volatile__ (
214 "2: movq (%0), %%mm0\n" 212 "2: movq (%0), %%mm0\n"
215 " movntq %%mm0, (%1)\n" 213 " movntq %%mm0, (%1)\n"
@@ -227,37 +225,34 @@ static void fast_copy_page(void *to, void *from)
227 " movntq %%mm6, 48(%1)\n" 225 " movntq %%mm6, 48(%1)\n"
228 " movq 56(%0), %%mm7\n" 226 " movq 56(%0), %%mm7\n"
229 " movntq %%mm7, 56(%1)\n" 227 " movntq %%mm7, 56(%1)\n"
230 : : "r" (from), "r" (to) : "memory"); 228 : : "r" (from), "r" (to) : "memory");
231 from+=64; 229 from += 64;
232 to+=64; 230 to += 64;
233 } 231 }
234 /* since movntq is weakly-ordered, a "sfence" is needed to become 232 /*
235 * ordered again. 233 * Since movntq is weakly-ordered, a "sfence" is needed to become
234 * ordered again:
236 */ 235 */
237 __asm__ __volatile__ ( 236 __asm__ __volatile__("sfence \n"::);
238 " sfence \n" : :
239 );
240 kernel_fpu_end(); 237 kernel_fpu_end();
241} 238}
242 239
243#else 240#else /* CONFIG_MK7 */
244 241
245/* 242/*
246 * Generic MMX implementation without K7 specific streaming 243 * Generic MMX implementation without K7 specific streaming
247 */ 244 */
248
249static void fast_clear_page(void *page) 245static void fast_clear_page(void *page)
250{ 246{
251 int i; 247 int i;
252 248
253 kernel_fpu_begin(); 249 kernel_fpu_begin();
254 250
255 __asm__ __volatile__ ( 251 __asm__ __volatile__ (
256 " pxor %%mm0, %%mm0\n" : : 252 " pxor %%mm0, %%mm0\n" : :
257 ); 253 );
258 254
259 for(i=0;i<4096/128;i++) 255 for (i = 0; i < 4096/128; i++) {
260 {
261 __asm__ __volatile__ ( 256 __asm__ __volatile__ (
262 " movq %%mm0, (%0)\n" 257 " movq %%mm0, (%0)\n"
263 " movq %%mm0, 8(%0)\n" 258 " movq %%mm0, 8(%0)\n"
@@ -275,8 +270,8 @@ static void fast_clear_page(void *page)
275 " movq %%mm0, 104(%0)\n" 270 " movq %%mm0, 104(%0)\n"
276 " movq %%mm0, 112(%0)\n" 271 " movq %%mm0, 112(%0)\n"
277 " movq %%mm0, 120(%0)\n" 272 " movq %%mm0, 120(%0)\n"
278 : : "r" (page) : "memory"); 273 : : "r" (page) : "memory");
279 page+=128; 274 page += 128;
280 } 275 }
281 276
282 kernel_fpu_end(); 277 kernel_fpu_end();
@@ -285,8 +280,7 @@ static void fast_clear_page(void *page)
285static void fast_copy_page(void *to, void *from) 280static void fast_copy_page(void *to, void *from)
286{ 281{
287 int i; 282 int i;
288 283
289
290 kernel_fpu_begin(); 284 kernel_fpu_begin();
291 285
292 __asm__ __volatile__ ( 286 __asm__ __volatile__ (
@@ -300,11 +294,9 @@ static void fast_copy_page(void *to, void *from)
300 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ 294 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
301 " jmp 2b\n" 295 " jmp 2b\n"
302 ".previous\n" 296 ".previous\n"
303 _ASM_EXTABLE(1b,3b) 297 _ASM_EXTABLE(1b, 3b) : : "r" (from));
304 : : "r" (from) );
305 298
306 for(i=0; i<4096/64; i++) 299 for (i = 0; i < 4096/64; i++) {
307 {
308 __asm__ __volatile__ ( 300 __asm__ __volatile__ (
309 "1: prefetch 320(%0)\n" 301 "1: prefetch 320(%0)\n"
310 "2: movq (%0), %%mm0\n" 302 "2: movq (%0), %%mm0\n"
@@ -327,60 +319,59 @@ static void fast_copy_page(void *to, void *from)
327 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ 319 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
328 " jmp 2b\n" 320 " jmp 2b\n"
329 ".previous\n" 321 ".previous\n"
330 _ASM_EXTABLE(1b,3b) 322 _ASM_EXTABLE(1b, 3b)
331 : : "r" (from), "r" (to) : "memory"); 323 : : "r" (from), "r" (to) : "memory");
332 from+=64; 324
333 to+=64; 325 from += 64;
326 to += 64;
334 } 327 }
335 kernel_fpu_end(); 328 kernel_fpu_end();
336} 329}
337 330
338 331#endif /* !CONFIG_MK7 */
339#endif
340 332
341/* 333/*
342 * Favour MMX for page clear and copy. 334 * Favour MMX for page clear and copy:
343 */ 335 */
344 336static void slow_zero_page(void *page)
345static void slow_zero_page(void * page)
346{ 337{
347 int d0, d1; 338 int d0, d1;
348 __asm__ __volatile__( \ 339
349 "cld\n\t" \ 340 __asm__ __volatile__(
350 "rep ; stosl" \ 341 "cld\n\t"
351 : "=&c" (d0), "=&D" (d1) 342 "rep ; stosl"
352 :"a" (0),"1" (page),"0" (1024) 343
353 :"memory"); 344 : "=&c" (d0), "=&D" (d1)
345 :"a" (0), "1" (page), "0" (1024)
346 :"memory");
354} 347}
355 348
356void mmx_clear_page(void * page) 349void mmx_clear_page(void *page)
357{ 350{
358 if(unlikely(in_interrupt())) 351 if (unlikely(in_interrupt()))
359 slow_zero_page(page); 352 slow_zero_page(page);
360 else 353 else
361 fast_clear_page(page); 354 fast_clear_page(page);
362} 355}
356EXPORT_SYMBOL(mmx_clear_page);
363 357
364static void slow_copy_page(void *to, void *from) 358static void slow_copy_page(void *to, void *from)
365{ 359{
366 int d0, d1, d2; 360 int d0, d1, d2;
367 __asm__ __volatile__( \ 361
368 "cld\n\t" \ 362 __asm__ __volatile__(
369 "rep ; movsl" \ 363 "cld\n\t"
370 : "=&c" (d0), "=&D" (d1), "=&S" (d2) \ 364 "rep ; movsl"
371 : "0" (1024),"1" ((long) to),"2" ((long) from) \ 365 : "=&c" (d0), "=&D" (d1), "=&S" (d2)
366 : "0" (1024), "1" ((long) to), "2" ((long) from)
372 : "memory"); 367 : "memory");
373} 368}
374
375 369
376void mmx_copy_page(void *to, void *from) 370void mmx_copy_page(void *to, void *from)
377{ 371{
378 if(unlikely(in_interrupt())) 372 if (unlikely(in_interrupt()))
379 slow_copy_page(to, from); 373 slow_copy_page(to, from);
380 else 374 else
381 fast_copy_page(to, from); 375 fast_copy_page(to, from);
382} 376}
383
384EXPORT_SYMBOL(_mmx_memcpy);
385EXPORT_SYMBOL(mmx_clear_page);
386EXPORT_SYMBOL(mmx_copy_page); 377EXPORT_SYMBOL(mmx_copy_page);