diff options
author | Denis Vlasenko <vda@port.imtp.ilyichevsk.odessa.ua> | 2005-05-01 11:58:48 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-05-01 11:58:48 -0400 |
commit | d5b63d78f1e75f6c6f04862dfb2f2a4aeffafd4c (patch) | |
tree | e208151fd10b19c774ff51478bf3e857b99d63f9 | |
parent | d637413f3f05b41f678f8004225b33b62274183f (diff) | |
[PATCH] fix i386 memcpy

This patch shortens the non-constant memcpy() by two bytes and fixes
constant-size memcpy() calls that were spuriously emitted out of line.
```
# size vmlinux.org vmlinux
   text    data     bss     dec     hex filename
3954591 1553426  236544 5744561  57a7b1 vmlinux.org
3952615 1553426  236544 5742585  579ff9 vmlinux
```
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- | include/asm-i386/string.h | 89 |
1 file changed, 61 insertions, 28 deletions
diff --git a/include/asm-i386/string.h b/include/asm-i386/string.h index 1679983d053f..6a78ac58c194 100644 --- a/include/asm-i386/string.h +++ b/include/asm-i386/string.h | |||
@@ -198,47 +198,80 @@ static inline void * __memcpy(void * to, const void * from, size_t n) | |||
198 | int d0, d1, d2; | 198 | int d0, d1, d2; |
199 | __asm__ __volatile__( | 199 | __asm__ __volatile__( |
200 | "rep ; movsl\n\t" | 200 | "rep ; movsl\n\t" |
201 | "testb $2,%b4\n\t" | 201 | "movl %4,%%ecx\n\t" |
202 | "je 1f\n\t" | 202 | "andl $3,%%ecx\n\t" |
203 | "movsw\n" | 203 | #if 1 /* want to pay 2 byte penalty for a chance to skip microcoded rep? */ |
204 | "1:\ttestb $1,%b4\n\t" | 204 | "jz 1f\n\t" |
205 | "je 2f\n\t" | 205 | #endif |
206 | "movsb\n" | 206 | "rep ; movsb\n\t" |
207 | "2:" | 207 | "1:" |
208 | : "=&c" (d0), "=&D" (d1), "=&S" (d2) | 208 | : "=&c" (d0), "=&D" (d1), "=&S" (d2) |
209 | :"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from) | 209 | : "0" (n/4), "g" (n), "1" ((long) to), "2" ((long) from) |
210 | : "memory"); | 210 | : "memory"); |
211 | return (to); | 211 | return (to); |
212 | } | 212 | } |
213 | 213 | ||
214 | /* | 214 | /* |
215 | * This looks horribly ugly, but the compiler can optimize it totally, | 215 | * This looks ugly, but the compiler can optimize it totally, |
216 | * as the count is constant. | 216 | * as the count is constant. |
217 | */ | 217 | */ |
218 | static inline void * __constant_memcpy(void * to, const void * from, size_t n) | 218 | static inline void * __constant_memcpy(void * to, const void * from, size_t n) |
219 | { | 219 | { |
220 | if (n <= 128) | 220 | long esi, edi; |
221 | return __builtin_memcpy(to, from, n); | 221 | if (!n) return to; |
222 | 222 | #if 1 /* want to do small copies with non-string ops? */ | |
223 | #define COMMON(x) \ | 223 | switch (n) { |
224 | __asm__ __volatile__( \ | 224 | case 1: *(char*)to = *(char*)from; return to; |
225 | "rep ; movsl" \ | 225 | case 2: *(short*)to = *(short*)from; return to; |
226 | x \ | 226 | case 4: *(int*)to = *(int*)from; return to; |
227 | : "=&c" (d0), "=&D" (d1), "=&S" (d2) \ | 227 | #if 1 /* including those doable with two moves? */ |
228 | : "0" (n/4),"1" ((long) to),"2" ((long) from) \ | 228 | case 3: *(short*)to = *(short*)from; |
229 | : "memory"); | 229 | *((char*)to+2) = *((char*)from+2); return to; |
230 | { | 230 | case 5: *(int*)to = *(int*)from; |
231 | int d0, d1, d2; | 231 | *((char*)to+4) = *((char*)from+4); return to; |
232 | case 6: *(int*)to = *(int*)from; | ||
233 | *((short*)to+2) = *((short*)from+2); return to; | ||
234 | case 8: *(int*)to = *(int*)from; | ||
235 | *((int*)to+1) = *((int*)from+1); return to; | ||
236 | #endif | ||
237 | } | ||
238 | #endif | ||
239 | esi = (long) from; | ||
240 | edi = (long) to; | ||
241 | if (n >= 5*4) { | ||
242 | /* large block: use rep prefix */ | ||
243 | int ecx; | ||
244 | __asm__ __volatile__( | ||
245 | "rep ; movsl" | ||
246 | : "=&c" (ecx), "=&D" (edi), "=&S" (esi) | ||
247 | : "0" (n/4), "1" (edi),"2" (esi) | ||
248 | : "memory" | ||
249 | ); | ||
250 | } else { | ||
251 | /* small block: don't clobber ecx + smaller code */ | ||
252 | if (n >= 4*4) __asm__ __volatile__("movsl" | ||
253 | :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); | ||
254 | if (n >= 3*4) __asm__ __volatile__("movsl" | ||
255 | :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); | ||
256 | if (n >= 2*4) __asm__ __volatile__("movsl" | ||
257 | :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); | ||
258 | if (n >= 1*4) __asm__ __volatile__("movsl" | ||
259 | :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); | ||
260 | } | ||
232 | switch (n % 4) { | 261 | switch (n % 4) { |
233 | case 0: COMMON(""); return to; | 262 | /* tail */ |
234 | case 1: COMMON("\n\tmovsb"); return to; | 263 | case 0: return to; |
235 | case 2: COMMON("\n\tmovsw"); return to; | 264 | case 1: __asm__ __volatile__("movsb" |
236 | default: COMMON("\n\tmovsw\n\tmovsb"); return to; | 265 | :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); |
266 | return to; | ||
267 | case 2: __asm__ __volatile__("movsw" | ||
268 | :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); | ||
269 | return to; | ||
270 | default: __asm__ __volatile__("movsw\n\tmovsb" | ||
271 | :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); | ||
272 | return to; | ||
237 | } | 273 | } |
238 | } | 274 | } |
239 | |||
240 | #undef COMMON | ||
241 | } | ||
242 | 275 | ||
243 | #define __HAVE_ARCH_MEMCPY | 276 | #define __HAVE_ARCH_MEMCPY |
244 | 277 | ||