aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86_64/lib/memcpy.S
diff options
context:
space:
mode:
authorAndi Kleen <ak@suse.de>2006-02-03 15:51:02 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-02-04 19:43:13 -0500
commit7bcd3f34e262bbebffa954d80eab3a84f053da31 (patch)
treef0765da9eaa8024a2b1d67d3e43730cb32f99fa7 /arch/x86_64/lib/memcpy.S
parent6bca52b544489b626c7d0db801df6b4aa3d5adb5 (diff)
[PATCH] x86_64: Undo the earlier changes to remove unrolled copy/memset functions
They cause quite bad performance regressions on Netburst This is temporary until we can get new optimized functions for these CPUs. This undoes changes that were done in 2.6.15 and in 2.6.16-rc1, essentially bringing the code back to 2.6.14 level. Only change is I renamed the X86_FEATURE_K8_C flag to X86_FEATURE_REP_GOOD and fixed the check for the flag and also fixed some comments. Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'arch/x86_64/lib/memcpy.S')
-rw-r--r--arch/x86_64/lib/memcpy.S93
1 files changed, 91 insertions, 2 deletions
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
index 92dd80544602..5554948b5554 100644
--- a/arch/x86_64/lib/memcpy.S
+++ b/arch/x86_64/lib/memcpy.S
@@ -11,8 +11,6 @@
11 * 11 *
12 * Output: 12 * Output:
13 * rax original destination 13 * rax original destination
14 *
15 * TODO: check best memcpy for PSC
16 */ 14 */
17 15
18 .globl __memcpy 16 .globl __memcpy
@@ -20,6 +18,95 @@
20 .p2align 4 18 .p2align 4
21__memcpy: 19__memcpy:
22memcpy: 20memcpy:
21 pushq %rbx
22 movq %rdi,%rax
23
24 movl %edx,%ecx
25 shrl $6,%ecx
26 jz .Lhandle_tail
27
28 .p2align 4
29.Lloop_64:
30 decl %ecx
31
32 movq (%rsi),%r11
33 movq 8(%rsi),%r8
34
35 movq %r11,(%rdi)
36 movq %r8,1*8(%rdi)
37
38 movq 2*8(%rsi),%r9
39 movq 3*8(%rsi),%r10
40
41 movq %r9,2*8(%rdi)
42 movq %r10,3*8(%rdi)
43
44 movq 4*8(%rsi),%r11
45 movq 5*8(%rsi),%r8
46
47 movq %r11,4*8(%rdi)
48 movq %r8,5*8(%rdi)
49
50 movq 6*8(%rsi),%r9
51 movq 7*8(%rsi),%r10
52
53 movq %r9,6*8(%rdi)
54 movq %r10,7*8(%rdi)
55
56 leaq 64(%rsi),%rsi
57 leaq 64(%rdi),%rdi
58 jnz .Lloop_64
59
60.Lhandle_tail:
61 movl %edx,%ecx
62 andl $63,%ecx
63 shrl $3,%ecx
64 jz .Lhandle_7
65 .p2align 4
66.Lloop_8:
67 decl %ecx
68 movq (%rsi),%r8
69 movq %r8,(%rdi)
70 leaq 8(%rdi),%rdi
71 leaq 8(%rsi),%rsi
72 jnz .Lloop_8
73
74.Lhandle_7:
75 movl %edx,%ecx
76 andl $7,%ecx
77 jz .Lende
78 .p2align 4
79.Lloop_1:
80 movb (%rsi),%r8b
81 movb %r8b,(%rdi)
82 incq %rdi
83 incq %rsi
84 decl %ecx
85 jnz .Lloop_1
86
87.Lende:
88 popq %rbx
89 ret
90.Lfinal:
91
92 /* Some CPUs run faster using the string copy instructions.
93 It is also a lot simpler. Use this when possible */
94
95 .section .altinstructions,"a"
96 .align 8
97 .quad memcpy
98 .quad memcpy_c
99 .byte X86_FEATURE_REP_GOOD
100 .byte .Lfinal-memcpy
101 .byte memcpy_c_end-memcpy_c
102 .previous
103
104 .section .altinstr_replacement,"ax"
105 /* rdi destination
106 * rsi source
107 * rdx count
108 */
109memcpy_c:
23 movq %rdi,%rax 110 movq %rdi,%rax
24 movl %edx,%ecx 111 movl %edx,%ecx
25 shrl $3,%ecx 112 shrl $3,%ecx
@@ -30,3 +117,5 @@ memcpy:
30 rep 117 rep
31 movsb 118 movsb
32 ret 119 ret
120memcpy_c_end:
121 .previous