[PATCH] x86_64: Remove optimization for B stepping AMD K8

B-stepping K8s were the first shipping Opterons. memcpy/memset/copy_page/
clear_page had special optimized versions for them. These CPUs are really
old and in the minority now, and the difference from the generic versions
(using rep microcode) is not that big anyway, so just remove them.

TODO: figure out optimized versions for Intel Netburst-based EM64T.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: Andi Kleen <ak@suse.de> 2005-11-05 11:25:54 -0500
committer: Linus Torvalds <torvalds@g5.osdl.org> 2005-11-14 22:55:17 -0500
commit: a5b250a428aabc619ace872f8220a7d0b8f7d557 (patch)
tree: 11cabf07982ae37f94bc929f9a605cbbd20e35ab /arch/x86_64/lib/memcpy.S
parent: a6f5deb2be4c82f24fefadcbf7e448f540c05ae6 (diff)
1 files changed, 2 insertions, 91 deletions
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
index c6c46494fef5..92dd80544602 100644
--- a/arch/x86_64/lib/memcpy.S
+++ b/arch/x86_64/lib/memcpy.S
@@ -11,6 +11,8 @@
 * 
 * Output:
 * rax original destination
+ * 
+ * TODO: check best memcpy for PSC
 */     
        .globl __memcpy
@@ -18,95 +20,6 @@
        .p2align 4
 __memcpy:
 memcpy:         
-        pushq %rbx
-        movq %rdi,%rax
-        movl %edx,%ecx
-        shrl $6,%ecx
-        jz .Lhandle_tail
-        
-        .p2align 4
-.Lloop_64:
-        decl %ecx
-        
-        movq (%rsi),%r11
-        movq 8(%rsi),%r8
-        movq %r11,(%rdi)
-        movq %r8,1*8(%rdi)
-        movq 2*8(%rsi),%r9
-        movq 3*8(%rsi),%r10
-        movq %r9,2*8(%rdi)
-        movq %r10,3*8(%rdi)
-                
-        movq 4*8(%rsi),%r11
-        movq 5*8(%rsi),%r8
-        movq %r11,4*8(%rdi)
-        movq %r8,5*8(%rdi)
-        movq 6*8(%rsi),%r9
-        movq 7*8(%rsi),%r10
-        movq %r9,6*8(%rdi)
-        movq %r10,7*8(%rdi)
-        leaq 64(%rsi),%rsi
-        leaq 64(%rdi),%rdi
-        jnz  .Lloop_64
-.Lhandle_tail:
-        movl %edx,%ecx
-        andl $63,%ecx
-        shrl $3,%ecx
-        jz   .Lhandle_7
-        .p2align 4
-.Lloop_8: 
-        decl %ecx
-        movq (%rsi),%r8
-        movq %r8,(%rdi) 
-        leaq 8(%rdi),%rdi
-        leaq 8(%rsi),%rsi
-        jnz  .Lloop_8
-.Lhandle_7:
-        movl %edx,%ecx
-        andl $7,%ecx
-        jz .Lende
-        .p2align 4
-.Lloop_1:
-        movb (%rsi),%r8b
-        movb %r8b,(%rdi) 
-        incq %rdi
-        incq %rsi
-        decl %ecx
-        jnz .Lloop_1
-        
-.Lende:         
-        popq %rbx
-        ret
-.Lfinal:
-        
-        /* C stepping K8 run faster using the string copy instructions.
-           It is also a lot simpler. Use this when possible */
-        
-        .section .altinstructions,"a"
-        .align 8
-        .quad  memcpy
-        .quad  memcpy_c
-        .byte  X86_FEATURE_K8_C
-        .byte  .Lfinal-memcpy
-        .byte  memcpy_c_end-memcpy_c    
-        .previous
-        .section .altinstr_replacement,"ax"
- /* rdi destination
-  * rsi source
-  * rdx count
-  */                    
-memcpy_c:
        movq %rdi,%rax
        movl %edx,%ecx
        shrl $3,%ecx
@@ -117,5 +30,3 @@ memcpy_c:
        rep
        movsb
        ret
-memcpy_c_end:
-        .previous
author	Andi Kleen <ak@suse.de>	2005-11-05 11:25:54 -0500
committer	Linus Torvalds <torvalds@g5.osdl.org>	2005-11-14 22:55:17 -0500
commit	a5b250a428aabc619ace872f8220a7d0b8f7d557 (patch)
tree	11cabf07982ae37f94bc929f9a605cbbd20e35ab /arch/x86_64/lib/memcpy.S
parent	a6f5deb2be4c82f24fefadcbf7e448f540c05ae6 (diff)

diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
index c6c46494fef5..92dd80544602 100644
--- a/arch/x86_64/lib/memcpy.S
+++ b/arch/x86_64/lib/memcpy.S
@@ -11,6 +11,8 @@
11	*	11	*
12	* Output:	12	* Output:
13	* rax original destination	13	* rax original destination
		14	*
		15	* TODO: check best memcpy for PSC
14	*/	16	*/
15		17
16	.globl __memcpy	18	.globl __memcpy
@@ -18,95 +20,6 @@
18	.p2align 4	20	.p2align 4
19	__memcpy:	21	__memcpy:
20	memcpy:	22	memcpy:
21	pushq %rbx
22	movq %rdi,%rax
23
24	movl %edx,%ecx
25	shrl $6,%ecx
26	jz .Lhandle_tail
27
28	.p2align 4
29	.Lloop_64:
30	decl %ecx
31
32	movq (%rsi),%r11
33	movq 8(%rsi),%r8
34
35	movq %r11,(%rdi)
36	movq %r8,1*8(%rdi)
37
38	movq 2*8(%rsi),%r9
39	movq 3*8(%rsi),%r10
40
41	movq %r9,2*8(%rdi)
42	movq %r10,3*8(%rdi)
43
44	movq 4*8(%rsi),%r11
45	movq 5*8(%rsi),%r8
46
47	movq %r11,4*8(%rdi)
48	movq %r8,5*8(%rdi)
49
50	movq 6*8(%rsi),%r9
51	movq 7*8(%rsi),%r10
52
53	movq %r9,6*8(%rdi)
54	movq %r10,7*8(%rdi)
55
56	leaq 64(%rsi),%rsi
57	leaq 64(%rdi),%rdi
58	jnz .Lloop_64
59
60	.Lhandle_tail:
61	movl %edx,%ecx
62	andl $63,%ecx
63	shrl $3,%ecx
64	jz .Lhandle_7
65	.p2align 4
66	.Lloop_8:
67	decl %ecx
68	movq (%rsi),%r8
69	movq %r8,(%rdi)
70	leaq 8(%rdi),%rdi
71	leaq 8(%rsi),%rsi
72	jnz .Lloop_8
73
74	.Lhandle_7:
75	movl %edx,%ecx
76	andl $7,%ecx
77	jz .Lende
78	.p2align 4
79	.Lloop_1:
80	movb (%rsi),%r8b
81	movb %r8b,(%rdi)
82	incq %rdi
83	incq %rsi
84	decl %ecx
85	jnz .Lloop_1
86
87	.Lende:
88	popq %rbx
89	ret
90	.Lfinal:
91
92	/* C stepping K8 run faster using the string copy instructions.
93	It is also a lot simpler. Use this when possible */
94
95	.section .altinstructions,"a"
96	.align 8
97	.quad memcpy
98	.quad memcpy_c
99	.byte X86_FEATURE_K8_C
100	.byte .Lfinal-memcpy
101	.byte memcpy_c_end-memcpy_c
102	.previous
103
104	.section .altinstr_replacement,"ax"
105	/* rdi destination
106	* rsi source
107	* rdx count
108	*/
109	memcpy_c:
110	movq %rdi,%rax	23	movq %rdi,%rax
111	movl %edx,%ecx	24	movl %edx,%ecx
112	shrl $3,%ecx	25	shrl $3,%ecx
@@ -117,5 +30,3 @@ memcpy_c:
117	rep	30	rep
118	movsb	31	movsb
119	ret	32	ret
120	memcpy_c_end:
121	.previous