[PATCH] x86_64: Remove optimization for B stepping AMD K8

B-stepping parts were the first shipping Opterons. memcpy/memset/copy_page/clear_page had special optimized versions for them. These CPUs are really old and in the minority now, and the difference from the generic versions (using rep microcode) is not that big anyway. So just remove them.

TODO: figure out optimized versions for Intel Netburst based EM64T

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: Andi Kleen <ak@suse.de> 2005-11-05 11:25:54 -0500
committer: Linus Torvalds <torvalds@g5.osdl.org> 2005-11-14 22:55:17 -0500
commit: a5b250a428aabc619ace872f8220a7d0b8f7d557 (patch)
tree: 11cabf07982ae37f94bc929f9a605cbbd20e35ab /arch/x86_64/lib/memset.S
parent: a6f5deb2be4c82f24fefadcbf7e448f540c05ae6 (diff)
1 files changed, 0 insertions, 94 deletions
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
index 4b4c40638640..2aa48f24ed1e 100644
--- a/arch/x86_64/lib/memset.S
+++ b/arch/x86_64/lib/memset.S
@@ -13,98 +13,6 @@
        .p2align 4
 memset: 
 __memset:
-        movq %rdi,%r10
-        movq %rdx,%r11
-        /* expand byte value  */
-        movzbl %sil,%ecx
-        movabs $0x0101010101010101,%rax
-        mul    %rcx             /* with rax, clobbers rdx */
-        /* align dst */
-        movl  %edi,%r9d         
-        andl  $7,%r9d   
-        jnz  .Lbad_alignment
-.Lafter_bad_alignment:
-        
-        movl %r11d,%ecx
-        shrl $6,%ecx
-        jz       .Lhandle_tail
-        .p2align 4
-.Lloop_64:      
-        decl   %ecx
-        movq  %rax,(%rdi) 
-        movq  %rax,8(%rdi) 
-        movq  %rax,16(%rdi) 
-        movq  %rax,24(%rdi) 
-        movq  %rax,32(%rdi) 
-        movq  %rax,40(%rdi) 
-        movq  %rax,48(%rdi) 
-        movq  %rax,56(%rdi) 
-        leaq  64(%rdi),%rdi
-        jnz    .Lloop_64
-        /* Handle tail in loops. The loops should be faster than hard
-           to predict jump tables. */ 
-        .p2align 4         
-.Lhandle_tail:
-        movl    %r11d,%ecx
-        andl    $63&(~7),%ecx
-        jz              .Lhandle_7
-        shrl    $3,%ecx
-        .p2align 4
-.Lloop_8:
-        decl   %ecx
-        movq  %rax,(%rdi)
-        leaq  8(%rdi),%rdi
-        jnz    .Lloop_8
-.Lhandle_7:
-        movl    %r11d,%ecx
-        andl    $7,%ecx
-        jz      .Lende
-        .p2align 4
-.Lloop_1:
-        decl    %ecx
-        movb    %al,(%rdi)
-        leaq    1(%rdi),%rdi
-        jnz     .Lloop_1
-        
-.Lende: 
-        movq    %r10,%rax
-        ret
-.Lbad_alignment:
-        cmpq $7,%r11
-        jbe     .Lhandle_7
-        movq %rax,(%rdi)        /* unaligned store */
-        movq $8,%r8                     
-        subq %r9,%r8 
-        addq %r8,%rdi
-        subq %r8,%r11
-        jmp .Lafter_bad_alignment
-        /* C stepping K8 run faster using the string instructions.
-           It is also a lot simpler. Use this when possible */
-#include <asm/cpufeature.h>     
-                
-        .section .altinstructions,"a"
-        .align 8
-        .quad  memset
-        .quad  memset_c
-        .byte  X86_FEATURE_K8_C
-        .byte  memset_c_end-memset_c
-        .byte  memset_c_end-memset_c
-        .previous
-        .section .altinstr_replacement,"ax"
- /* rdi destination
-  * rsi value
-  * rdx count
-  */                    
-memset_c:       
        movq %rdi,%r9
        movl %edx,%r8d
        andl $7,%r8d            
@@ -121,5 +29,3 @@ memset_c:
        stosb
        movq %r9,%rax
        ret
-memset_c_end:
-        .previous
author	Andi Kleen <ak@suse.de>	2005-11-05 11:25:54 -0500
committer	Linus Torvalds <torvalds@g5.osdl.org>	2005-11-14 22:55:17 -0500
commit	a5b250a428aabc619ace872f8220a7d0b8f7d557 (patch)
tree	11cabf07982ae37f94bc929f9a605cbbd20e35ab /arch/x86_64/lib/memset.S
parent	a6f5deb2be4c82f24fefadcbf7e448f540c05ae6 (diff)

diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
index 4b4c40638640..2aa48f24ed1e 100644
--- a/arch/x86_64/lib/memset.S
+++ b/arch/x86_64/lib/memset.S
@@ -13,98 +13,6 @@
13	.p2align 4	13	.p2align 4
14	memset:	14	memset:
15	__memset:	15	__memset:
16	movq %rdi,%r10
17	movq %rdx,%r11
18
19	/* expand byte value */
20	movzbl %sil,%ecx
21	movabs $0x0101010101010101,%rax
22	mul %rcx /* with rax, clobbers rdx */
23
24	/* align dst */
25	movl %edi,%r9d
26	andl $7,%r9d
27	jnz .Lbad_alignment
28	.Lafter_bad_alignment:
29
30	movl %r11d,%ecx
31	shrl $6,%ecx
32	jz .Lhandle_tail
33
34	.p2align 4
35	.Lloop_64:
36	decl %ecx
37	movq %rax,(%rdi)
38	movq %rax,8(%rdi)
39	movq %rax,16(%rdi)
40	movq %rax,24(%rdi)
41	movq %rax,32(%rdi)
42	movq %rax,40(%rdi)
43	movq %rax,48(%rdi)
44	movq %rax,56(%rdi)
45	leaq 64(%rdi),%rdi
46	jnz .Lloop_64
47
48	/* Handle tail in loops. The loops should be faster than hard
49	to predict jump tables. */
50	.p2align 4
51	.Lhandle_tail:
52	movl %r11d,%ecx
53	andl $63&(~7),%ecx
54	jz .Lhandle_7
55	shrl $3,%ecx
56	.p2align 4
57	.Lloop_8:
58	decl %ecx
59	movq %rax,(%rdi)
60	leaq 8(%rdi),%rdi
61	jnz .Lloop_8
62
63	.Lhandle_7:
64	movl %r11d,%ecx
65	andl $7,%ecx
66	jz .Lende
67	.p2align 4
68	.Lloop_1:
69	decl %ecx
70	movb %al,(%rdi)
71	leaq 1(%rdi),%rdi
72	jnz .Lloop_1
73
74	.Lende:
75	movq %r10,%rax
76	ret
77
78	.Lbad_alignment:
79	cmpq $7,%r11
80	jbe .Lhandle_7
81	movq %rax,(%rdi) /* unaligned store */
82	movq $8,%r8
83	subq %r9,%r8
84	addq %r8,%rdi
85	subq %r8,%r11
86	jmp .Lafter_bad_alignment
87
88	/* C stepping K8 run faster using the string instructions.
89	It is also a lot simpler. Use this when possible */
90
91	#include <asm/cpufeature.h>
92
93	.section .altinstructions,"a"
94	.align 8
95	.quad memset
96	.quad memset_c
97	.byte X86_FEATURE_K8_C
98	.byte memset_c_end-memset_c
99	.byte memset_c_end-memset_c
100	.previous
101
102	.section .altinstr_replacement,"ax"
103	/* rdi destination
104	* rsi value
105	* rdx count
106	*/
107	memset_c:
108	movq %rdi,%r9	16	movq %rdi,%r9
109	movl %edx,%r8d	17	movl %edx,%r8d
110	andl $7,%r8d	18	andl $7,%r8d
@@ -121,5 +29,3 @@ memset_c:
121	stosb	29	stosb
122	movq %r9,%rax	30	movq %r9,%rax
123	ret	31	ret
124	memset_c_end:
125	.previous