[PATCH] x86_64: Remove optimization for B stepping AMD K8

The B stepping parts were the first shipping Opterons. memcpy/memset/copy_page/clear_page had special optimized versions for them. These CPUs are really old and in the minority now, and the difference from the generic versions (using rep microcode) is not that big anyway. So just remove them.

TODO: figure out optimized versions for Intel Netburst based EM64T

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: Andi Kleen <ak@suse.de> 2005-11-05 11:25:54 -0500
committer: Linus Torvalds <torvalds@g5.osdl.org> 2005-11-14 22:55:17 -0500
commit: a5b250a428aabc619ace872f8220a7d0b8f7d557 (patch)
tree: 11cabf07982ae37f94bc929f9a605cbbd20e35ab /arch
parent: a6f5deb2be4c82f24fefadcbf7e448f540c05ae6 (diff)
4 files changed, 2 insertions, 310 deletions
diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S
index 30a9da458c15..43d9fa136180 100644
--- a/arch/x86_64/lib/clear_page.S
+++ b/arch/x86_64/lib/clear_page.S
@@ -5,46 +5,8 @@
        .globl clear_page
        .p2align 4
 clear_page:
-        xorl   %eax,%eax
-        movl   $4096/64,%ecx
-        .p2align 4
-.Lloop:
-        decl    %ecx
-#define PUT(x) movq %rax,x*8(%rdi) 
-        movq %rax,(%rdi)
-        PUT(1)
-        PUT(2)
-        PUT(3)
-        PUT(4)
-        PUT(5)
-        PUT(6)
-        PUT(7)
-        leaq    64(%rdi),%rdi
-        jnz     .Lloop
-        nop
-        ret
-clear_page_end: 
-        
-        /* C stepping K8 run faster using the string instructions.
-           It is also a lot simpler. Use this when possible */
-        
-#include <asm/cpufeature.h>
-                
-        .section .altinstructions,"a"
-        .align 8
-        .quad  clear_page
-        .quad  clear_page_c
-        .byte  X86_FEATURE_K8_C
-        .byte  clear_page_end-clear_page        
-        .byte  clear_page_c_end-clear_page_c
-        .previous
-        .section .altinstr_replacement,"ax"
-clear_page_c:
        movl $4096/8,%ecx
        xorl %eax,%eax
        rep 
        stosq
        ret
-clear_page_c_end:
-        .previous
diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S
index dd3aa47b6bf5..621a19769406 100644
--- a/arch/x86_64/lib/copy_page.S
+++ b/arch/x86_64/lib/copy_page.S
@@ -8,94 +8,7 @@
        .globl copy_page
        .p2align 4
 copy_page:
-        subq    $3*8,%rsp
-        movq    %rbx,(%rsp)
-        movq    %r12,1*8(%rsp)
-        movq    %r13,2*8(%rsp)
-                        
-        movl    $(4096/64)-5,%ecx
-        .p2align 4
-.Loop64:        
-        dec     %rcx
-        movq        (%rsi), %rax
-        movq      8 (%rsi), %rbx
-        movq     16 (%rsi), %rdx
-        movq     24 (%rsi), %r8
-        movq     32 (%rsi), %r9
-        movq     40 (%rsi), %r10
-        movq     48 (%rsi), %r11
-        movq     56 (%rsi), %r12
-        prefetcht0 5*64(%rsi)
-        movq     %rax,    (%rdi)
-        movq     %rbx,  8 (%rdi)
-        movq     %rdx, 16 (%rdi)
-        movq     %r8,  24 (%rdi)
-        movq     %r9,  32 (%rdi)
-        movq     %r10, 40 (%rdi)
-        movq     %r11, 48 (%rdi)
-        movq     %r12, 56 (%rdi)
-        leaq    64 (%rsi), %rsi
-        leaq    64 (%rdi), %rdi
-        jnz     .Loop64
-        movl    $5,%ecx
-        .p2align 4
-.Loop2: 
-        decl   %ecx
-        movq        (%rsi), %rax
-        movq      8 (%rsi), %rbx
-        movq     16 (%rsi), %rdx
-        movq     24 (%rsi), %r8
-        movq     32 (%rsi), %r9
-        movq     40 (%rsi), %r10
-        movq     48 (%rsi), %r11
-        movq     56 (%rsi), %r12
-        movq     %rax,    (%rdi)
-        movq     %rbx,  8 (%rdi)
-        movq     %rdx, 16 (%rdi)
-        movq     %r8,  24 (%rdi)
-        movq     %r9,  32 (%rdi)
-        movq     %r10, 40 (%rdi)
-        movq     %r11, 48 (%rdi)
-        movq     %r12, 56 (%rdi)
-        
-        leaq    64(%rdi),%rdi                   
-        leaq    64(%rsi),%rsi                   
-        
-        jnz     .Loop2          
-        
-        movq    (%rsp),%rbx
-        movq    1*8(%rsp),%r12
-        movq    2*8(%rsp),%r13
-        addq    $3*8,%rsp
-        ret
-        
-        /* C stepping K8 run faster using the string copy instructions.
-           It is also a lot simpler. Use this when possible */
-#include <asm/cpufeature.h>             
-                
-        .section .altinstructions,"a"
-        .align 8
-        .quad  copy_page
-        .quad  copy_page_c
-        .byte  X86_FEATURE_K8_C
-        .byte  copy_page_c_end-copy_page_c
-        .byte  copy_page_c_end-copy_page_c
-        .previous
-        .section .altinstr_replacement,"ax"
-copy_page_c:
        movl $4096/8,%ecx
        rep 
        movsq 
        ret
-copy_page_c_end:
-        .previous
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
index c6c46494fef5..92dd80544602 100644
--- a/arch/x86_64/lib/memcpy.S
+++ b/arch/x86_64/lib/memcpy.S
@@ -11,6 +11,8 @@
 * 
 * Output:
 * rax original destination
+ * 
+ * TODO: check best memcpy for PSC
 */     
        .globl __memcpy
@@ -18,95 +20,6 @@
        .p2align 4
 __memcpy:
 memcpy:         
-        pushq %rbx
-        movq %rdi,%rax
-        movl %edx,%ecx
-        shrl $6,%ecx
-        jz .Lhandle_tail
-        
-        .p2align 4
-.Lloop_64:
-        decl %ecx
-        
-        movq (%rsi),%r11
-        movq 8(%rsi),%r8
-        movq %r11,(%rdi)
-        movq %r8,1*8(%rdi)
-        movq 2*8(%rsi),%r9
-        movq 3*8(%rsi),%r10
-        movq %r9,2*8(%rdi)
-        movq %r10,3*8(%rdi)
-                
-        movq 4*8(%rsi),%r11
-        movq 5*8(%rsi),%r8
-        movq %r11,4*8(%rdi)
-        movq %r8,5*8(%rdi)
-        movq 6*8(%rsi),%r9
-        movq 7*8(%rsi),%r10
-        movq %r9,6*8(%rdi)
-        movq %r10,7*8(%rdi)
-        leaq 64(%rsi),%rsi
-        leaq 64(%rdi),%rdi
-        jnz  .Lloop_64
-.Lhandle_tail:
-        movl %edx,%ecx
-        andl $63,%ecx
-        shrl $3,%ecx
-        jz   .Lhandle_7
-        .p2align 4
-.Lloop_8: 
-        decl %ecx
-        movq (%rsi),%r8
-        movq %r8,(%rdi) 
-        leaq 8(%rdi),%rdi
-        leaq 8(%rsi),%rsi
-        jnz  .Lloop_8
-.Lhandle_7:
-        movl %edx,%ecx
-        andl $7,%ecx
-        jz .Lende
-        .p2align 4
-.Lloop_1:
-        movb (%rsi),%r8b
-        movb %r8b,(%rdi) 
-        incq %rdi
-        incq %rsi
-        decl %ecx
-        jnz .Lloop_1
-        
-.Lende:         
-        popq %rbx
-        ret
-.Lfinal:
-        
-        /* C stepping K8 run faster using the string copy instructions.
-           It is also a lot simpler. Use this when possible */
-        
-        .section .altinstructions,"a"
-        .align 8
-        .quad  memcpy
-        .quad  memcpy_c
-        .byte  X86_FEATURE_K8_C
-        .byte  .Lfinal-memcpy
-        .byte  memcpy_c_end-memcpy_c    
-        .previous
-        .section .altinstr_replacement,"ax"
- /* rdi destination
-  * rsi source
-  * rdx count
-  */                    
-memcpy_c:
        movq %rdi,%rax
        movl %edx,%ecx
        shrl $3,%ecx
@@ -117,5 +30,3 @@ memcpy_c:
        rep
        movsb
        ret
-memcpy_c_end:
-        .previous
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
index 4b4c40638640..2aa48f24ed1e 100644
--- a/arch/x86_64/lib/memset.S
+++ b/arch/x86_64/lib/memset.S
@@ -13,98 +13,6 @@
        .p2align 4
 memset: 
 __memset:
-        movq %rdi,%r10
-        movq %rdx,%r11
-        /* expand byte value  */
-        movzbl %sil,%ecx
-        movabs $0x0101010101010101,%rax
-        mul    %rcx             /* with rax, clobbers rdx */
-        /* align dst */
-        movl  %edi,%r9d         
-        andl  $7,%r9d   
-        jnz  .Lbad_alignment
-.Lafter_bad_alignment:
-        
-        movl %r11d,%ecx
-        shrl $6,%ecx
-        jz       .Lhandle_tail
-        .p2align 4
-.Lloop_64:      
-        decl   %ecx
-        movq  %rax,(%rdi) 
-        movq  %rax,8(%rdi) 
-        movq  %rax,16(%rdi) 
-        movq  %rax,24(%rdi) 
-        movq  %rax,32(%rdi) 
-        movq  %rax,40(%rdi) 
-        movq  %rax,48(%rdi) 
-        movq  %rax,56(%rdi) 
-        leaq  64(%rdi),%rdi
-        jnz    .Lloop_64
-        /* Handle tail in loops. The loops should be faster than hard
-           to predict jump tables. */ 
-        .p2align 4         
-.Lhandle_tail:
-        movl    %r11d,%ecx
-        andl    $63&(~7),%ecx
-        jz              .Lhandle_7
-        shrl    $3,%ecx
-        .p2align 4
-.Lloop_8:
-        decl   %ecx
-        movq  %rax,(%rdi)
-        leaq  8(%rdi),%rdi
-        jnz    .Lloop_8
-.Lhandle_7:
-        movl    %r11d,%ecx
-        andl    $7,%ecx
-        jz      .Lende
-        .p2align 4
-.Lloop_1:
-        decl    %ecx
-        movb    %al,(%rdi)
-        leaq    1(%rdi),%rdi
-        jnz     .Lloop_1
-        
-.Lende: 
-        movq    %r10,%rax
-        ret
-.Lbad_alignment:
-        cmpq $7,%r11
-        jbe     .Lhandle_7
-        movq %rax,(%rdi)        /* unaligned store */
-        movq $8,%r8                     
-        subq %r9,%r8 
-        addq %r8,%rdi
-        subq %r8,%r11
-        jmp .Lafter_bad_alignment
-        /* C stepping K8 run faster using the string instructions.
-           It is also a lot simpler. Use this when possible */
-#include <asm/cpufeature.h>     
-                
-        .section .altinstructions,"a"
-        .align 8
-        .quad  memset
-        .quad  memset_c
-        .byte  X86_FEATURE_K8_C
-        .byte  memset_c_end-memset_c
-        .byte  memset_c_end-memset_c
-        .previous
-        .section .altinstr_replacement,"ax"
- /* rdi destination
-  * rsi value
-  * rdx count
-  */                    
-memset_c:       
        movq %rdi,%r9
        movl %edx,%r8d
        andl $7,%r8d            
@@ -121,5 +29,3 @@ memset_c:
        stosb
        movq %r9,%rax
        ret
-memset_c_end:
-        .previous
author	Andi Kleen <ak@suse.de>	2005-11-05 11:25:54 -0500
committer	Linus Torvalds <torvalds@g5.osdl.org>	2005-11-14 22:55:17 -0500
commit	a5b250a428aabc619ace872f8220a7d0b8f7d557 (patch)
tree	11cabf07982ae37f94bc929f9a605cbbd20e35ab /arch
parent	a6f5deb2be4c82f24fefadcbf7e448f540c05ae6 (diff)

diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S index 30a9da458c15..43d9fa136180 100644 --- a/arch/x86_64/lib/clear_page.S +++ b/arch/x86_64/lib/clear_page.S
@@ -5,46 +5,8 @@
5	.globl clear_page	5	.globl clear_page
6	.p2align 4	6	.p2align 4
7	clear_page:	7	clear_page:
8	xorl %eax,%eax
9	movl $4096/64,%ecx
10	.p2align 4
11	.Lloop:
12	decl %ecx
13	#define PUT(x) movq %rax,x*8(%rdi)
14	movq %rax,(%rdi)
15	PUT(1)
16	PUT(2)
17	PUT(3)
18	PUT(4)
19	PUT(5)
20	PUT(6)
21	PUT(7)
22	leaq 64(%rdi),%rdi
23	jnz .Lloop
24	nop
25	ret
26	clear_page_end:
27
28	/* C stepping K8 run faster using the string instructions.
29	It is also a lot simpler. Use this when possible */
30
31	#include <asm/cpufeature.h>
32
33	.section .altinstructions,"a"
34	.align 8
35	.quad clear_page
36	.quad clear_page_c
37	.byte X86_FEATURE_K8_C
38	.byte clear_page_end-clear_page
39	.byte clear_page_c_end-clear_page_c
40	.previous
41
42	.section .altinstr_replacement,"ax"
43	clear_page_c:
44	movl $4096/8,%ecx	8	movl $4096/8,%ecx
45	xorl %eax,%eax	9	xorl %eax,%eax
46	rep	10	rep
47	stosq	11	stosq
48	ret	12	ret
49	clear_page_c_end:
50	.previous


diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S index dd3aa47b6bf5..621a19769406 100644 --- a/arch/x86_64/lib/copy_page.S +++ b/arch/x86_64/lib/copy_page.S
@@ -8,94 +8,7 @@
8	.globl copy_page	8	.globl copy_page
9	.p2align 4	9	.p2align 4
10	copy_page:	10	copy_page:
11	subq $3*8,%rsp
12	movq %rbx,(%rsp)
13	movq %r12,1*8(%rsp)
14	movq %r13,2*8(%rsp)
15
16	movl $(4096/64)-5,%ecx
17	.p2align 4
18	.Loop64:
19	dec %rcx
20
21	movq (%rsi), %rax
22	movq 8 (%rsi), %rbx
23	movq 16 (%rsi), %rdx
24	movq 24 (%rsi), %r8
25	movq 32 (%rsi), %r9
26	movq 40 (%rsi), %r10
27	movq 48 (%rsi), %r11
28	movq 56 (%rsi), %r12
29
30	prefetcht0 5*64(%rsi)
31
32	movq %rax, (%rdi)
33	movq %rbx, 8 (%rdi)
34	movq %rdx, 16 (%rdi)
35	movq %r8, 24 (%rdi)
36	movq %r9, 32 (%rdi)
37	movq %r10, 40 (%rdi)
38	movq %r11, 48 (%rdi)
39	movq %r12, 56 (%rdi)
40
41	leaq 64 (%rsi), %rsi
42	leaq 64 (%rdi), %rdi
43
44	jnz .Loop64
45
46	movl $5,%ecx
47	.p2align 4
48	.Loop2:
49	decl %ecx
50
51	movq (%rsi), %rax
52	movq 8 (%rsi), %rbx
53	movq 16 (%rsi), %rdx
54	movq 24 (%rsi), %r8
55	movq 32 (%rsi), %r9
56	movq 40 (%rsi), %r10
57	movq 48 (%rsi), %r11
58	movq 56 (%rsi), %r12
59
60	movq %rax, (%rdi)
61	movq %rbx, 8 (%rdi)
62	movq %rdx, 16 (%rdi)
63	movq %r8, 24 (%rdi)
64	movq %r9, 32 (%rdi)
65	movq %r10, 40 (%rdi)
66	movq %r11, 48 (%rdi)
67	movq %r12, 56 (%rdi)
68
69	leaq 64(%rdi),%rdi
70	leaq 64(%rsi),%rsi
71
72	jnz .Loop2
73
74	movq (%rsp),%rbx
75	movq 1*8(%rsp),%r12
76	movq 2*8(%rsp),%r13
77	addq $3*8,%rsp
78	ret
79
80	/* C stepping K8 run faster using the string copy instructions.
81	It is also a lot simpler. Use this when possible */
82
83	#include <asm/cpufeature.h>
84
85	.section .altinstructions,"a"
86	.align 8
87	.quad copy_page
88	.quad copy_page_c
89	.byte X86_FEATURE_K8_C
90	.byte copy_page_c_end-copy_page_c
91	.byte copy_page_c_end-copy_page_c
92	.previous
93
94	.section .altinstr_replacement,"ax"
95	copy_page_c:
96	movl $4096/8,%ecx	11	movl $4096/8,%ecx
97	rep	12	rep
98	movsq	13	movsq
99	ret	14	ret
100	copy_page_c_end:
101	.previous


diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S index c6c46494fef5..92dd80544602 100644 --- a/arch/x86_64/lib/memcpy.S +++ b/arch/x86_64/lib/memcpy.S
@@ -11,6 +11,8 @@
11	*	11	*
12	* Output:	12	* Output:
13	* rax original destination	13	* rax original destination
		14	*
		15	* TODO: check best memcpy for PSC
14	*/	16	*/
15		17
16	.globl __memcpy	18	.globl __memcpy
@@ -18,95 +20,6 @@
18	.p2align 4	20	.p2align 4
19	__memcpy:	21	__memcpy:
20	memcpy:	22	memcpy:
21	pushq %rbx
22	movq %rdi,%rax
23
24	movl %edx,%ecx
25	shrl $6,%ecx
26	jz .Lhandle_tail
27
28	.p2align 4
29	.Lloop_64:
30	decl %ecx
31
32	movq (%rsi),%r11
33	movq 8(%rsi),%r8
34
35	movq %r11,(%rdi)
36	movq %r8,1*8(%rdi)
37
38	movq 2*8(%rsi),%r9
39	movq 3*8(%rsi),%r10
40
41	movq %r9,2*8(%rdi)
42	movq %r10,3*8(%rdi)
43
44	movq 4*8(%rsi),%r11
45	movq 5*8(%rsi),%r8
46
47	movq %r11,4*8(%rdi)
48	movq %r8,5*8(%rdi)
49
50	movq 6*8(%rsi),%r9
51	movq 7*8(%rsi),%r10
52
53	movq %r9,6*8(%rdi)
54	movq %r10,7*8(%rdi)
55
56	leaq 64(%rsi),%rsi
57	leaq 64(%rdi),%rdi
58	jnz .Lloop_64
59
60	.Lhandle_tail:
61	movl %edx,%ecx
62	andl $63,%ecx
63	shrl $3,%ecx
64	jz .Lhandle_7
65	.p2align 4
66	.Lloop_8:
67	decl %ecx
68	movq (%rsi),%r8
69	movq %r8,(%rdi)
70	leaq 8(%rdi),%rdi
71	leaq 8(%rsi),%rsi
72	jnz .Lloop_8
73
74	.Lhandle_7:
75	movl %edx,%ecx
76	andl $7,%ecx
77	jz .Lende
78	.p2align 4
79	.Lloop_1:
80	movb (%rsi),%r8b
81	movb %r8b,(%rdi)
82	incq %rdi
83	incq %rsi
84	decl %ecx
85	jnz .Lloop_1
86
87	.Lende:
88	popq %rbx
89	ret
90	.Lfinal:
91
92	/* C stepping K8 run faster using the string copy instructions.
93	It is also a lot simpler. Use this when possible */
94
95	.section .altinstructions,"a"
96	.align 8
97	.quad memcpy
98	.quad memcpy_c
99	.byte X86_FEATURE_K8_C
100	.byte .Lfinal-memcpy
101	.byte memcpy_c_end-memcpy_c
102	.previous
103
104	.section .altinstr_replacement,"ax"
105	/* rdi destination
106	* rsi source
107	* rdx count
108	*/
109	memcpy_c:
110	movq %rdi,%rax	23	movq %rdi,%rax
111	movl %edx,%ecx	24	movl %edx,%ecx
112	shrl $3,%ecx	25	shrl $3,%ecx
@@ -117,5 +30,3 @@ memcpy_c:
117	rep	30	rep
118	movsb	31	movsb
119	ret	32	ret
120	memcpy_c_end:
121	.previous


diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S index 4b4c40638640..2aa48f24ed1e 100644 --- a/arch/x86_64/lib/memset.S +++ b/arch/x86_64/lib/memset.S
@@ -13,98 +13,6 @@
13	.p2align 4	13	.p2align 4
14	memset:	14	memset:
15	__memset:	15	__memset:
16	movq %rdi,%r10
17	movq %rdx,%r11
18
19	/* expand byte value */
20	movzbl %sil,%ecx
21	movabs $0x0101010101010101,%rax
22	mul %rcx /* with rax, clobbers rdx */
23
24	/* align dst */
25	movl %edi,%r9d
26	andl $7,%r9d
27	jnz .Lbad_alignment
28	.Lafter_bad_alignment:
29
30	movl %r11d,%ecx
31	shrl $6,%ecx
32	jz .Lhandle_tail
33
34	.p2align 4
35	.Lloop_64:
36	decl %ecx
37	movq %rax,(%rdi)
38	movq %rax,8(%rdi)
39	movq %rax,16(%rdi)
40	movq %rax,24(%rdi)
41	movq %rax,32(%rdi)
42	movq %rax,40(%rdi)
43	movq %rax,48(%rdi)
44	movq %rax,56(%rdi)
45	leaq 64(%rdi),%rdi
46	jnz .Lloop_64
47
48	/* Handle tail in loops. The loops should be faster than hard
49	to predict jump tables. */
50	.p2align 4
51	.Lhandle_tail:
52	movl %r11d,%ecx
53	andl $63&(~7),%ecx
54	jz .Lhandle_7
55	shrl $3,%ecx
56	.p2align 4
57	.Lloop_8:
58	decl %ecx
59	movq %rax,(%rdi)
60	leaq 8(%rdi),%rdi
61	jnz .Lloop_8
62
63	.Lhandle_7:
64	movl %r11d,%ecx
65	andl $7,%ecx
66	jz .Lende
67	.p2align 4
68	.Lloop_1:
69	decl %ecx
70	movb %al,(%rdi)
71	leaq 1(%rdi),%rdi
72	jnz .Lloop_1
73
74	.Lende:
75	movq %r10,%rax
76	ret
77
78	.Lbad_alignment:
79	cmpq $7,%r11
80	jbe .Lhandle_7
81	movq %rax,(%rdi) /* unaligned store */
82	movq $8,%r8
83	subq %r9,%r8
84	addq %r8,%rdi
85	subq %r8,%r11
86	jmp .Lafter_bad_alignment
87
88	/* C stepping K8 run faster using the string instructions.
89	It is also a lot simpler. Use this when possible */
90
91	#include <asm/cpufeature.h>
92
93	.section .altinstructions,"a"
94	.align 8
95	.quad memset
96	.quad memset_c
97	.byte X86_FEATURE_K8_C
98	.byte memset_c_end-memset_c
99	.byte memset_c_end-memset_c
100	.previous
101
102	.section .altinstr_replacement,"ax"
103	/* rdi destination
104	* rsi value
105	* rdx count
106	*/
107	memset_c:
108	movq %rdi,%r9	16	movq %rdi,%r9
109	movl %edx,%r8d	17	movl %edx,%r8d
110	andl $7,%r8d	18	andl $7,%r8d
@@ -121,5 +29,3 @@ memset_c:
121	stosb	29	stosb
122	movq %r9,%rax	30	movq %r9,%rax
123	ret	31	ret
124	memset_c_end:
125	.previous